xref: /openbmc/qemu/target/i386/kvm/kvm.c (revision 4e647fa0)
1 /*
2  * QEMU KVM support
3  *
4  * Copyright (C) 2006-2008 Qumranet Technologies
5  * Copyright IBM, Corp. 2008
6  *
7  * Authors:
8  *  Anthony Liguori   <aliguori@us.ibm.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
11  * See the COPYING file in the top-level directory.
12  *
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qapi/qapi-events-run-state.h"
17 #include "qapi/error.h"
18 #include "qapi/visitor.h"
19 #include <sys/ioctl.h>
20 #include <sys/utsname.h>
21 #include <sys/syscall.h>
22 
23 #include <linux/kvm.h>
24 #include <linux/kvm_para.h>
25 #include "standard-headers/asm-x86/kvm_para.h"
26 #include "hw/xen/interface/arch-x86/cpuid.h"
27 
28 #include "cpu.h"
29 #include "host-cpu.h"
30 #include "sysemu/sysemu.h"
31 #include "sysemu/hw_accel.h"
32 #include "sysemu/kvm_int.h"
33 #include "sysemu/runstate.h"
34 #include "kvm_i386.h"
35 #include "../confidential-guest.h"
36 #include "sev.h"
37 #include "xen-emu.h"
38 #include "hyperv.h"
39 #include "hyperv-proto.h"
40 
41 #include "gdbstub/enums.h"
42 #include "qemu/host-utils.h"
43 #include "qemu/main-loop.h"
44 #include "qemu/ratelimit.h"
45 #include "qemu/config-file.h"
46 #include "qemu/error-report.h"
47 #include "qemu/memalign.h"
48 #include "hw/i386/x86.h"
49 #include "hw/i386/kvm/xen_evtchn.h"
50 #include "hw/i386/pc.h"
51 #include "hw/i386/apic.h"
52 #include "hw/i386/apic_internal.h"
53 #include "hw/i386/apic-msidef.h"
54 #include "hw/i386/intel_iommu.h"
55 #include "hw/i386/topology.h"
56 #include "hw/i386/x86-iommu.h"
57 #include "hw/i386/e820_memory_layout.h"
58 
59 #include "hw/xen/xen.h"
60 
61 #include "hw/pci/pci.h"
62 #include "hw/pci/msi.h"
63 #include "hw/pci/msix.h"
64 #include "migration/blocker.h"
65 #include "exec/memattrs.h"
66 #include "trace.h"
67 
68 #include CONFIG_DEVICES
69 
/* Uncomment the following line to send DPRINTF() output to stderr. */
//#define DEBUG_KVM

/*
 * DPRINTF(fmt, ...): printf-style debug trace.
 *
 * With DEBUG_KVM defined it expands to fprintf(stderr, fmt, ...); otherwise
 * it expands to an empty statement, so its arguments are neither evaluated
 * nor type-checked.
 */
#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
79 
80 /* From arch/x86/kvm/lapic.h */
81 #define KVM_APIC_BUS_CYCLE_NS       1
82 #define KVM_APIC_BUS_FREQUENCY      (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)
83 
84 #define MSR_KVM_WALL_CLOCK  0x11
85 #define MSR_KVM_SYSTEM_TIME 0x12
86 
87 /* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
88  * 255 kvm_msr_entry structs */
89 #define MSR_BUF_SIZE 4096
90 
91 static void kvm_init_msrs(X86CPU *cpu);
92 
/*
 * Table of KVM capabilities this architecture lists as required.  The list
 * is terminated by KVM_CAP_LAST_INFO.
 * NOTE(review): presumably consumed by the generic KVM init code, which
 * refuses to start when one of them is missing -- confirm against the user
 * of this table.
 */
const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
    KVM_CAP_INFO(SET_TSS_ADDR),
    KVM_CAP_INFO(EXT_CPUID),
    KVM_CAP_INFO(MP_STATE),
    KVM_CAP_INFO(SIGNAL_MSI),
    KVM_CAP_INFO(IRQ_ROUTING),
    KVM_CAP_INFO(DEBUGREGS),
    KVM_CAP_INFO(XSAVE),
    KVM_CAP_INFO(VCPU_EVENTS),
    KVM_CAP_INFO(X86_ROBUST_SINGLESTEP),
    KVM_CAP_INFO(MCE),
    KVM_CAP_INFO(ADJUST_CLOCK),
    KVM_CAP_INFO(SET_IDENTITY_MAP_ADDR),
    KVM_CAP_LAST_INFO
};
108 
/*
 * File-scope feature flags.
 * NOTE(review): the has_msr_* flags look like per-MSR availability flags
 * that are filled in during KVM initialisation (outside this excerpt) --
 * confirm where they are set.
 */
static bool has_msr_star;
static bool has_msr_hsave_pa;
static bool has_msr_tsc_aux;
static bool has_msr_tsc_adjust;
static bool has_msr_tsc_deadline;
static bool has_msr_feature_control;
static bool has_msr_misc_enable;
static bool has_msr_smbase;
static bool has_msr_bndcfgs;
static int lm_capable_kernel;
static bool has_msr_hv_hypercall;
static bool has_msr_hv_crash;
static bool has_msr_hv_reset;
static bool has_msr_hv_vpindex;
/* Value reported by kvm_hv_vpindex_settable(). */
static bool hv_vpindex_settable;
static bool has_msr_hv_runtime;
static bool has_msr_hv_synic;
static bool has_msr_hv_stimer;
static bool has_msr_hv_frequencies;
static bool has_msr_hv_reenlightenment;
static bool has_msr_hv_syndbg_options;
static bool has_msr_xss;
static bool has_msr_umwait;
static bool has_msr_spec_ctrl;
static bool has_tsc_scale_msr;
static bool has_msr_tsx_ctrl;
static bool has_msr_virt_ssbd;
static bool has_msr_smi_count;
/* When false, kvm_arch_get_supported_cpuid() masks out ARCH_CAPABILITIES. */
static bool has_msr_arch_capabs;
static bool has_msr_core_capabs;
static bool has_msr_vmx_vmfunc;
static bool has_msr_ucode_rev;
/* When false, kvm_arch_get_supported_msr_feature() patches PROCBASED_CTLS2. */
static bool has_msr_vmx_procbased_ctls2;
static bool has_msr_perf_capabs;
static bool has_msr_pkrs;

static uint32_t has_architectural_pmu_version;
static uint32_t num_architectural_pmu_gp_counters;
static uint32_t num_architectural_pmu_fixed_counters;

static int has_xsave2;
static int has_xcrs;
static int has_sregs2;
/*
 * Selects how kvm_queue_exception() records an exception: as pending with a
 * separate payload (non-zero) or as already injected (zero).
 */
static int has_exception_payload;
static int has_triple_fault_event;

static bool has_msr_mcg_ext_ctl;

/* Result of KVM_GET_SUPPORTED_CPUID, filled lazily by get_supported_cpuid(). */
static struct kvm_cpuid2 *cpuid_cache;
static struct kvm_cpuid2 *hv_cpuid_cache;
/* NULL when the host does not support feature MSRs. */
static struct kvm_msr_list *kvm_feature_msrs;

static KVMMSRHandlers msr_handlers[KVM_MSR_FILTER_MAX_RANGES];

#define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */
static RateLimit bus_lock_ratelimit_ctrl;
/* Forward declaration: used by kvm_get_tsc() before its definition. */
static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value);
166 
/*
 * Human-readable names for the KVM_X86_*_VM types, indexed by type value.
 * Indices without an initialiser are NULL, so callers must check the entry
 * before printing it.
 */
static const char *vm_type_name[] = {
    [KVM_X86_DEFAULT_VM] = "default",
    [KVM_X86_SEV_VM] = "SEV",
    [KVM_X86_SEV_ES_VM] = "SEV-ES",
    [KVM_X86_SNP_VM] = "SEV-SNP",
};
173 
174 bool kvm_is_vm_type_supported(int type)
175 {
176     uint32_t machine_types;
177 
178     /*
179      * old KVM doesn't support KVM_CAP_VM_TYPES but KVM_X86_DEFAULT_VM
180      * is always supported
181      */
182     if (type == KVM_X86_DEFAULT_VM) {
183         return true;
184     }
185 
186     machine_types = kvm_check_extension(KVM_STATE(current_machine->accelerator),
187                                         KVM_CAP_VM_TYPES);
188     return !!(machine_types & BIT(type));
189 }
190 
191 int kvm_get_vm_type(MachineState *ms)
192 {
193     int kvm_type = KVM_X86_DEFAULT_VM;
194 
195     if (ms->cgs) {
196         if (!object_dynamic_cast(OBJECT(ms->cgs), TYPE_X86_CONFIDENTIAL_GUEST)) {
197             error_report("configuration type %s not supported for x86 guests",
198                          object_get_typename(OBJECT(ms->cgs)));
199             exit(1);
200         }
201         kvm_type = x86_confidential_guest_kvm_type(
202             X86_CONFIDENTIAL_GUEST(ms->cgs));
203     }
204 
205     if (!kvm_is_vm_type_supported(kvm_type)) {
206         error_report("vm-type %s not supported by KVM", vm_type_name[kvm_type]);
207         exit(1);
208     }
209 
210     return kvm_type;
211 }
212 
213 bool kvm_enable_hypercall(uint64_t enable_mask)
214 {
215     KVMState *s = KVM_STATE(current_accel());
216 
217     return !kvm_vm_enable_cap(s, KVM_CAP_EXIT_HYPERCALL, 0, enable_mask);
218 }
219 
220 bool kvm_has_smm(void)
221 {
222     return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM);
223 }
224 
225 bool kvm_has_adjust_clock_stable(void)
226 {
227     int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
228 
229     return (ret & KVM_CLOCK_TSC_STABLE);
230 }
231 
232 bool kvm_has_exception_payload(void)
233 {
234     return has_exception_payload;
235 }
236 
237 static bool kvm_x2apic_api_set_flags(uint64_t flags)
238 {
239     KVMState *s = KVM_STATE(current_accel());
240 
241     return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
242 }
243 
/*
 * MEMORIZE(fn, _result): evaluate the expression @fn at most once per
 * expansion site and cache its value in the lvalue @_result.
 *
 * The first evaluation stores @fn into @_result and yields that value.
 * Every later evaluation executes a `return _result;` statement, i.e. it
 * returns from the *enclosing function*, so the macro may only be used in a
 * function whose return type matches @_result.  It relies on the GNU
 * statement-expression extension.
 */
#define MEMORIZE(fn, _result) \
    ({ \
        static bool _memorized; \
        \
        if (_memorized) { \
            return _result; \
        } \
        _memorized = true; \
        _result = fn; \
    })
254 
/* Cached outcome of kvm_enable_x2apic(); false until that has succeeded. */
static bool has_x2apic_api;

/*
 * Returns the cached x2APIC API state without trying to enable it.
 */
bool kvm_has_x2apic_api(void)
{
    return has_x2apic_api;
}
261 
262 bool kvm_enable_x2apic(void)
263 {
264     return MEMORIZE(
265              kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
266                                       KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
267              has_x2apic_api);
268 }
269 
/* Accessor for the file-scope hv_vpindex_settable flag. */
bool kvm_hv_vpindex_settable(void)
{
    return hv_vpindex_settable;
}
274 
275 static int kvm_get_tsc(CPUState *cs)
276 {
277     X86CPU *cpu = X86_CPU(cs);
278     CPUX86State *env = &cpu->env;
279     uint64_t value;
280     int ret;
281 
282     if (env->tsc_valid) {
283         return 0;
284     }
285 
286     env->tsc_valid = !runstate_is_running();
287 
288     ret = kvm_get_one_msr(cpu, MSR_IA32_TSC, &value);
289     if (ret < 0) {
290         return ret;
291     }
292 
293     env->tsc = value;
294     return 0;
295 }
296 
297 static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
298 {
299     kvm_get_tsc(cpu);
300 }
301 
302 void kvm_synchronize_all_tsc(void)
303 {
304     CPUState *cpu;
305 
306     if (kvm_enabled()) {
307         CPU_FOREACH(cpu) {
308             run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
309         }
310     }
311 }
312 
313 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
314 {
315     struct kvm_cpuid2 *cpuid;
316     int r, size;
317 
318     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
319     cpuid = g_malloc0(size);
320     cpuid->nent = max;
321     r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
322     if (r == 0 && cpuid->nent >= max) {
323         r = -E2BIG;
324     }
325     if (r < 0) {
326         if (r == -E2BIG) {
327             g_free(cpuid);
328             return NULL;
329         } else {
330             fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
331                     strerror(-r));
332             exit(1);
333         }
334     }
335     return cpuid;
336 }
337 
338 /* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
339  * for all entries.
340  */
341 static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
342 {
343     struct kvm_cpuid2 *cpuid;
344     int max = 1;
345 
346     if (cpuid_cache != NULL) {
347         return cpuid_cache;
348     }
349     while ((cpuid = try_get_cpuid(s, max)) == NULL) {
350         max *= 2;
351     }
352     cpuid_cache = cpuid;
353     return cpuid;
354 }
355 
356 static bool host_tsx_broken(void)
357 {
358     int family, model, stepping;\
359     char vendor[CPUID_VENDOR_SZ + 1];
360 
361     host_cpu_vendor_fms(vendor, &family, &model, &stepping);
362 
363     /* Check if we are running on a Haswell host known to have broken TSX */
364     return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
365            (family == 6) &&
366            ((model == 63 && stepping < 4) ||
367             model == 60 || model == 69 || model == 70);
368 }
369 
370 /* Returns the value for a specific register on the cpuid entry
371  */
372 static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
373 {
374     uint32_t ret = 0;
375     switch (reg) {
376     case R_EAX:
377         ret = entry->eax;
378         break;
379     case R_EBX:
380         ret = entry->ebx;
381         break;
382     case R_ECX:
383         ret = entry->ecx;
384         break;
385     case R_EDX:
386         ret = entry->edx;
387         break;
388     }
389     return ret;
390 }
391 
392 /* Find matching entry for function/index on kvm_cpuid2 struct
393  */
394 static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
395                                                  uint32_t function,
396                                                  uint32_t index)
397 {
398     int i;
399     for (i = 0; i < cpuid->nent; ++i) {
400         if (cpuid->entries[i].function == function &&
401             cpuid->entries[i].index == index) {
402             return &cpuid->entries[i];
403         }
404     }
405     /* not found: */
406     return NULL;
407 }
408 
409 uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
410                                       uint32_t index, int reg)
411 {
412     struct kvm_cpuid2 *cpuid;
413     uint32_t ret = 0;
414     uint32_t cpuid_1_edx, unused;
415     uint64_t bitmask;
416 
417     cpuid = get_supported_cpuid(s);
418 
419     struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
420     if (entry) {
421         ret = cpuid_entry_get_reg(entry, reg);
422     }
423 
424     /* Fixups for the data returned by KVM, below */
425 
426     if (function == 1 && reg == R_EDX) {
427         /* KVM before 2.6.30 misreports the following features */
428         ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
429         /* KVM never reports CPUID_HT but QEMU can support when vcpus > 1 */
430         ret |= CPUID_HT;
431     } else if (function == 1 && reg == R_ECX) {
432         /* We can set the hypervisor flag, even if KVM does not return it on
433          * GET_SUPPORTED_CPUID
434          */
435         ret |= CPUID_EXT_HYPERVISOR;
436         /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
437          * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
438          * and the irqchip is in the kernel.
439          */
440         if (kvm_irqchip_in_kernel() &&
441                 kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
442             ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
443         }
444 
445         /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
446          * without the in-kernel irqchip
447          */
448         if (!kvm_irqchip_in_kernel()) {
449             ret &= ~CPUID_EXT_X2APIC;
450         }
451 
452         if (enable_cpu_pm) {
453             int disable_exits = kvm_check_extension(s,
454                                                     KVM_CAP_X86_DISABLE_EXITS);
455 
456             if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
457                 ret |= CPUID_EXT_MONITOR;
458             }
459         }
460     } else if (function == 6 && reg == R_EAX) {
461         ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
462     } else if (function == 7 && index == 0 && reg == R_EBX) {
463         /* Not new instructions, just an optimization.  */
464         uint32_t ebx;
465         host_cpuid(7, 0, &unused, &ebx, &unused, &unused);
466         ret |= ebx & CPUID_7_0_EBX_ERMS;
467 
468         if (host_tsx_broken()) {
469             ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
470         }
471     } else if (function == 7 && index == 0 && reg == R_EDX) {
472         /* Not new instructions, just an optimization.  */
473         uint32_t edx;
474         host_cpuid(7, 0, &unused, &unused, &unused, &edx);
475         ret |= edx & CPUID_7_0_EDX_FSRM;
476 
477         /*
478          * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
479          * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
480          * returned by KVM_GET_MSR_INDEX_LIST.
481          */
482         if (!has_msr_arch_capabs) {
483             ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
484         }
485     } else if (function == 7 && index == 1 && reg == R_EAX) {
486         /* Not new instructions, just an optimization.  */
487         uint32_t eax;
488         host_cpuid(7, 1, &eax, &unused, &unused, &unused);
489         ret |= eax & (CPUID_7_1_EAX_FZRM | CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_FSRC);
490     } else if (function == 7 && index == 2 && reg == R_EDX) {
491         uint32_t edx;
492         host_cpuid(7, 2, &unused, &unused, &unused, &edx);
493         ret |= edx & CPUID_7_2_EDX_MCDT_NO;
494     } else if (function == 0xd && index == 0 &&
495                (reg == R_EAX || reg == R_EDX)) {
496         /*
497          * The value returned by KVM_GET_SUPPORTED_CPUID does not include
498          * features that still have to be enabled with the arch_prctl
499          * system call.  QEMU needs the full value, which is retrieved
500          * with KVM_GET_DEVICE_ATTR.
501          */
502         struct kvm_device_attr attr = {
503             .group = 0,
504             .attr = KVM_X86_XCOMP_GUEST_SUPP,
505             .addr = (unsigned long) &bitmask
506         };
507 
508         bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES);
509         if (!sys_attr) {
510             return ret;
511         }
512 
513         int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr);
514         if (rc < 0) {
515             if (rc != -ENXIO) {
516                 warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) "
517                             "error: %d", rc);
518             }
519             return ret;
520         }
521         ret = (reg == R_EAX) ? bitmask : bitmask >> 32;
522     } else if (function == 0x80000001 && reg == R_ECX) {
523         /*
524          * It's safe to enable TOPOEXT even if it's not returned by
525          * GET_SUPPORTED_CPUID.  Unconditionally enabling TOPOEXT here allows
526          * us to keep CPU models including TOPOEXT runnable on older kernels.
527          */
528         ret |= CPUID_EXT3_TOPOEXT;
529     } else if (function == 0x80000001 && reg == R_EDX) {
530         /* On Intel, kvm returns cpuid according to the Intel spec,
531          * so add missing bits according to the AMD spec:
532          */
533         cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
534         ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
535     } else if (function == 0x80000007 && reg == R_EBX) {
536         ret |= CPUID_8000_0007_EBX_OVERFLOW_RECOV | CPUID_8000_0007_EBX_SUCCOR;
537     } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
538         /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
539          * be enabled without the in-kernel irqchip
540          */
541         if (!kvm_irqchip_in_kernel()) {
542             ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
543         }
544         if (kvm_irqchip_is_split()) {
545             ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID;
546         }
547     } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
548         ret |= 1U << KVM_HINTS_REALTIME;
549     }
550 
551     return ret;
552 }
553 
554 uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
555 {
556     struct {
557         struct kvm_msrs info;
558         struct kvm_msr_entry entries[1];
559     } msr_data = {};
560     uint64_t value;
561     uint32_t ret, can_be_one, must_be_one;
562 
563     if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */
564         return 0;
565     }
566 
567     /* Check if requested MSR is supported feature MSR */
568     int i;
569     for (i = 0; i < kvm_feature_msrs->nmsrs; i++)
570         if (kvm_feature_msrs->indices[i] == index) {
571             break;
572         }
573     if (i == kvm_feature_msrs->nmsrs) {
574         return 0; /* if the feature MSR is not supported, simply return 0 */
575     }
576 
577     msr_data.info.nmsrs = 1;
578     msr_data.entries[0].index = index;
579 
580     ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data);
581     if (ret != 1) {
582         error_report("KVM get MSR (index=0x%x) feature failed, %s",
583             index, strerror(-ret));
584         exit(1);
585     }
586 
587     value = msr_data.entries[0].data;
588     switch (index) {
589     case MSR_IA32_VMX_PROCBASED_CTLS2:
590         if (!has_msr_vmx_procbased_ctls2) {
591             /* KVM forgot to add these bits for some time, do this ourselves. */
592             if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) &
593                 CPUID_XSAVE_XSAVES) {
594                 value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32;
595             }
596             if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) &
597                 CPUID_EXT_RDRAND) {
598                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32;
599             }
600             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
601                 CPUID_7_0_EBX_INVPCID) {
602                 value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32;
603             }
604             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
605                 CPUID_7_0_EBX_RDSEED) {
606                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32;
607             }
608             if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) &
609                 CPUID_EXT2_RDTSCP) {
610                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32;
611             }
612         }
613         /* fall through */
614     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
615     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
616     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
617     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
618         /*
619          * Return true for bits that can be one, but do not have to be one.
620          * The SDM tells us which bits could have a "must be one" setting,
621          * so we can do the opposite transformation in make_vmx_msr_value.
622          */
623         must_be_one = (uint32_t)value;
624         can_be_one = (uint32_t)(value >> 32);
625         return can_be_one & ~must_be_one;
626 
627     default:
628         return value;
629     }
630 }
631 
632 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
633                                      int *max_banks)
634 {
635     *max_banks = kvm_check_extension(s, KVM_CAP_MCE);
636     return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
637 }
638 
639 static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
640 {
641     CPUState *cs = CPU(cpu);
642     CPUX86State *env = &cpu->env;
643     uint64_t status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_MISCV |
644                       MCI_STATUS_ADDRV;
645     uint64_t mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
646     int flags = 0;
647 
648     if (!IS_AMD_CPU(env)) {
649         status |= MCI_STATUS_S | MCI_STATUS_UC;
650         if (code == BUS_MCEERR_AR) {
651             status |= MCI_STATUS_AR | 0x134;
652             mcg_status |= MCG_STATUS_EIPV;
653         } else {
654             status |= 0xc0;
655         }
656     } else {
657         if (code == BUS_MCEERR_AR) {
658             status |= MCI_STATUS_UC | MCI_STATUS_POISON;
659             mcg_status |= MCG_STATUS_EIPV;
660         } else {
661             /* Setting the POISON bit for deferred errors indicates to the
662              * guest kernel that the address provided by the MCE is valid
663              * and usable which will ensure that the guest kernel will send
664              * a SIGBUS_AO signal to the guest process. This allows for
665              * more desirable behavior in the case that the guest process
666              * with poisoned memory has set the MCE_KILL_EARLY prctl flag
667              * which indicates that the process would prefer to handle or
668              * shutdown due to the poisoned memory condition before the
669              * memory has been accessed.
670              *
671              * While the POISON bit would not be set in a deferred error
672              * sent from hardware, the bit is not meaningful for deferred
673              * errors and can be reused in this scenario.
674              */
675             status |= MCI_STATUS_DEFERRED | MCI_STATUS_POISON;
676         }
677     }
678 
679     flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
680     /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the
681      * guest kernel back into env->mcg_ext_ctl.
682      */
683     cpu_synchronize_state(cs);
684     if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
685         mcg_status |= MCG_STATUS_LMCE;
686         flags = 0;
687     }
688 
689     cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
690                        (MCM_ADDR_PHYS << 6) | 0xc, flags);
691 }
692 
693 static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar)
694 {
695     MemoryFailureFlags mff = {.action_required = ar, .recursive = false};
696 
697     qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action,
698                                    &mff);
699 }
700 
/*
 * Handle a memory error QEMU cannot recover from: emit a fatal,
 * action-required memory-failure event, report @host_addr, and exit with
 * status 1.  Does not return.
 */
static void hardware_memory_error(void *host_addr)
{
    emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true);
    error_report("QEMU got Hardware memory error at addr %p", host_addr);
    exit(1);
}
707 
708 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
709 {
710     X86CPU *cpu = X86_CPU(c);
711     CPUX86State *env = &cpu->env;
712     ram_addr_t ram_addr;
713     hwaddr paddr;
714 
715     /* If we get an action required MCE, it has been injected by KVM
716      * while the VM was running.  An action optional MCE instead should
717      * be coming from the main thread, which qemu_init_sigbus identifies
718      * as the "early kill" thread.
719      */
720     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
721 
722     if ((env->mcg_cap & MCG_SER_P) && addr) {
723         ram_addr = qemu_ram_addr_from_host(addr);
724         if (ram_addr != RAM_ADDR_INVALID &&
725             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
726             kvm_hwpoison_page_add(ram_addr);
727             kvm_mce_inject(cpu, paddr, code);
728 
729             /*
730              * Use different logging severity based on error type.
731              * If there is additional MCE reporting on the hypervisor, QEMU VA
732              * could be another source to identify the PA and MCE details.
733              */
734             if (code == BUS_MCEERR_AR) {
735                 error_report("Guest MCE Memory Error at QEMU addr %p and "
736                     "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
737                     addr, paddr, "BUS_MCEERR_AR");
738             } else {
739                  warn_report("Guest MCE Memory Error at QEMU addr %p and "
740                      "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
741                      addr, paddr, "BUS_MCEERR_AO");
742             }
743 
744             return;
745         }
746 
747         if (code == BUS_MCEERR_AO) {
748             warn_report("Hardware memory error at addr %p of type %s "
749                 "for memory used by QEMU itself instead of guest system!",
750                  addr, "BUS_MCEERR_AO");
751         }
752     }
753 
754     if (code == BUS_MCEERR_AR) {
755         hardware_memory_error(addr);
756     }
757 
758     /* Hope we are lucky for AO MCE, just notify a event */
759     emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
760 }
761 
762 static void kvm_queue_exception(CPUX86State *env,
763                                 int32_t exception_nr,
764                                 uint8_t exception_has_payload,
765                                 uint64_t exception_payload)
766 {
767     assert(env->exception_nr == -1);
768     assert(!env->exception_pending);
769     assert(!env->exception_injected);
770     assert(!env->exception_has_payload);
771 
772     env->exception_nr = exception_nr;
773 
774     if (has_exception_payload) {
775         env->exception_pending = 1;
776 
777         env->exception_has_payload = exception_has_payload;
778         env->exception_payload = exception_payload;
779     } else {
780         env->exception_injected = 1;
781 
782         if (exception_nr == EXCP01_DB) {
783             assert(exception_has_payload);
784             env->dr[6] = exception_payload;
785         } else if (exception_nr == EXCP0E_PAGE) {
786             assert(exception_has_payload);
787             env->cr[2] = exception_payload;
788         } else {
789             assert(!exception_has_payload);
790         }
791     }
792 }
793 
794 static void cpu_update_state(void *opaque, bool running, RunState state)
795 {
796     CPUX86State *env = opaque;
797 
798     if (running) {
799         env->tsc_valid = false;
800     }
801 }
802 
803 unsigned long kvm_arch_vcpu_id(CPUState *cs)
804 {
805     X86CPU *cpu = X86_CPU(cs);
806     return cpu->apic_id;
807 }
808 
809 #ifndef KVM_CPUID_SIGNATURE_NEXT
810 #define KVM_CPUID_SIGNATURE_NEXT                0x40000100
811 #endif
812 
813 static bool hyperv_enabled(X86CPU *cpu)
814 {
815     return kvm_check_extension(kvm_state, KVM_CAP_HYPERV) > 0 &&
816         ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) ||
817          cpu->hyperv_features || cpu->hyperv_passthrough);
818 }
819 
/*
 * Check whether target_freq is within conservative
 * ntp correctable bounds (250ppm) of freq
 *
 * @freq:        reference frequency
 * @target_freq: frequency to test
 *
 * Returns true when target_freq lies in [freq - margin, freq + margin],
 * where margin is freq * 250 / 1000000 rounded towards zero.
 */
static inline bool freq_within_bounds(int freq, int target_freq)
{
    /*
     * Do the arithmetic in 64 bits: freq * 250 overflows a 32-bit int for
     * frequencies above roughly 8.5 million.
     */
    int64_t margin = (int64_t)freq * 250 / 1000000;
    int64_t max_freq = (int64_t)freq + margin;
    int64_t min_freq = (int64_t)freq - margin;

    return target_freq >= min_freq && target_freq <= max_freq;
}
835 
836 static int kvm_arch_set_tsc_khz(CPUState *cs)
837 {
838     X86CPU *cpu = X86_CPU(cs);
839     CPUX86State *env = &cpu->env;
840     int r, cur_freq;
841     bool set_ioctl = false;
842 
843     if (!env->tsc_khz) {
844         return 0;
845     }
846 
847     cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
848                kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
849 
850     /*
851      * If TSC scaling is supported, attempt to set TSC frequency.
852      */
853     if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
854         set_ioctl = true;
855     }
856 
857     /*
858      * If desired TSC frequency is within bounds of NTP correction,
859      * attempt to set TSC frequency.
860      */
861     if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) {
862         set_ioctl = true;
863     }
864 
865     r = set_ioctl ?
866         kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
867         -ENOTSUP;
868 
869     if (r < 0) {
870         /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
871          * TSC frequency doesn't match the one we want.
872          */
873         cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
874                    kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
875                    -ENOTSUP;
876         if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
877             warn_report("TSC frequency mismatch between "
878                         "VM (%" PRId64 " kHz) and host (%d kHz), "
879                         "and TSC scaling unavailable",
880                         env->tsc_khz, cur_freq);
881             return r;
882         }
883     }
884 
885     return 0;
886 }
887 
888 static bool tsc_is_stable_and_known(CPUX86State *env)
889 {
890     if (!env->tsc_khz) {
891         return false;
892     }
893     return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
894         || env->user_tsc_khz;
895 }
896 
897 #define DEFAULT_EVMCS_VERSION ((1 << 8) | 1)
898 
/*
 * Description of the Hyper-V enlightenments, indexed by HYPERV_FEAT_*:
 *  - desc: human readable name used in error messages;
 *  - flags: up to two (CPUID leaf, register, bit mask) tuples, a tuple with
 *    func == 0 is unused. hyperv_feature_supported() requires all listed
 *    bits to be reported by the host and hv_build_cpuid_leaf() sets them in
 *    the guest CPUID when the feature is enabled;
 *  - dependencies: mask of BIT(HYPERV_FEAT_*) features which must be enabled
 *    as well, verified by hv_feature_check_deps().
 */
static struct {
    const char *desc;
    struct {
        uint32_t func;
        int reg;
        uint32_t bits;
    } flags[2];
    uint64_t dependencies;
} kvm_hyperv_properties[] = {
    [HYPERV_FEAT_RELAXED] = {
        .desc = "relaxed timing (hv-relaxed)",
        .flags = {
            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
             .bits = HV_RELAXED_TIMING_RECOMMENDED}
        }
    },
    [HYPERV_FEAT_VAPIC] = {
        .desc = "virtual APIC (hv-vapic)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_APIC_ACCESS_AVAILABLE}
        }
    },
    [HYPERV_FEAT_TIME] = {
        .desc = "clocksources (hv-time)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_TIME_REF_COUNT_AVAILABLE | HV_REFERENCE_TSC_AVAILABLE}
        }
    },
    [HYPERV_FEAT_CRASH] = {
        .desc = "crash MSRs (hv-crash)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
        }
    },
    [HYPERV_FEAT_RESET] = {
        .desc = "reset MSR (hv-reset)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_RESET_AVAILABLE}
        }
    },
    [HYPERV_FEAT_VPINDEX] = {
        .desc = "VP_INDEX MSR (hv-vpindex)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_VP_INDEX_AVAILABLE}
        }
    },
    [HYPERV_FEAT_RUNTIME] = {
        .desc = "VP_RUNTIME MSR (hv-runtime)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_VP_RUNTIME_AVAILABLE}
        }
    },
    [HYPERV_FEAT_SYNIC] = {
        .desc = "synthetic interrupt controller (hv-synic)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_SYNIC_AVAILABLE}
        }
    },
    [HYPERV_FEAT_STIMER] = {
        .desc = "synthetic timers (hv-stimer)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_SYNTIMERS_AVAILABLE}
        },
        .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
    },
    [HYPERV_FEAT_FREQUENCIES] = {
        .desc = "frequency MSRs (hv-frequencies)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_ACCESS_FREQUENCY_MSRS},
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_FREQUENCY_MSRS_AVAILABLE}
        }
    },
    [HYPERV_FEAT_REENLIGHTENMENT] = {
        .desc = "reenlightenment MSRs (hv-reenlightenment)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
        }
    },
    [HYPERV_FEAT_TLBFLUSH] = {
        .desc = "paravirtualized TLB flush (hv-tlbflush)",
        .flags = {
            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
             .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
             HV_EX_PROCESSOR_MASKS_RECOMMENDED}
        },
        .dependencies = BIT(HYPERV_FEAT_VPINDEX)
    },
    [HYPERV_FEAT_EVMCS] = {
        .desc = "enlightened VMCS (hv-evmcs)",
        .flags = {
            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
             .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
        },
        .dependencies = BIT(HYPERV_FEAT_VAPIC)
    },
    [HYPERV_FEAT_IPI] = {
        .desc = "paravirtualized IPI (hv-ipi)",
        .flags = {
            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
             .bits = HV_CLUSTER_IPI_RECOMMENDED |
             HV_EX_PROCESSOR_MASKS_RECOMMENDED}
        },
        .dependencies = BIT(HYPERV_FEAT_VPINDEX)
    },
    [HYPERV_FEAT_STIMER_DIRECT] = {
        .desc = "direct mode synthetic timers (hv-stimer-direct)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
        },
        .dependencies = BIT(HYPERV_FEAT_STIMER)
    },
    [HYPERV_FEAT_AVIC] = {
        .desc = "AVIC/APICv support (hv-avic/hv-apicv)",
        .flags = {
            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
             .bits = HV_DEPRECATING_AEOI_RECOMMENDED}
        }
    },
    /* Without CONFIG_SYNDBG this slot stays zero-initialized */
#ifdef CONFIG_SYNDBG
    [HYPERV_FEAT_SYNDBG] = {
        .desc = "Enable synthetic kernel debugger channel (hv-syndbg)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_FEATURE_DEBUG_MSRS_AVAILABLE}
        },
        .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED)
    },
#endif
    [HYPERV_FEAT_MSR_BITMAP] = {
        .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)",
        .flags = {
            {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
             .bits = HV_NESTED_MSR_BITMAP}
        }
    },
    [HYPERV_FEAT_XMM_INPUT] = {
        .desc = "XMM fast hypercall input (hv-xmm-input)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_HYPERCALL_XMM_INPUT_AVAILABLE}
        }
    },
    [HYPERV_FEAT_TLBFLUSH_EXT] = {
        .desc = "Extended gva ranges for TLB flush hypercalls (hv-tlbflush-ext)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_EXT_GVA_RANGES_FLUSH_AVAILABLE}
        },
        .dependencies = BIT(HYPERV_FEAT_TLBFLUSH)
    },
    [HYPERV_FEAT_TLBFLUSH_DIRECT] = {
        .desc = "direct TLB flush (hv-tlbflush-direct)",
        .flags = {
            {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
             .bits = HV_NESTED_DIRECT_FLUSH}
        },
        .dependencies = BIT(HYPERV_FEAT_VAPIC)
    },
};
1070 
1071 static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max,
1072                                            bool do_sys_ioctl)
1073 {
1074     struct kvm_cpuid2 *cpuid;
1075     int r, size;
1076 
1077     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
1078     cpuid = g_malloc0(size);
1079     cpuid->nent = max;
1080 
1081     if (do_sys_ioctl) {
1082         r = kvm_ioctl(kvm_state, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1083     } else {
1084         r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1085     }
1086     if (r == 0 && cpuid->nent >= max) {
1087         r = -E2BIG;
1088     }
1089     if (r < 0) {
1090         if (r == -E2BIG) {
1091             g_free(cpuid);
1092             return NULL;
1093         } else {
1094             fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n",
1095                     strerror(-r));
1096             exit(1);
1097         }
1098     }
1099     return cpuid;
1100 }
1101 
1102 /*
1103  * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough
1104  * for all entries.
1105  */
1106 static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs)
1107 {
1108     struct kvm_cpuid2 *cpuid;
1109     /* 0x40000000..0x40000005, 0x4000000A, 0x40000080..0x40000082 leaves */
1110     int max = 11;
1111     int i;
1112     bool do_sys_ioctl;
1113 
1114     do_sys_ioctl =
1115         kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID) > 0;
1116 
1117     /*
1118      * Non-empty KVM context is needed when KVM_CAP_SYS_HYPERV_CPUID is
1119      * unsupported, kvm_hyperv_expand_features() checks for that.
1120      */
1121     assert(do_sys_ioctl || cs->kvm_state);
1122 
1123     /*
1124      * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with
1125      * -E2BIG, however, it doesn't report back the right size. Keep increasing
1126      * it and re-trying until we succeed.
1127      */
1128     while ((cpuid = try_get_hv_cpuid(cs, max, do_sys_ioctl)) == NULL) {
1129         max++;
1130     }
1131 
1132     /*
1133      * KVM_GET_SUPPORTED_HV_CPUID does not set EVMCS CPUID bit before
1134      * KVM_CAP_HYPERV_ENLIGHTENED_VMCS is enabled but we want to get the
1135      * information early, just check for the capability and set the bit
1136      * manually.
1137      */
1138     if (!do_sys_ioctl && kvm_check_extension(cs->kvm_state,
1139                             KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1140         for (i = 0; i < cpuid->nent; i++) {
1141             if (cpuid->entries[i].function == HV_CPUID_ENLIGHTMENT_INFO) {
1142                 cpuid->entries[i].eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1143             }
1144         }
1145     }
1146 
1147     return cpuid;
1148 }
1149 
1150 /*
1151  * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature
1152  * leaves from KVM_CAP_HYPERV* and present MSRs data.
1153  */
1154 static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs)
1155 {
1156     X86CPU *cpu = X86_CPU(cs);
1157     struct kvm_cpuid2 *cpuid;
1158     struct kvm_cpuid_entry2 *entry_feat, *entry_recomm;
1159 
1160     /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */
1161     cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries));
1162     cpuid->nent = 2;
1163 
1164     /* HV_CPUID_VENDOR_AND_MAX_FUNCTIONS */
1165     entry_feat = &cpuid->entries[0];
1166     entry_feat->function = HV_CPUID_FEATURES;
1167 
1168     entry_recomm = &cpuid->entries[1];
1169     entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO;
1170     entry_recomm->ebx = cpu->hyperv_spinlock_attempts;
1171 
1172     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) {
1173         entry_feat->eax |= HV_HYPERCALL_AVAILABLE;
1174         entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE;
1175         entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1176         entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED;
1177         entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED;
1178     }
1179 
1180     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
1181         entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE;
1182         entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE;
1183     }
1184 
1185     if (has_msr_hv_frequencies) {
1186         entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
1187         entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE;
1188     }
1189 
1190     if (has_msr_hv_crash) {
1191         entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE;
1192     }
1193 
1194     if (has_msr_hv_reenlightenment) {
1195         entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
1196     }
1197 
1198     if (has_msr_hv_reset) {
1199         entry_feat->eax |= HV_RESET_AVAILABLE;
1200     }
1201 
1202     if (has_msr_hv_vpindex) {
1203         entry_feat->eax |= HV_VP_INDEX_AVAILABLE;
1204     }
1205 
1206     if (has_msr_hv_runtime) {
1207         entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE;
1208     }
1209 
1210     if (has_msr_hv_synic) {
1211         unsigned int cap = cpu->hyperv_synic_kvm_only ?
1212             KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1213 
1214         if (kvm_check_extension(cs->kvm_state, cap) > 0) {
1215             entry_feat->eax |= HV_SYNIC_AVAILABLE;
1216         }
1217     }
1218 
1219     if (has_msr_hv_stimer) {
1220         entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
1221     }
1222 
1223     if (has_msr_hv_syndbg_options) {
1224         entry_feat->edx |= HV_GUEST_DEBUGGING_AVAILABLE;
1225         entry_feat->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
1226         entry_feat->ebx |= HV_PARTITION_DEBUGGING_ALLOWED;
1227     }
1228 
1229     if (kvm_check_extension(cs->kvm_state,
1230                             KVM_CAP_HYPERV_TLBFLUSH) > 0) {
1231         entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
1232         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1233     }
1234 
1235     if (kvm_check_extension(cs->kvm_state,
1236                             KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1237         entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1238     }
1239 
1240     if (kvm_check_extension(cs->kvm_state,
1241                             KVM_CAP_HYPERV_SEND_IPI) > 0) {
1242         entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
1243         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1244     }
1245 
1246     return cpuid;
1247 }
1248 
1249 static uint32_t hv_cpuid_get_host(CPUState *cs, uint32_t func, int reg)
1250 {
1251     struct kvm_cpuid_entry2 *entry;
1252     struct kvm_cpuid2 *cpuid;
1253 
1254     if (hv_cpuid_cache) {
1255         cpuid = hv_cpuid_cache;
1256     } else {
1257         if (kvm_check_extension(kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
1258             cpuid = get_supported_hv_cpuid(cs);
1259         } else {
1260             /*
1261              * 'cs->kvm_state' may be NULL when Hyper-V features are expanded
1262              * before KVM context is created but this is only done when
1263              * KVM_CAP_SYS_HYPERV_CPUID is supported and it implies
1264              * KVM_CAP_HYPERV_CPUID.
1265              */
1266             assert(cs->kvm_state);
1267 
1268             cpuid = get_supported_hv_cpuid_legacy(cs);
1269         }
1270         hv_cpuid_cache = cpuid;
1271     }
1272 
1273     if (!cpuid) {
1274         return 0;
1275     }
1276 
1277     entry = cpuid_find_entry(cpuid, func, 0);
1278     if (!entry) {
1279         return 0;
1280     }
1281 
1282     return cpuid_entry_get_reg(entry, reg);
1283 }
1284 
1285 static bool hyperv_feature_supported(CPUState *cs, int feature)
1286 {
1287     uint32_t func, bits;
1288     int i, reg;
1289 
1290     for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
1291 
1292         func = kvm_hyperv_properties[feature].flags[i].func;
1293         reg = kvm_hyperv_properties[feature].flags[i].reg;
1294         bits = kvm_hyperv_properties[feature].flags[i].bits;
1295 
1296         if (!func) {
1297             continue;
1298         }
1299 
1300         if ((hv_cpuid_get_host(cs, func, reg) & bits) != bits) {
1301             return false;
1302         }
1303     }
1304 
1305     return true;
1306 }
1307 
1308 /* Checks that all feature dependencies are enabled */
1309 static bool hv_feature_check_deps(X86CPU *cpu, int feature, Error **errp)
1310 {
1311     uint64_t deps;
1312     int dep_feat;
1313 
1314     deps = kvm_hyperv_properties[feature].dependencies;
1315     while (deps) {
1316         dep_feat = ctz64(deps);
1317         if (!(hyperv_feat_enabled(cpu, dep_feat))) {
1318             error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1319                        kvm_hyperv_properties[feature].desc,
1320                        kvm_hyperv_properties[dep_feat].desc);
1321             return false;
1322         }
1323         deps &= ~(1ull << dep_feat);
1324     }
1325 
1326     return true;
1327 }
1328 
1329 static uint32_t hv_build_cpuid_leaf(CPUState *cs, uint32_t func, int reg)
1330 {
1331     X86CPU *cpu = X86_CPU(cs);
1332     uint32_t r = 0;
1333     int i, j;
1334 
1335     for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties); i++) {
1336         if (!hyperv_feat_enabled(cpu, i)) {
1337             continue;
1338         }
1339 
1340         for (j = 0; j < ARRAY_SIZE(kvm_hyperv_properties[i].flags); j++) {
1341             if (kvm_hyperv_properties[i].flags[j].func != func) {
1342                 continue;
1343             }
1344             if (kvm_hyperv_properties[i].flags[j].reg != reg) {
1345                 continue;
1346             }
1347 
1348             r |= kvm_hyperv_properties[i].flags[j].bits;
1349         }
1350     }
1351 
1352     /* HV_CPUID_NESTED_FEATURES.EAX also encodes the supported eVMCS range */
1353     if (func == HV_CPUID_NESTED_FEATURES && reg == R_EAX) {
1354         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1355             r |= DEFAULT_EVMCS_VERSION;
1356         }
1357     }
1358 
1359     return r;
1360 }
1361 
1362 /*
1363  * Expand Hyper-V CPU features. In partucular, check that all the requested
1364  * features are supported by the host and the sanity of the configuration
1365  * (that all the required dependencies are included). Also, this takes care
1366  * of 'hv_passthrough' mode and fills the environment with all supported
1367  * Hyper-V features.
1368  */
1369 bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
1370 {
1371     CPUState *cs = CPU(cpu);
1372     Error *local_err = NULL;
1373     int feat;
1374 
1375     if (!hyperv_enabled(cpu))
1376         return true;
1377 
1378     /*
1379      * When kvm_hyperv_expand_features is called at CPU feature expansion
1380      * time per-CPU kvm_state is not available yet so we can only proceed
1381      * when KVM_CAP_SYS_HYPERV_CPUID is supported.
1382      */
1383     if (!cs->kvm_state &&
1384         !kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID))
1385         return true;
1386 
1387     if (cpu->hyperv_passthrough) {
1388         cpu->hyperv_vendor_id[0] =
1389             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EBX);
1390         cpu->hyperv_vendor_id[1] =
1391             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_ECX);
1392         cpu->hyperv_vendor_id[2] =
1393             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EDX);
1394         cpu->hyperv_vendor = g_realloc(cpu->hyperv_vendor,
1395                                        sizeof(cpu->hyperv_vendor_id) + 1);
1396         memcpy(cpu->hyperv_vendor, cpu->hyperv_vendor_id,
1397                sizeof(cpu->hyperv_vendor_id));
1398         cpu->hyperv_vendor[sizeof(cpu->hyperv_vendor_id)] = 0;
1399 
1400         cpu->hyperv_interface_id[0] =
1401             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EAX);
1402         cpu->hyperv_interface_id[1] =
1403             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EBX);
1404         cpu->hyperv_interface_id[2] =
1405             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_ECX);
1406         cpu->hyperv_interface_id[3] =
1407             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EDX);
1408 
1409         cpu->hyperv_ver_id_build =
1410             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EAX);
1411         cpu->hyperv_ver_id_major =
1412             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) >> 16;
1413         cpu->hyperv_ver_id_minor =
1414             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) & 0xffff;
1415         cpu->hyperv_ver_id_sp =
1416             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_ECX);
1417         cpu->hyperv_ver_id_sb =
1418             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) >> 24;
1419         cpu->hyperv_ver_id_sn =
1420             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) & 0xffffff;
1421 
1422         cpu->hv_max_vps = hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS,
1423                                             R_EAX);
1424         cpu->hyperv_limits[0] =
1425             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EBX);
1426         cpu->hyperv_limits[1] =
1427             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_ECX);
1428         cpu->hyperv_limits[2] =
1429             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EDX);
1430 
1431         cpu->hyperv_spinlock_attempts =
1432             hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EBX);
1433 
1434         /*
1435          * Mark feature as enabled in 'cpu->hyperv_features' as
1436          * hv_build_cpuid_leaf() uses this info to build guest CPUIDs.
1437          */
1438         for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1439             if (hyperv_feature_supported(cs, feat)) {
1440                 cpu->hyperv_features |= BIT(feat);
1441             }
1442         }
1443     } else {
1444         /* Check features availability and dependencies */
1445         for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1446             /* If the feature was not requested skip it. */
1447             if (!hyperv_feat_enabled(cpu, feat)) {
1448                 continue;
1449             }
1450 
1451             /* Check if the feature is supported by KVM */
1452             if (!hyperv_feature_supported(cs, feat)) {
1453                 error_setg(errp, "Hyper-V %s is not supported by kernel",
1454                            kvm_hyperv_properties[feat].desc);
1455                 return false;
1456             }
1457 
1458             /* Check dependencies */
1459             if (!hv_feature_check_deps(cpu, feat, &local_err)) {
1460                 error_propagate(errp, local_err);
1461                 return false;
1462             }
1463         }
1464     }
1465 
1466     /* Additional dependencies not covered by kvm_hyperv_properties[] */
1467     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1468         !cpu->hyperv_synic_kvm_only &&
1469         !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
1470         error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1471                    kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
1472                    kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
1473         return false;
1474     }
1475 
1476     return true;
1477 }
1478 
1479 /*
1480  * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent.
1481  */
1482 static int hyperv_fill_cpuids(CPUState *cs,
1483                               struct kvm_cpuid_entry2 *cpuid_ent)
1484 {
1485     X86CPU *cpu = X86_CPU(cs);
1486     struct kvm_cpuid_entry2 *c;
1487     uint32_t signature[3];
1488     uint32_t cpuid_i = 0, max_cpuid_leaf = 0;
1489     uint32_t nested_eax =
1490         hv_build_cpuid_leaf(cs, HV_CPUID_NESTED_FEATURES, R_EAX);
1491 
1492     max_cpuid_leaf = nested_eax ? HV_CPUID_NESTED_FEATURES :
1493         HV_CPUID_IMPLEMENT_LIMITS;
1494 
1495     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
1496         max_cpuid_leaf =
1497             MAX(max_cpuid_leaf, HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
1498     }
1499 
1500     c = &cpuid_ent[cpuid_i++];
1501     c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
1502     c->eax = max_cpuid_leaf;
1503     c->ebx = cpu->hyperv_vendor_id[0];
1504     c->ecx = cpu->hyperv_vendor_id[1];
1505     c->edx = cpu->hyperv_vendor_id[2];
1506 
1507     c = &cpuid_ent[cpuid_i++];
1508     c->function = HV_CPUID_INTERFACE;
1509     c->eax = cpu->hyperv_interface_id[0];
1510     c->ebx = cpu->hyperv_interface_id[1];
1511     c->ecx = cpu->hyperv_interface_id[2];
1512     c->edx = cpu->hyperv_interface_id[3];
1513 
1514     c = &cpuid_ent[cpuid_i++];
1515     c->function = HV_CPUID_VERSION;
1516     c->eax = cpu->hyperv_ver_id_build;
1517     c->ebx = (uint32_t)cpu->hyperv_ver_id_major << 16 |
1518         cpu->hyperv_ver_id_minor;
1519     c->ecx = cpu->hyperv_ver_id_sp;
1520     c->edx = (uint32_t)cpu->hyperv_ver_id_sb << 24 |
1521         (cpu->hyperv_ver_id_sn & 0xffffff);
1522 
1523     c = &cpuid_ent[cpuid_i++];
1524     c->function = HV_CPUID_FEATURES;
1525     c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EAX);
1526     c->ebx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EBX);
1527     c->edx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EDX);
1528 
1529     /* Unconditionally required with any Hyper-V enlightenment */
1530     c->eax |= HV_HYPERCALL_AVAILABLE;
1531 
1532     /* SynIC and Vmbus devices require messages/signals hypercalls */
1533     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1534         !cpu->hyperv_synic_kvm_only) {
1535         c->ebx |= HV_POST_MESSAGES | HV_SIGNAL_EVENTS;
1536     }
1537 
1538 
1539     /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
1540     c->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1541 
1542     c = &cpuid_ent[cpuid_i++];
1543     c->function = HV_CPUID_ENLIGHTMENT_INFO;
1544     c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX);
1545     c->ebx = cpu->hyperv_spinlock_attempts;
1546 
1547     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
1548         !hyperv_feat_enabled(cpu, HYPERV_FEAT_AVIC)) {
1549         c->eax |= HV_APIC_ACCESS_RECOMMENDED;
1550     }
1551 
1552     if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) {
1553         c->eax |= HV_NO_NONARCH_CORESHARING;
1554     } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) {
1555         c->eax |= hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX) &
1556             HV_NO_NONARCH_CORESHARING;
1557     }
1558 
1559     c = &cpuid_ent[cpuid_i++];
1560     c->function = HV_CPUID_IMPLEMENT_LIMITS;
1561     c->eax = cpu->hv_max_vps;
1562     c->ebx = cpu->hyperv_limits[0];
1563     c->ecx = cpu->hyperv_limits[1];
1564     c->edx = cpu->hyperv_limits[2];
1565 
1566     if (nested_eax) {
1567         uint32_t function;
1568 
1569         /* Create zeroed 0x40000006..0x40000009 leaves */
1570         for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
1571              function < HV_CPUID_NESTED_FEATURES; function++) {
1572             c = &cpuid_ent[cpuid_i++];
1573             c->function = function;
1574         }
1575 
1576         c = &cpuid_ent[cpuid_i++];
1577         c->function = HV_CPUID_NESTED_FEATURES;
1578         c->eax = nested_eax;
1579     }
1580 
1581     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
1582         c = &cpuid_ent[cpuid_i++];
1583         c->function = HV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS;
1584         c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
1585             HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
1586         memcpy(signature, "Microsoft VS", 12);
1587         c->eax = 0;
1588         c->ebx = signature[0];
1589         c->ecx = signature[1];
1590         c->edx = signature[2];
1591 
1592         c = &cpuid_ent[cpuid_i++];
1593         c->function = HV_CPUID_SYNDBG_INTERFACE;
1594         memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12);
1595         c->eax = signature[0];
1596         c->ebx = 0;
1597         c->ecx = 0;
1598         c->edx = 0;
1599 
1600         c = &cpuid_ent[cpuid_i++];
1601         c->function = HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES;
1602         c->eax = HV_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
1603         c->ebx = 0;
1604         c->ecx = 0;
1605         c->edx = 0;
1606     }
1607 
1608     return cpuid_i;
1609 }
1610 
/*
 * Migration blockers registered by hyperv_init_vcpu() the first time a vCPU
 * uses 'hv-passthrough' or 'hv-no-nonarch-coresharing=auto' respectively;
 * NULL until then.
 */
static Error *hv_passthrough_mig_blocker;
static Error *hv_no_nonarch_cs_mig_blocker;
1613 
/*
 * Tell whether the eVMCS version range we expose lies within the range KVM
 * supports. Both arguments encode a range as (max << 8) | min.
 */
static bool evmcs_version_supported(uint16_t evmcs_version,
                                    uint16_t supported_evmcs_version)
{
    uint8_t want_lo = evmcs_version & 0xff;
    uint8_t want_hi = evmcs_version >> 8;
    uint8_t have_lo = supported_evmcs_version & 0xff;
    uint8_t have_hi = supported_evmcs_version >> 8;

    if (want_lo < have_lo) {
        return false;
    }

    return want_hi <= have_hi;
}
1626 
/*
 * Per-vCPU Hyper-V initialization: registers migration blockers for the
 * configurations which cannot be migrated, checks that the kernel's
 * VP_INDEX matches QEMU's one when it cannot be set, and enables the KVM
 * capabilities behind SynIC, enlightened VMCS and CPUID enforcement.
 *
 * Returns 0 on success and a negative value on failure.
 */
static int hyperv_init_vcpu(X86CPU *cpu)
{
    CPUState *cs = CPU(cpu);
    Error *local_err = NULL;
    int ret;

    /* The blocker is global: only the first vCPU needing it registers it */
    if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
        error_setg(&hv_passthrough_mig_blocker,
                   "'hv-passthrough' CPU flag prevents migration, use explicit"
                   " set of hv-* flags instead");
        ret = migrate_add_blocker(&hv_passthrough_mig_blocker, &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            return ret;
        }
    }

    /* Same one-time registration for 'hv-no-nonarch-coresharing=auto' */
    if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO &&
        hv_no_nonarch_cs_mig_blocker == NULL) {
        error_setg(&hv_no_nonarch_cs_mig_blocker,
                   "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration"
                   " use explicit 'hv-no-nonarch-coresharing=on' instead (but"
                   " make sure SMT is disabled and/or that vCPUs are properly"
                   " pinned)");
        ret = migrate_add_blocker(&hv_no_nonarch_cs_mig_blocker, &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            return ret;
        }
    }

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
        /*
         * the kernel doesn't support setting vp_index; assert that its value
         * is in sync
         */
        uint64_t value;

        ret = kvm_get_one_msr(cpu, HV_X64_MSR_VP_INDEX, &value);
        if (ret < 0) {
            return ret;
        }

        if (value != hyperv_vp_index(CPU(cpu))) {
            error_report("kernel's vp_index != QEMU's vp_index");
            return -ENXIO;
        }
    }

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
        /* Pick the capability matching the SynIC mode in use */
        uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
            KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
        ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
        if (ret < 0) {
            error_report("failed to turn on HyperV SynIC in KVM: %s",
                         strerror(-ret));
            return ret;
        }

        /* The QEMU side SynIC object is not created in KVM-only mode */
        if (!cpu->hyperv_synic_kvm_only) {
            ret = hyperv_x86_synic_add(cpu);
            if (ret < 0) {
                error_report("failed to create HyperV SynIC: %s",
                             strerror(-ret));
                return ret;
            }
        }
    }

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
        uint16_t evmcs_version = DEFAULT_EVMCS_VERSION;
        uint16_t supported_evmcs_version;

        /* The address passed as argument receives the supported range */
        ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
                                  (uintptr_t)&supported_evmcs_version);

        /*
         * KVM is required to support EVMCS ver.1. as that's what 'hv-evmcs'
         * option sets. Note: we hardcode the maximum supported eVMCS version
         * to '1' as well so 'hv-evmcs' feature is migratable even when (and if)
         * ver.2 is implemented. A new option (e.g. 'hv-evmcs=2') will then have
         * to be added.
         */
        if (ret < 0) {
            error_report("Hyper-V %s is not supported by kernel",
                         kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
            return ret;
        }

        if (!evmcs_version_supported(evmcs_version, supported_evmcs_version)) {
            error_report("eVMCS version range [%d..%d] is not supported by "
                         "kernel (supported: [%d..%d])", evmcs_version & 0xff,
                         evmcs_version >> 8, supported_evmcs_version & 0xff,
                         supported_evmcs_version >> 8);
            return -ENOTSUP;
        }
    }

    if (cpu->hyperv_enforce_cpuid) {
        ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENFORCE_CPUID, 0, 1);
        if (ret < 0) {
            error_report("failed to enable KVM_CAP_HYPERV_ENFORCE_CPUID: %s",
                         strerror(-ret));
            return ret;
        }
    }

    /* Skip SynIC and VP_INDEX since they are hard deps already */
    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_STIMER) &&
        hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
        hyperv_feat_enabled(cpu, HYPERV_FEAT_RUNTIME)) {
        hyperv_x86_set_vmbus_recommended_features_enabled();
    }

    return 0;
}
1743 
/* NOTE(review): presumably the migration blocker used for invtsc - confirm */
static Error *invtsc_mig_blocker;

/* Upper bound on the number of CPUID entries built for a vCPU */
#define KVM_MAX_CPUID_ENTRIES  100
1747 
1748 static void kvm_init_xsave(CPUX86State *env)
1749 {
1750     if (has_xsave2) {
1751         env->xsave_buf_len = QEMU_ALIGN_UP(has_xsave2, 4096);
1752     } else {
1753         env->xsave_buf_len = sizeof(struct kvm_xsave);
1754     }
1755 
1756     env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
1757     memset(env->xsave_buf, 0, env->xsave_buf_len);
1758     /*
1759      * The allocated storage must be large enough for all of the
1760      * possible XSAVE state components.
1761      */
1762     assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) <=
1763            env->xsave_buf_len);
1764 }
1765 
1766 static void kvm_init_nested_state(CPUX86State *env)
1767 {
1768     struct kvm_vmx_nested_state_hdr *vmx_hdr;
1769     uint32_t size;
1770 
1771     if (!env->nested_state) {
1772         return;
1773     }
1774 
1775     size = env->nested_state->size;
1776 
1777     memset(env->nested_state, 0, size);
1778     env->nested_state->size = size;
1779 
1780     if (cpu_has_vmx(env)) {
1781         env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
1782         vmx_hdr = &env->nested_state->hdr.vmx;
1783         vmx_hdr->vmxon_pa = -1ull;
1784         vmx_hdr->vmcs12_pa = -1ull;
1785     } else if (cpu_has_svm(env)) {
1786         env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
1787     }
1788 }
1789 
1790 static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
1791                                     struct kvm_cpuid_entry2 *entries,
1792                                     uint32_t cpuid_i)
1793 {
1794     uint32_t limit, i, j;
1795     uint32_t unused;
1796     struct kvm_cpuid_entry2 *c;
1797 
1798     cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
1799 
1800     for (i = 0; i <= limit; i++) {
1801         j = 0;
1802         if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1803             goto full;
1804         }
1805         c = &entries[cpuid_i++];
1806         switch (i) {
1807         case 2: {
1808             /* Keep reading function 2 till all the input is received */
1809             int times;
1810 
1811             c->function = i;
1812             c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
1813                        KVM_CPUID_FLAG_STATE_READ_NEXT;
1814             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1815             times = c->eax & 0xff;
1816 
1817             for (j = 1; j < times; ++j) {
1818                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1819                     goto full;
1820                 }
1821                 c = &entries[cpuid_i++];
1822                 c->function = i;
1823                 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
1824                 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1825             }
1826             break;
1827         }
1828         case 0x1f:
1829             if (!x86_has_extended_topo(env->avail_cpu_topo)) {
1830                 cpuid_i--;
1831                 break;
1832             }
1833             /* fallthrough */
1834         case 4:
1835         case 0xb:
1836         case 0xd:
1837             for (j = 0; ; j++) {
1838                 if (i == 0xd && j == 64) {
1839                     break;
1840                 }
1841 
1842                 c->function = i;
1843                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1844                 c->index = j;
1845                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1846 
1847                 if (i == 4 && c->eax == 0) {
1848                     break;
1849                 }
1850                 if (i == 0xb && !(c->ecx & 0xff00)) {
1851                     break;
1852                 }
1853                 if (i == 0x1f && !(c->ecx & 0xff00)) {
1854                     break;
1855                 }
1856                 if (i == 0xd && c->eax == 0) {
1857                     continue;
1858                 }
1859                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1860                     goto full;
1861                 }
1862                 c = &entries[cpuid_i++];
1863             }
1864             break;
1865         case 0x12:
1866             for (j = 0; ; j++) {
1867                 c->function = i;
1868                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1869                 c->index = j;
1870                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1871 
1872                 if (j > 1 && (c->eax & 0xf) != 1) {
1873                     break;
1874                 }
1875 
1876                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1877                     goto full;
1878                 }
1879                 c = &entries[cpuid_i++];
1880             }
1881             break;
1882         case 0x7:
1883         case 0x14:
1884         case 0x1d:
1885         case 0x1e: {
1886             uint32_t times;
1887 
1888             c->function = i;
1889             c->index = 0;
1890             c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1891             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1892             times = c->eax;
1893 
1894             for (j = 1; j <= times; ++j) {
1895                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1896                     goto full;
1897                 }
1898                 c = &entries[cpuid_i++];
1899                 c->function = i;
1900                 c->index = j;
1901                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1902                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1903             }
1904             break;
1905         }
1906         default:
1907             c->function = i;
1908             c->flags = 0;
1909             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1910             if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
1911                 /*
1912                  * KVM already returns all zeroes if a CPUID entry is missing,
1913                  * so we can omit it and avoid hitting KVM's 80-entry limit.
1914                  */
1915                 cpuid_i--;
1916             }
1917             break;
1918         }
1919     }
1920 
1921     if (limit >= 0x0a) {
1922         uint32_t eax, edx;
1923 
1924         cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);
1925 
1926         has_architectural_pmu_version = eax & 0xff;
1927         if (has_architectural_pmu_version > 0) {
1928             num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;
1929 
1930             /* Shouldn't be more than 32, since that's the number of bits
1931              * available in EBX to tell us _which_ counters are available.
1932              * Play it safe.
1933              */
1934             if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
1935                 num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
1936             }
1937 
1938             if (has_architectural_pmu_version > 1) {
1939                 num_architectural_pmu_fixed_counters = edx & 0x1f;
1940 
1941                 if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
1942                     num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
1943                 }
1944             }
1945         }
1946     }
1947 
1948     cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
1949 
1950     for (i = 0x80000000; i <= limit; i++) {
1951         j = 0;
1952         if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1953             goto full;
1954         }
1955         c = &entries[cpuid_i++];
1956 
1957         switch (i) {
1958         case 0x8000001d:
1959             /* Query for all AMD cache information leaves */
1960             for (j = 0; ; j++) {
1961                 c->function = i;
1962                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1963                 c->index = j;
1964                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1965 
1966                 if (c->eax == 0) {
1967                     break;
1968                 }
1969                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1970                     goto full;
1971                 }
1972                 c = &entries[cpuid_i++];
1973             }
1974             break;
1975         default:
1976             c->function = i;
1977             c->flags = 0;
1978             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1979             if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
1980                 /*
1981                  * KVM already returns all zeroes if a CPUID entry is missing,
1982                  * so we can omit it and avoid hitting KVM's 80-entry limit.
1983                  */
1984                 cpuid_i--;
1985             }
1986             break;
1987         }
1988     }
1989 
1990     /* Call Centaur's CPUID instructions they are supported. */
1991     if (env->cpuid_xlevel2 > 0) {
1992         cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);
1993 
1994         for (i = 0xC0000000; i <= limit; i++) {
1995             j = 0;
1996             if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1997                 goto full;
1998             }
1999             c = &entries[cpuid_i++];
2000 
2001             c->function = i;
2002             c->flags = 0;
2003             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
2004         }
2005     }
2006 
2007     return cpuid_i;
2008 
2009 full:
2010     fprintf(stderr, "cpuid_data is full, no space for "
2011             "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
2012     abort();
2013 }
2014 
2015 int kvm_arch_init_vcpu(CPUState *cs)
2016 {
2017     struct {
2018         struct kvm_cpuid2 cpuid;
2019         struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
2020     } cpuid_data;
2021     /*
2022      * The kernel defines these structs with padding fields so there
2023      * should be no extra padding in our cpuid_data struct.
2024      */
2025     QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
2026                       sizeof(struct kvm_cpuid2) +
2027                       sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
2028 
2029     X86CPU *cpu = X86_CPU(cs);
2030     CPUX86State *env = &cpu->env;
2031     uint32_t cpuid_i;
2032     struct kvm_cpuid_entry2 *c;
2033     uint32_t signature[3];
2034     int kvm_base = KVM_CPUID_SIGNATURE;
2035     int max_nested_state_len;
2036     int r;
2037     Error *local_err = NULL;
2038 
2039     memset(&cpuid_data, 0, sizeof(cpuid_data));
2040 
2041     cpuid_i = 0;
2042 
2043     has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2);
2044 
2045     r = kvm_arch_set_tsc_khz(cs);
2046     if (r < 0) {
2047         return r;
2048     }
2049 
2050     /* vcpu's TSC frequency is either specified by user, or following
2051      * the value used by KVM if the former is not present. In the
2052      * latter case, we query it from KVM and record in env->tsc_khz,
2053      * so that vcpu's TSC frequency can be migrated later via this field.
2054      */
2055     if (!env->tsc_khz) {
2056         r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
2057             kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
2058             -ENOTSUP;
2059         if (r > 0) {
2060             env->tsc_khz = r;
2061         }
2062     }
2063 
2064     env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
2065 
2066     /*
2067      * kvm_hyperv_expand_features() is called here for the second time in case
2068      * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
2069      * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to
2070      * check which Hyper-V enlightenments are supported and which are not, we
2071      * can still proceed and check/expand Hyper-V enlightenments here so legacy
2072      * behavior is preserved.
2073      */
2074     if (!kvm_hyperv_expand_features(cpu, &local_err)) {
2075         error_report_err(local_err);
2076         return -ENOSYS;
2077     }
2078 
2079     if (hyperv_enabled(cpu)) {
2080         r = hyperv_init_vcpu(cpu);
2081         if (r) {
2082             return r;
2083         }
2084 
2085         cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries);
2086         kvm_base = KVM_CPUID_SIGNATURE_NEXT;
2087         has_msr_hv_hypercall = true;
2088     }
2089 
2090     if (cs->kvm_state->xen_version) {
2091 #ifdef CONFIG_XEN_EMU
2092         struct kvm_cpuid_entry2 *xen_max_leaf;
2093 
2094         memcpy(signature, "XenVMMXenVMM", 12);
2095 
2096         xen_max_leaf = c = &cpuid_data.entries[cpuid_i++];
2097         c->function = kvm_base + XEN_CPUID_SIGNATURE;
2098         c->eax = kvm_base + XEN_CPUID_TIME;
2099         c->ebx = signature[0];
2100         c->ecx = signature[1];
2101         c->edx = signature[2];
2102 
2103         c = &cpuid_data.entries[cpuid_i++];
2104         c->function = kvm_base + XEN_CPUID_VENDOR;
2105         c->eax = cs->kvm_state->xen_version;
2106         c->ebx = 0;
2107         c->ecx = 0;
2108         c->edx = 0;
2109 
2110         c = &cpuid_data.entries[cpuid_i++];
2111         c->function = kvm_base + XEN_CPUID_HVM_MSR;
2112         /* Number of hypercall-transfer pages */
2113         c->eax = 1;
2114         /* Hypercall MSR base address */
2115         if (hyperv_enabled(cpu)) {
2116             c->ebx = XEN_HYPERCALL_MSR_HYPERV;
2117             kvm_xen_init(cs->kvm_state, c->ebx);
2118         } else {
2119             c->ebx = XEN_HYPERCALL_MSR;
2120         }
2121         c->ecx = 0;
2122         c->edx = 0;
2123 
2124         c = &cpuid_data.entries[cpuid_i++];
2125         c->function = kvm_base + XEN_CPUID_TIME;
2126         c->eax = ((!!tsc_is_stable_and_known(env) << 1) |
2127             (!!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP) << 2));
2128         /* default=0 (emulate if necessary) */
2129         c->ebx = 0;
2130         /* guest tsc frequency */
2131         c->ecx = env->user_tsc_khz;
2132         /* guest tsc incarnation (migration count) */
2133         c->edx = 0;
2134 
2135         c = &cpuid_data.entries[cpuid_i++];
2136         c->function = kvm_base + XEN_CPUID_HVM;
2137         xen_max_leaf->eax = kvm_base + XEN_CPUID_HVM;
2138         if (cs->kvm_state->xen_version >= XEN_VERSION(4, 5)) {
2139             c->function = kvm_base + XEN_CPUID_HVM;
2140 
2141             if (cpu->xen_vapic) {
2142                 c->eax |= XEN_HVM_CPUID_APIC_ACCESS_VIRT;
2143                 c->eax |= XEN_HVM_CPUID_X2APIC_VIRT;
2144             }
2145 
2146             c->eax |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
2147 
2148             if (cs->kvm_state->xen_version >= XEN_VERSION(4, 6)) {
2149                 c->eax |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
2150                 c->ebx = cs->cpu_index;
2151             }
2152 
2153             if (cs->kvm_state->xen_version >= XEN_VERSION(4, 17)) {
2154                 c->eax |= XEN_HVM_CPUID_UPCALL_VECTOR;
2155             }
2156         }
2157 
2158         r = kvm_xen_init_vcpu(cs);
2159         if (r) {
2160             return r;
2161         }
2162 
2163         kvm_base += 0x100;
2164 #else /* CONFIG_XEN_EMU */
2165         /* This should never happen as kvm_arch_init() would have died first. */
2166         fprintf(stderr, "Cannot enable Xen CPUID without Xen support\n");
2167         abort();
2168 #endif
2169     } else if (cpu->expose_kvm) {
2170         memcpy(signature, "KVMKVMKVM\0\0\0", 12);
2171         c = &cpuid_data.entries[cpuid_i++];
2172         c->function = KVM_CPUID_SIGNATURE | kvm_base;
2173         c->eax = KVM_CPUID_FEATURES | kvm_base;
2174         c->ebx = signature[0];
2175         c->ecx = signature[1];
2176         c->edx = signature[2];
2177 
2178         c = &cpuid_data.entries[cpuid_i++];
2179         c->function = KVM_CPUID_FEATURES | kvm_base;
2180         c->eax = env->features[FEAT_KVM];
2181         c->edx = env->features[FEAT_KVM_HINTS];
2182     }
2183 
2184     if (cpu->kvm_pv_enforce_cpuid) {
2185         r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1);
2186         if (r < 0) {
2187             fprintf(stderr,
2188                     "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s",
2189                     strerror(-r));
2190             abort();
2191         }
2192     }
2193 
2194     cpuid_i = kvm_x86_build_cpuid(env, cpuid_data.entries, cpuid_i);
2195     cpuid_data.cpuid.nent = cpuid_i;
2196 
2197     if (((env->cpuid_version >> 8)&0xF) >= 6
2198         && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
2199            (CPUID_MCE | CPUID_MCA)) {
2200         uint64_t mcg_cap, unsupported_caps;
2201         int banks;
2202         int ret;
2203 
2204         ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
2205         if (ret < 0) {
2206             fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
2207             return ret;
2208         }
2209 
2210         if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) {
2211             error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)",
2212                          (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
2213             return -ENOTSUP;
2214         }
2215 
2216         unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
2217         if (unsupported_caps) {
2218             if (unsupported_caps & MCG_LMCE_P) {
2219                 error_report("kvm: LMCE not supported");
2220                 return -ENOTSUP;
2221             }
2222             warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64,
2223                         unsupported_caps);
2224         }
2225 
2226         env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
2227         ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
2228         if (ret < 0) {
2229             fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
2230             return ret;
2231         }
2232     }
2233 
2234     cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env);
2235 
2236     c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
2237     if (c) {
2238         has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
2239                                   !!(c->ecx & CPUID_EXT_SMX);
2240     }
2241 
2242     c = cpuid_find_entry(&cpuid_data.cpuid, 7, 0);
2243     if (c && (c->ebx & CPUID_7_0_EBX_SGX)) {
2244         has_msr_feature_control = true;
2245     }
2246 
2247     if (env->mcg_cap & MCG_LMCE_P) {
2248         has_msr_mcg_ext_ctl = has_msr_feature_control = true;
2249     }
2250 
2251     if (!env->user_tsc_khz) {
2252         if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
2253             invtsc_mig_blocker == NULL) {
2254             error_setg(&invtsc_mig_blocker,
2255                        "State blocked by non-migratable CPU device"
2256                        " (invtsc flag)");
2257             r = migrate_add_blocker(&invtsc_mig_blocker, &local_err);
2258             if (r < 0) {
2259                 error_report_err(local_err);
2260                 return r;
2261             }
2262         }
2263     }
2264 
2265     if (cpu->vmware_cpuid_freq
2266         /* Guests depend on 0x40000000 to detect this feature, so only expose
2267          * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */
2268         && cpu->expose_kvm
2269         && kvm_base == KVM_CPUID_SIGNATURE
2270         /* TSC clock must be stable and known for this feature. */
2271         && tsc_is_stable_and_known(env)) {
2272 
2273         c = &cpuid_data.entries[cpuid_i++];
2274         c->function = KVM_CPUID_SIGNATURE | 0x10;
2275         c->eax = env->tsc_khz;
2276         c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */
2277         c->ecx = c->edx = 0;
2278 
2279         c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0);
2280         c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
2281     }
2282 
2283     cpuid_data.cpuid.nent = cpuid_i;
2284 
2285     cpuid_data.cpuid.padding = 0;
2286     r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
2287     if (r) {
2288         goto fail;
2289     }
2290     kvm_init_xsave(env);
2291 
2292     max_nested_state_len = kvm_max_nested_state_length();
2293     if (max_nested_state_len > 0) {
2294         assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
2295 
2296         if (cpu_has_vmx(env) || cpu_has_svm(env)) {
2297             env->nested_state = g_malloc0(max_nested_state_len);
2298             env->nested_state->size = max_nested_state_len;
2299 
2300             kvm_init_nested_state(env);
2301         }
2302     }
2303 
2304     cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
2305 
2306     if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
2307         has_msr_tsc_aux = false;
2308     }
2309 
2310     kvm_init_msrs(cpu);
2311 
2312     return 0;
2313 
2314  fail:
2315     migrate_del_blocker(&invtsc_mig_blocker);
2316 
2317     return r;
2318 }
2319 
2320 int kvm_arch_destroy_vcpu(CPUState *cs)
2321 {
2322     X86CPU *cpu = X86_CPU(cs);
2323     CPUX86State *env = &cpu->env;
2324 
2325     g_free(env->xsave_buf);
2326 
2327     g_free(cpu->kvm_msr_buf);
2328     cpu->kvm_msr_buf = NULL;
2329 
2330     g_free(env->nested_state);
2331     env->nested_state = NULL;
2332 
2333     qemu_del_vm_change_state_handler(cpu->vmsentry);
2334 
2335     return 0;
2336 }
2337 
2338 void kvm_arch_reset_vcpu(X86CPU *cpu)
2339 {
2340     CPUX86State *env = &cpu->env;
2341 
2342     env->xcr0 = 1;
2343     if (kvm_irqchip_in_kernel()) {
2344         env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
2345                                           KVM_MP_STATE_UNINITIALIZED;
2346     } else {
2347         env->mp_state = KVM_MP_STATE_RUNNABLE;
2348     }
2349 
2350     /* enabled by default */
2351     env->poll_control_msr = 1;
2352 
2353     kvm_init_nested_state(env);
2354 
2355     sev_es_set_reset_vector(CPU(cpu));
2356 }
2357 
2358 void kvm_arch_after_reset_vcpu(X86CPU *cpu)
2359 {
2360     CPUX86State *env = &cpu->env;
2361     int i;
2362 
2363     /*
2364      * Reset SynIC after all other devices have been reset to let them remove
2365      * their SINT routes first.
2366      */
2367     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
2368         for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
2369             env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
2370         }
2371 
2372         hyperv_x86_synic_reset(cpu);
2373     }
2374 }
2375 
2376 void kvm_arch_do_init_vcpu(X86CPU *cpu)
2377 {
2378     CPUX86State *env = &cpu->env;
2379 
2380     /* APs get directly into wait-for-SIPI state.  */
2381     if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
2382         env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
2383     }
2384 }
2385 
2386 static int kvm_get_supported_feature_msrs(KVMState *s)
2387 {
2388     int ret = 0;
2389 
2390     if (kvm_feature_msrs != NULL) {
2391         return 0;
2392     }
2393 
2394     if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) {
2395         return 0;
2396     }
2397 
2398     struct kvm_msr_list msr_list;
2399 
2400     msr_list.nmsrs = 0;
2401     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list);
2402     if (ret < 0 && ret != -E2BIG) {
2403         error_report("Fetch KVM feature MSR list failed: %s",
2404             strerror(-ret));
2405         return ret;
2406     }
2407 
2408     assert(msr_list.nmsrs > 0);
2409     kvm_feature_msrs = g_malloc0(sizeof(msr_list) +
2410                  msr_list.nmsrs * sizeof(msr_list.indices[0]));
2411 
2412     kvm_feature_msrs->nmsrs = msr_list.nmsrs;
2413     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs);
2414 
2415     if (ret < 0) {
2416         error_report("Fetch KVM feature MSR list failed: %s",
2417             strerror(-ret));
2418         g_free(kvm_feature_msrs);
2419         kvm_feature_msrs = NULL;
2420         return ret;
2421     }
2422 
2423     return 0;
2424 }
2425 
2426 static int kvm_get_supported_msrs(KVMState *s)
2427 {
2428     int ret = 0;
2429     struct kvm_msr_list msr_list, *kvm_msr_list;
2430 
2431     /*
2432      *  Obtain MSR list from KVM.  These are the MSRs that we must
2433      *  save/restore.
2434      */
2435     msr_list.nmsrs = 0;
2436     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
2437     if (ret < 0 && ret != -E2BIG) {
2438         return ret;
2439     }
2440     /*
2441      * Old kernel modules had a bug and could write beyond the provided
2442      * memory. Allocate at least a safe amount of 1K.
2443      */
2444     kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
2445                                           msr_list.nmsrs *
2446                                           sizeof(msr_list.indices[0])));
2447 
2448     kvm_msr_list->nmsrs = msr_list.nmsrs;
2449     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
2450     if (ret >= 0) {
2451         int i;
2452 
2453         for (i = 0; i < kvm_msr_list->nmsrs; i++) {
2454             switch (kvm_msr_list->indices[i]) {
2455             case MSR_STAR:
2456                 has_msr_star = true;
2457                 break;
2458             case MSR_VM_HSAVE_PA:
2459                 has_msr_hsave_pa = true;
2460                 break;
2461             case MSR_TSC_AUX:
2462                 has_msr_tsc_aux = true;
2463                 break;
2464             case MSR_TSC_ADJUST:
2465                 has_msr_tsc_adjust = true;
2466                 break;
2467             case MSR_IA32_TSCDEADLINE:
2468                 has_msr_tsc_deadline = true;
2469                 break;
2470             case MSR_IA32_SMBASE:
2471                 has_msr_smbase = true;
2472                 break;
2473             case MSR_SMI_COUNT:
2474                 has_msr_smi_count = true;
2475                 break;
2476             case MSR_IA32_MISC_ENABLE:
2477                 has_msr_misc_enable = true;
2478                 break;
2479             case MSR_IA32_BNDCFGS:
2480                 has_msr_bndcfgs = true;
2481                 break;
2482             case MSR_IA32_XSS:
2483                 has_msr_xss = true;
2484                 break;
2485             case MSR_IA32_UMWAIT_CONTROL:
2486                 has_msr_umwait = true;
2487                 break;
2488             case HV_X64_MSR_CRASH_CTL:
2489                 has_msr_hv_crash = true;
2490                 break;
2491             case HV_X64_MSR_RESET:
2492                 has_msr_hv_reset = true;
2493                 break;
2494             case HV_X64_MSR_VP_INDEX:
2495                 has_msr_hv_vpindex = true;
2496                 break;
2497             case HV_X64_MSR_VP_RUNTIME:
2498                 has_msr_hv_runtime = true;
2499                 break;
2500             case HV_X64_MSR_SCONTROL:
2501                 has_msr_hv_synic = true;
2502                 break;
2503             case HV_X64_MSR_STIMER0_CONFIG:
2504                 has_msr_hv_stimer = true;
2505                 break;
2506             case HV_X64_MSR_TSC_FREQUENCY:
2507                 has_msr_hv_frequencies = true;
2508                 break;
2509             case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2510                 has_msr_hv_reenlightenment = true;
2511                 break;
2512             case HV_X64_MSR_SYNDBG_OPTIONS:
2513                 has_msr_hv_syndbg_options = true;
2514                 break;
2515             case MSR_IA32_SPEC_CTRL:
2516                 has_msr_spec_ctrl = true;
2517                 break;
2518             case MSR_AMD64_TSC_RATIO:
2519                 has_tsc_scale_msr = true;
2520                 break;
2521             case MSR_IA32_TSX_CTRL:
2522                 has_msr_tsx_ctrl = true;
2523                 break;
2524             case MSR_VIRT_SSBD:
2525                 has_msr_virt_ssbd = true;
2526                 break;
2527             case MSR_IA32_ARCH_CAPABILITIES:
2528                 has_msr_arch_capabs = true;
2529                 break;
2530             case MSR_IA32_CORE_CAPABILITY:
2531                 has_msr_core_capabs = true;
2532                 break;
2533             case MSR_IA32_PERF_CAPABILITIES:
2534                 has_msr_perf_capabs = true;
2535                 break;
2536             case MSR_IA32_VMX_VMFUNC:
2537                 has_msr_vmx_vmfunc = true;
2538                 break;
2539             case MSR_IA32_UCODE_REV:
2540                 has_msr_ucode_rev = true;
2541                 break;
2542             case MSR_IA32_VMX_PROCBASED_CTLS2:
2543                 has_msr_vmx_procbased_ctls2 = true;
2544                 break;
2545             case MSR_IA32_PKRS:
2546                 has_msr_pkrs = true;
2547                 break;
2548             }
2549         }
2550     }
2551 
2552     g_free(kvm_msr_list);
2553 
2554     return ret;
2555 }
2556 
2557 static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
2558                                         uint64_t *val)
2559 {
2560     CPUState *cs = CPU(cpu);
2561 
2562     *val = cs->nr_threads * cs->nr_cores; /* thread count, bits 15..0 */
2563     *val |= ((uint32_t)cs->nr_cores << 16); /* core count, bits 31..16 */
2564 
2565     return true;
2566 }
2567 
2568 static Notifier smram_machine_done;
2569 static KVMMemoryListener smram_listener;
2570 static AddressSpace smram_address_space;
2571 static MemoryRegion smram_as_root;
2572 static MemoryRegion smram_as_mem;
2573 
2574 static void register_smram_listener(Notifier *n, void *unused)
2575 {
2576     MemoryRegion *smram =
2577         (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
2578 
2579     /* Outer container... */
2580     memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
2581     memory_region_set_enabled(&smram_as_root, true);
2582 
2583     /* ... with two regions inside: normal system memory with low
2584      * priority, and...
2585      */
2586     memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
2587                              get_system_memory(), 0, ~0ull);
2588     memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
2589     memory_region_set_enabled(&smram_as_mem, true);
2590 
2591     if (smram) {
2592         /* ... SMRAM with higher priority */
2593         memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
2594         memory_region_set_enabled(smram, true);
2595     }
2596 
2597     address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
2598     kvm_memory_listener_register(kvm_state, &smram_listener,
2599                                  &smram_address_space, 1, "kvm-smram");
2600 }
2601 
2602 int kvm_arch_get_default_type(MachineState *ms)
2603 {
2604     return 0;
2605 }
2606 
2607 int kvm_arch_init(MachineState *ms, KVMState *s)
2608 {
2609     uint64_t identity_base = 0xfffbc000;
2610     uint64_t shadow_mem;
2611     int ret;
2612     struct utsname utsname;
2613     Error *local_err = NULL;
2614 
2615     /*
2616      * Initialize SEV context, if required
2617      *
2618      * If no memory encryption is requested (ms->cgs == NULL) this is
2619      * a no-op.
2620      *
2621      * It's also a no-op if a non-SEV confidential guest support
2622      * mechanism is selected.  SEV is the only mechanism available to
2623      * select on x86 at present, so this doesn't arise, but if new
2624      * mechanisms are supported in future (e.g. TDX), they'll need
2625      * their own initialization either here or elsewhere.
2626      */
2627     if (ms->cgs) {
2628         ret = confidential_guest_kvm_init(ms->cgs, &local_err);
2629         if (ret < 0) {
2630             error_report_err(local_err);
2631             return ret;
2632         }
2633     }
2634 
2635     has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
2636     has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0;
2637 
2638     hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
2639 
2640     has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
2641     if (has_exception_payload) {
2642         ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
2643         if (ret < 0) {
2644             error_report("kvm: Failed to enable exception payload cap: %s",
2645                          strerror(-ret));
2646             return ret;
2647         }
2648     }
2649 
2650     has_triple_fault_event = kvm_check_extension(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT);
2651     if (has_triple_fault_event) {
2652         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true);
2653         if (ret < 0) {
2654             error_report("kvm: Failed to enable triple fault event cap: %s",
2655                          strerror(-ret));
2656             return ret;
2657         }
2658     }
2659 
2660     if (s->xen_version) {
2661 #ifdef CONFIG_XEN_EMU
2662         if (!object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE)) {
2663             error_report("kvm: Xen support only available in PC machine");
2664             return -ENOTSUP;
2665         }
2666         /* hyperv_enabled() doesn't work yet. */
2667         uint32_t msr = XEN_HYPERCALL_MSR;
2668         ret = kvm_xen_init(s, msr);
2669         if (ret < 0) {
2670             return ret;
2671         }
2672 #else
2673         error_report("kvm: Xen support not enabled in qemu");
2674         return -ENOTSUP;
2675 #endif
2676     }
2677 
2678     ret = kvm_get_supported_msrs(s);
2679     if (ret < 0) {
2680         return ret;
2681     }
2682 
2683     kvm_get_supported_feature_msrs(s);
2684 
2685     uname(&utsname);
2686     lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
2687 
2688     /*
2689      * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
2690      * In order to use vm86 mode, an EPT identity map and a TSS  are needed.
2691      * Since these must be part of guest physical memory, we need to allocate
2692      * them, both by setting their start addresses in the kernel and by
2693      * creating a corresponding e820 entry. We need 4 pages before the BIOS,
2694      * so this value allows up to 16M BIOSes.
2695      */
2696     identity_base = 0xfeffc000;
2697     ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
2698     if (ret < 0) {
2699         return ret;
2700     }
2701 
2702     /* Set TSS base one page after EPT identity map. */
2703     ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
2704     if (ret < 0) {
2705         return ret;
2706     }
2707 
2708     /* Tell fw_cfg to notify the BIOS to reserve the range. */
2709     ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
2710     if (ret < 0) {
2711         fprintf(stderr, "e820_add_entry() table is full\n");
2712         return ret;
2713     }
2714 
2715     shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort);
2716     if (shadow_mem != -1) {
2717         shadow_mem /= 4096;
2718         ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
2719         if (ret < 0) {
2720             return ret;
2721         }
2722     }
2723 
2724     if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
2725         object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
2726         x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
2727         smram_machine_done.notify = register_smram_listener;
2728         qemu_add_machine_init_done_notifier(&smram_machine_done);
2729     }
2730 
2731     if (enable_cpu_pm) {
2732         int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
2733 /* Work around for kernel header with a typo. TODO: fix header and drop. */
2734 #if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
2735 #define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
2736 #endif
2737         if (disable_exits) {
2738             disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
2739                               KVM_X86_DISABLE_EXITS_HLT |
2740                               KVM_X86_DISABLE_EXITS_PAUSE |
2741                               KVM_X86_DISABLE_EXITS_CSTATE);
2742         }
2743 
2744         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
2745                                 disable_exits);
2746         if (ret < 0) {
2747             error_report("kvm: guest stopping CPU not supported: %s",
2748                          strerror(-ret));
2749         }
2750     }
2751 
2752     if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
2753         X86MachineState *x86ms = X86_MACHINE(ms);
2754 
2755         if (x86ms->bus_lock_ratelimit > 0) {
2756             ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT);
2757             if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) {
2758                 error_report("kvm: bus lock detection unsupported");
2759                 return -ENOTSUP;
2760             }
2761             ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0,
2762                                     KVM_BUS_LOCK_DETECTION_EXIT);
2763             if (ret < 0) {
2764                 error_report("kvm: Failed to enable bus lock detection cap: %s",
2765                              strerror(-ret));
2766                 return ret;
2767             }
2768             ratelimit_init(&bus_lock_ratelimit_ctrl);
2769             ratelimit_set_speed(&bus_lock_ratelimit_ctrl,
2770                                 x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME);
2771         }
2772     }
2773 
2774     if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE &&
2775         kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) {
2776             uint64_t notify_window_flags =
2777                 ((uint64_t)s->notify_window << 32) |
2778                 KVM_X86_NOTIFY_VMEXIT_ENABLED |
2779                 KVM_X86_NOTIFY_VMEXIT_USER;
2780             ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0,
2781                                     notify_window_flags);
2782             if (ret < 0) {
2783                 error_report("kvm: Failed to enable notify vmexit cap: %s",
2784                              strerror(-ret));
2785                 return ret;
2786             }
2787     }
2788     if (kvm_vm_check_extension(s, KVM_CAP_X86_USER_SPACE_MSR)) {
2789         bool r;
2790 
2791         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_USER_SPACE_MSR, 0,
2792                                 KVM_MSR_EXIT_REASON_FILTER);
2793         if (ret) {
2794             error_report("Could not enable user space MSRs: %s",
2795                          strerror(-ret));
2796             exit(1);
2797         }
2798 
2799         r = kvm_filter_msr(s, MSR_CORE_THREAD_COUNT,
2800                            kvm_rdmsr_core_thread_count, NULL);
2801         if (!r) {
2802             error_report("Could not install MSR_CORE_THREAD_COUNT handler: %s",
2803                          strerror(-ret));
2804             exit(1);
2805         }
2806     }
2807 
2808     return 0;
2809 }
2810 
2811 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2812 {
2813     lhs->selector = rhs->selector;
2814     lhs->base = rhs->base;
2815     lhs->limit = rhs->limit;
2816     lhs->type = 3;
2817     lhs->present = 1;
2818     lhs->dpl = 3;
2819     lhs->db = 0;
2820     lhs->s = 1;
2821     lhs->l = 0;
2822     lhs->g = 0;
2823     lhs->avl = 0;
2824     lhs->unusable = 0;
2825 }
2826 
2827 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2828 {
2829     unsigned flags = rhs->flags;
2830     lhs->selector = rhs->selector;
2831     lhs->base = rhs->base;
2832     lhs->limit = rhs->limit;
2833     lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
2834     lhs->present = (flags & DESC_P_MASK) != 0;
2835     lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
2836     lhs->db = (flags >> DESC_B_SHIFT) & 1;
2837     lhs->s = (flags & DESC_S_MASK) != 0;
2838     lhs->l = (flags >> DESC_L_SHIFT) & 1;
2839     lhs->g = (flags & DESC_G_MASK) != 0;
2840     lhs->avl = (flags & DESC_AVL_MASK) != 0;
2841     lhs->unusable = !lhs->present;
2842     lhs->padding = 0;
2843 }
2844 
2845 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
2846 {
2847     lhs->selector = rhs->selector;
2848     lhs->base = rhs->base;
2849     lhs->limit = rhs->limit;
2850     lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
2851                  ((rhs->present && !rhs->unusable) * DESC_P_MASK) |
2852                  (rhs->dpl << DESC_DPL_SHIFT) |
2853                  (rhs->db << DESC_B_SHIFT) |
2854                  (rhs->s * DESC_S_MASK) |
2855                  (rhs->l << DESC_L_SHIFT) |
2856                  (rhs->g * DESC_G_MASK) |
2857                  (rhs->avl * DESC_AVL_MASK);
2858 }
2859 
2860 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
2861 {
2862     if (set) {
2863         *kvm_reg = *qemu_reg;
2864     } else {
2865         *qemu_reg = *kvm_reg;
2866     }
2867 }
2868 
2869 static int kvm_getput_regs(X86CPU *cpu, int set)
2870 {
2871     CPUX86State *env = &cpu->env;
2872     struct kvm_regs regs;
2873     int ret = 0;
2874 
2875     if (!set) {
2876         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
2877         if (ret < 0) {
2878             return ret;
2879         }
2880     }
2881 
2882     kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
2883     kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
2884     kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
2885     kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
2886     kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
2887     kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
2888     kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
2889     kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
2890 #ifdef TARGET_X86_64
2891     kvm_getput_reg(&regs.r8, &env->regs[8], set);
2892     kvm_getput_reg(&regs.r9, &env->regs[9], set);
2893     kvm_getput_reg(&regs.r10, &env->regs[10], set);
2894     kvm_getput_reg(&regs.r11, &env->regs[11], set);
2895     kvm_getput_reg(&regs.r12, &env->regs[12], set);
2896     kvm_getput_reg(&regs.r13, &env->regs[13], set);
2897     kvm_getput_reg(&regs.r14, &env->regs[14], set);
2898     kvm_getput_reg(&regs.r15, &env->regs[15], set);
2899 #endif
2900 
2901     kvm_getput_reg(&regs.rflags, &env->eflags, set);
2902     kvm_getput_reg(&regs.rip, &env->eip, set);
2903 
2904     if (set) {
2905         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
2906     }
2907 
2908     return ret;
2909 }
2910 
2911 static int kvm_put_xsave(X86CPU *cpu)
2912 {
2913     CPUX86State *env = &cpu->env;
2914     void *xsave = env->xsave_buf;
2915 
2916     x86_cpu_xsave_all_areas(cpu, xsave, env->xsave_buf_len);
2917 
2918     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
2919 }
2920 
2921 static int kvm_put_xcrs(X86CPU *cpu)
2922 {
2923     CPUX86State *env = &cpu->env;
2924     struct kvm_xcrs xcrs = {};
2925 
2926     if (!has_xcrs) {
2927         return 0;
2928     }
2929 
2930     xcrs.nr_xcrs = 1;
2931     xcrs.flags = 0;
2932     xcrs.xcrs[0].xcr = 0;
2933     xcrs.xcrs[0].value = env->xcr0;
2934     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
2935 }
2936 
2937 static int kvm_put_sregs(X86CPU *cpu)
2938 {
2939     CPUX86State *env = &cpu->env;
2940     struct kvm_sregs sregs;
2941 
2942     /*
2943      * The interrupt_bitmap is ignored because KVM_SET_SREGS is
2944      * always followed by KVM_SET_VCPU_EVENTS.
2945      */
2946     memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
2947 
2948     if ((env->eflags & VM_MASK)) {
2949         set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2950         set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2951         set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2952         set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2953         set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2954         set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2955     } else {
2956         set_seg(&sregs.cs, &env->segs[R_CS]);
2957         set_seg(&sregs.ds, &env->segs[R_DS]);
2958         set_seg(&sregs.es, &env->segs[R_ES]);
2959         set_seg(&sregs.fs, &env->segs[R_FS]);
2960         set_seg(&sregs.gs, &env->segs[R_GS]);
2961         set_seg(&sregs.ss, &env->segs[R_SS]);
2962     }
2963 
2964     set_seg(&sregs.tr, &env->tr);
2965     set_seg(&sregs.ldt, &env->ldt);
2966 
2967     sregs.idt.limit = env->idt.limit;
2968     sregs.idt.base = env->idt.base;
2969     memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
2970     sregs.gdt.limit = env->gdt.limit;
2971     sregs.gdt.base = env->gdt.base;
2972     memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
2973 
2974     sregs.cr0 = env->cr[0];
2975     sregs.cr2 = env->cr[2];
2976     sregs.cr3 = env->cr[3];
2977     sregs.cr4 = env->cr[4];
2978 
2979     sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
2980     sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
2981 
2982     sregs.efer = env->efer;
2983 
2984     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
2985 }
2986 
2987 static int kvm_put_sregs2(X86CPU *cpu)
2988 {
2989     CPUX86State *env = &cpu->env;
2990     struct kvm_sregs2 sregs;
2991     int i;
2992 
2993     sregs.flags = 0;
2994 
2995     if ((env->eflags & VM_MASK)) {
2996         set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2997         set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2998         set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2999         set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
3000         set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
3001         set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
3002     } else {
3003         set_seg(&sregs.cs, &env->segs[R_CS]);
3004         set_seg(&sregs.ds, &env->segs[R_DS]);
3005         set_seg(&sregs.es, &env->segs[R_ES]);
3006         set_seg(&sregs.fs, &env->segs[R_FS]);
3007         set_seg(&sregs.gs, &env->segs[R_GS]);
3008         set_seg(&sregs.ss, &env->segs[R_SS]);
3009     }
3010 
3011     set_seg(&sregs.tr, &env->tr);
3012     set_seg(&sregs.ldt, &env->ldt);
3013 
3014     sregs.idt.limit = env->idt.limit;
3015     sregs.idt.base = env->idt.base;
3016     memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
3017     sregs.gdt.limit = env->gdt.limit;
3018     sregs.gdt.base = env->gdt.base;
3019     memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
3020 
3021     sregs.cr0 = env->cr[0];
3022     sregs.cr2 = env->cr[2];
3023     sregs.cr3 = env->cr[3];
3024     sregs.cr4 = env->cr[4];
3025 
3026     sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
3027     sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
3028 
3029     sregs.efer = env->efer;
3030 
3031     if (env->pdptrs_valid) {
3032         for (i = 0; i < 4; i++) {
3033             sregs.pdptrs[i] = env->pdptrs[i];
3034         }
3035         sregs.flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
3036     }
3037 
3038     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS2, &sregs);
3039 }
3040 
3041 
3042 static void kvm_msr_buf_reset(X86CPU *cpu)
3043 {
3044     memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
3045 }
3046 
3047 static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
3048 {
3049     struct kvm_msrs *msrs = cpu->kvm_msr_buf;
3050     void *limit = ((void *)msrs) + MSR_BUF_SIZE;
3051     struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
3052 
3053     assert((void *)(entry + 1) <= limit);
3054 
3055     entry->index = index;
3056     entry->reserved = 0;
3057     entry->data = value;
3058     msrs->nmsrs++;
3059 }
3060 
3061 static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value)
3062 {
3063     kvm_msr_buf_reset(cpu);
3064     kvm_msr_entry_add(cpu, index, value);
3065 
3066     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
3067 }
3068 
3069 static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value)
3070 {
3071     int ret;
3072     struct {
3073         struct kvm_msrs info;
3074         struct kvm_msr_entry entries[1];
3075     } msr_data = {
3076         .info.nmsrs = 1,
3077         .entries[0].index = index,
3078     };
3079 
3080     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
3081     if (ret < 0) {
3082         return ret;
3083     }
3084     assert(ret == 1);
3085     *value = msr_data.entries[0].data;
3086     return ret;
3087 }
3088 void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
3089 {
3090     int ret;
3091 
3092     ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
3093     assert(ret == 1);
3094 }
3095 
3096 static int kvm_put_tscdeadline_msr(X86CPU *cpu)
3097 {
3098     CPUX86State *env = &cpu->env;
3099     int ret;
3100 
3101     if (!has_msr_tsc_deadline) {
3102         return 0;
3103     }
3104 
3105     ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
3106     if (ret < 0) {
3107         return ret;
3108     }
3109 
3110     assert(ret == 1);
3111     return 0;
3112 }
3113 
3114 /*
3115  * Provide a separate write service for the feature control MSR in order to
3116  * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
3117  * before writing any other state because forcibly leaving nested mode
3118  * invalidates the VCPU state.
3119  */
3120 static int kvm_put_msr_feature_control(X86CPU *cpu)
3121 {
3122     int ret;
3123 
3124     if (!has_msr_feature_control) {
3125         return 0;
3126     }
3127 
3128     ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL,
3129                           cpu->env.msr_ia32_feature_control);
3130     if (ret < 0) {
3131         return ret;
3132     }
3133 
3134     assert(ret == 1);
3135     return 0;
3136 }
3137 
3138 static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features)
3139 {
3140     uint32_t default1, can_be_one, can_be_zero;
3141     uint32_t must_be_one;
3142 
3143     switch (index) {
3144     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3145         default1 = 0x00000016;
3146         break;
3147     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3148         default1 = 0x0401e172;
3149         break;
3150     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3151         default1 = 0x000011ff;
3152         break;
3153     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3154         default1 = 0x00036dff;
3155         break;
3156     case MSR_IA32_VMX_PROCBASED_CTLS2:
3157         default1 = 0;
3158         break;
3159     default:
3160         abort();
3161     }
3162 
3163     /* If a feature bit is set, the control can be either set or clear.
3164      * Otherwise the value is limited to either 0 or 1 by default1.
3165      */
3166     can_be_one = features | default1;
3167     can_be_zero = features | ~default1;
3168     must_be_one = ~can_be_zero;
3169 
3170     /*
3171      * Bit 0:31 -> 0 if the control bit can be zero (i.e. 1 if it must be one).
3172      * Bit 32:63 -> 1 if the control bit can be one.
3173      */
3174     return must_be_one | (((uint64_t)can_be_one) << 32);
3175 }
3176 
3177 static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
3178 {
3179     uint64_t kvm_vmx_basic =
3180         kvm_arch_get_supported_msr_feature(kvm_state,
3181                                            MSR_IA32_VMX_BASIC);
3182 
3183     if (!kvm_vmx_basic) {
3184         /* If the kernel doesn't support VMX feature (kvm_intel.nested=0),
3185          * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail.
3186          */
3187         return;
3188     }
3189 
3190     uint64_t kvm_vmx_misc =
3191         kvm_arch_get_supported_msr_feature(kvm_state,
3192                                            MSR_IA32_VMX_MISC);
3193     uint64_t kvm_vmx_ept_vpid =
3194         kvm_arch_get_supported_msr_feature(kvm_state,
3195                                            MSR_IA32_VMX_EPT_VPID_CAP);
3196 
3197     /*
3198      * If the guest is 64-bit, a value of 1 is allowed for the host address
3199      * space size vmexit control.
3200      */
3201     uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM
3202         ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0;
3203 
3204     /*
3205      * Bits 0-30, 32-44 and 50-53 come from the host.  KVM should
3206      * not change them for backwards compatibility.
3207      */
3208     uint64_t fixed_vmx_basic = kvm_vmx_basic &
3209         (MSR_VMX_BASIC_VMCS_REVISION_MASK |
3210          MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK |
3211          MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK);
3212 
3213     /*
3214      * Same for bits 0-4 and 25-27.  Bits 16-24 (CR3 target count) can
3215      * change in the future but are always zero for now, clear them to be
3216      * future proof.  Bits 32-63 in theory could change, though KVM does
3217      * not support dual-monitor treatment and probably never will; mask
3218      * them out as well.
3219      */
3220     uint64_t fixed_vmx_misc = kvm_vmx_misc &
3221         (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK |
3222          MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK);
3223 
3224     /*
3225      * EPT memory types should not change either, so we do not bother
3226      * adding features for them.
3227      */
3228     uint64_t fixed_vmx_ept_mask =
3229             (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ?
3230              MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0);
3231     uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask;
3232 
3233     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
3234                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
3235                                          f[FEAT_VMX_PROCBASED_CTLS]));
3236     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS,
3237                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS,
3238                                          f[FEAT_VMX_PINBASED_CTLS]));
3239     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS,
3240                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS,
3241                                          f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit);
3242     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS,
3243                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS,
3244                                          f[FEAT_VMX_ENTRY_CTLS]));
3245     kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2,
3246                       make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2,
3247                                          f[FEAT_VMX_SECONDARY_CTLS]));
3248     kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP,
3249                       f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid);
3250     kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC,
3251                       f[FEAT_VMX_BASIC] | fixed_vmx_basic);
3252     kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC,
3253                       f[FEAT_VMX_MISC] | fixed_vmx_misc);
3254     if (has_msr_vmx_vmfunc) {
3255         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]);
3256     }
3257 
3258     /*
3259      * Just to be safe, write these with constant values.  The CRn_FIXED1
3260      * MSRs are generated by KVM based on the vCPU's CPUID.
3261      */
3262     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0,
3263                       CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK);
3264     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0,
3265                       CR4_VMXE_MASK);
3266 
3267     if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
3268         /* TSC multiplier (0x2032).  */
3269         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32);
3270     } else {
3271         /* Preemption timer (0x482E).  */
3272         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x2E);
3273     }
3274 }
3275 
3276 static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f)
3277 {
3278     uint64_t kvm_perf_cap =
3279         kvm_arch_get_supported_msr_feature(kvm_state,
3280                                            MSR_IA32_PERF_CAPABILITIES);
3281 
3282     if (kvm_perf_cap) {
3283         kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES,
3284                         kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]);
3285     }
3286 }
3287 
3288 static int kvm_buf_set_msrs(X86CPU *cpu)
3289 {
3290     int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
3291     if (ret < 0) {
3292         return ret;
3293     }
3294 
3295     if (ret < cpu->kvm_msr_buf->nmsrs) {
3296         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
3297         error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
3298                      (uint32_t)e->index, (uint64_t)e->data);
3299     }
3300 
3301     assert(ret == cpu->kvm_msr_buf->nmsrs);
3302     return 0;
3303 }
3304 
3305 static void kvm_init_msrs(X86CPU *cpu)
3306 {
3307     CPUX86State *env = &cpu->env;
3308 
3309     kvm_msr_buf_reset(cpu);
3310     if (has_msr_arch_capabs) {
3311         kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
3312                           env->features[FEAT_ARCH_CAPABILITIES]);
3313     }
3314 
3315     if (has_msr_core_capabs) {
3316         kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
3317                           env->features[FEAT_CORE_CAPABILITY]);
3318     }
3319 
3320     if (has_msr_perf_capabs && cpu->enable_pmu) {
3321         kvm_msr_entry_add_perf(cpu, env->features);
3322     }
3323 
3324     if (has_msr_ucode_rev) {
3325         kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
3326     }
3327 
3328     /*
3329      * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
3330      * all kernels with MSR features should have them.
3331      */
3332     if (kvm_feature_msrs && cpu_has_vmx(env)) {
3333         kvm_msr_entry_add_vmx(cpu, env->features);
3334     }
3335 
3336     assert(kvm_buf_set_msrs(cpu) == 0);
3337 }
3338 
3339 static int kvm_put_msrs(X86CPU *cpu, int level)
3340 {
3341     CPUX86State *env = &cpu->env;
3342     int i;
3343 
3344     kvm_msr_buf_reset(cpu);
3345 
3346     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
3347     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
3348     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
3349     kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
3350     if (has_msr_star) {
3351         kvm_msr_entry_add(cpu, MSR_STAR, env->star);
3352     }
3353     if (has_msr_hsave_pa) {
3354         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
3355     }
3356     if (has_msr_tsc_aux) {
3357         kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
3358     }
3359     if (has_msr_tsc_adjust) {
3360         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
3361     }
3362     if (has_msr_misc_enable) {
3363         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
3364                           env->msr_ia32_misc_enable);
3365     }
3366     if (has_msr_smbase) {
3367         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
3368     }
3369     if (has_msr_smi_count) {
3370         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
3371     }
3372     if (has_msr_pkrs) {
3373         kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs);
3374     }
3375     if (has_msr_bndcfgs) {
3376         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
3377     }
3378     if (has_msr_xss) {
3379         kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
3380     }
3381     if (has_msr_umwait) {
3382         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait);
3383     }
3384     if (has_msr_spec_ctrl) {
3385         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl);
3386     }
3387     if (has_tsc_scale_msr) {
3388         kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr);
3389     }
3390 
3391     if (has_msr_tsx_ctrl) {
3392         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl);
3393     }
3394     if (has_msr_virt_ssbd) {
3395         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
3396     }
3397 
3398 #ifdef TARGET_X86_64
3399     if (lm_capable_kernel) {
3400         kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
3401         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
3402         kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
3403         kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
3404         if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) {
3405             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, env->fred_rsp0);
3406             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, env->fred_rsp1);
3407             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, env->fred_rsp2);
3408             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, env->fred_rsp3);
3409             kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, env->fred_stklvls);
3410             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, env->fred_ssp1);
3411             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, env->fred_ssp2);
3412             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, env->fred_ssp3);
3413             kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, env->fred_config);
3414         }
3415     }
3416 #endif
3417 
3418     /*
3419      * The following MSRs have side effects on the guest or are too heavy
3420      * for normal writeback. Limit them to reset or full state updates.
3421      */
3422     if (level >= KVM_PUT_RESET_STATE) {
3423         kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
3424         kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
3425         kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
3426         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3427             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr);
3428         }
3429         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3430             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
3431         }
3432         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3433             kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
3434         }
3435         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3436             kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
3437         }
3438 
3439         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3440             kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
3441         }
3442 
3443         if (has_architectural_pmu_version > 0) {
3444             if (has_architectural_pmu_version > 1) {
3445                 /* Stop the counter.  */
3446                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3447                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3448             }
3449 
3450             /* Set the counter values.  */
3451             for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3452                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i,
3453                                   env->msr_fixed_counters[i]);
3454             }
3455             for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3456                 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i,
3457                                   env->msr_gp_counters[i]);
3458                 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i,
3459                                   env->msr_gp_evtsel[i]);
3460             }
3461             if (has_architectural_pmu_version > 1) {
3462                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS,
3463                                   env->msr_global_status);
3464                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
3465                                   env->msr_global_ovf_ctrl);
3466 
3467                 /* Now start the PMU.  */
3468                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL,
3469                                   env->msr_fixed_ctr_ctrl);
3470                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL,
3471                                   env->msr_global_ctrl);
3472             }
3473         }
3474         /*
3475          * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
3476          * only sync them to KVM on the first cpu
3477          */
3478         if (current_cpu == first_cpu) {
3479             if (has_msr_hv_hypercall) {
3480                 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID,
3481                                   env->msr_hv_guest_os_id);
3482                 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL,
3483                                   env->msr_hv_hypercall);
3484             }
3485             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3486                 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC,
3487                                   env->msr_hv_tsc);
3488             }
3489             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3490                 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL,
3491                                   env->msr_hv_reenlightenment_control);
3492                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL,
3493                                   env->msr_hv_tsc_emulation_control);
3494                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
3495                                   env->msr_hv_tsc_emulation_status);
3496             }
3497 #ifdef CONFIG_SYNDBG
3498             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG) &&
3499                 has_msr_hv_syndbg_options) {
3500                 kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS,
3501                                   hyperv_syndbg_query_options());
3502             }
3503 #endif
3504         }
3505         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3506             kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
3507                               env->msr_hv_vapic);
3508         }
3509         if (has_msr_hv_crash) {
3510             int j;
3511 
3512             for (j = 0; j < HV_CRASH_PARAMS; j++)
3513                 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j,
3514                                   env->msr_hv_crash_params[j]);
3515 
3516             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY);
3517         }
3518         if (has_msr_hv_runtime) {
3519             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime);
3520         }
3521         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)
3522             && hv_vpindex_settable) {
3523             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX,
3524                               hyperv_vp_index(CPU(cpu)));
3525         }
3526         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3527             int j;
3528 
3529             kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION);
3530 
3531             kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL,
3532                               env->msr_hv_synic_control);
3533             kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP,
3534                               env->msr_hv_synic_evt_page);
3535             kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP,
3536                               env->msr_hv_synic_msg_page);
3537 
3538             for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) {
3539                 kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j,
3540                                   env->msr_hv_synic_sint[j]);
3541             }
3542         }
3543         if (has_msr_hv_stimer) {
3544             int j;
3545 
3546             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) {
3547                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2,
3548                                 env->msr_hv_stimer_config[j]);
3549             }
3550 
3551             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) {
3552                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2,
3553                                 env->msr_hv_stimer_count[j]);
3554             }
3555         }
3556         if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3557             uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);
3558 
3559             kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
3560             kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
3561             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
3562             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
3563             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
3564             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
3565             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
3566             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
3567             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
3568             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
3569             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
3570             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
3571             for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
3572                 /* The CPU GPs if we write to a bit above the physical limit of
3573                  * the host CPU (and KVM emulates that)
3574                  */
3575                 uint64_t mask = env->mtrr_var[i].mask;
3576                 mask &= phys_mask;
3577 
3578                 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
3579                                   env->mtrr_var[i].base);
3580                 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
3581             }
3582         }
3583         if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
3584             int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
3585                                                     0x14, 1, R_EAX) & 0x7;
3586 
3587             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
3588                             env->msr_rtit_ctrl);
3589             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
3590                             env->msr_rtit_status);
3591             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
3592                             env->msr_rtit_output_base);
3593             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
3594                             env->msr_rtit_output_mask);
3595             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
3596                             env->msr_rtit_cr3_match);
3597             for (i = 0; i < addr_num; i++) {
3598                 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
3599                             env->msr_rtit_addrs[i]);
3600             }
3601         }
3602 
3603         if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
3604             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0,
3605                               env->msr_ia32_sgxlepubkeyhash[0]);
3606             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1,
3607                               env->msr_ia32_sgxlepubkeyhash[1]);
3608             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2,
3609                               env->msr_ia32_sgxlepubkeyhash[2]);
3610             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3,
3611                               env->msr_ia32_sgxlepubkeyhash[3]);
3612         }
3613 
3614         if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
3615             kvm_msr_entry_add(cpu, MSR_IA32_XFD,
3616                               env->msr_xfd);
3617             kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR,
3618                               env->msr_xfd_err);
3619         }
3620 
3621         if (kvm_enabled() && cpu->enable_pmu &&
3622             (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) {
3623             uint64_t depth;
3624             int ret;
3625 
3626             /*
3627              * Only migrate Arch LBR states when the host Arch LBR depth
3628              * equals that of source guest's, this is to avoid mismatch
3629              * of guest/host config for the msr hence avoid unexpected
3630              * misbehavior.
3631              */
3632             ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth);
3633 
3634             if (ret == 1 && !!depth && depth == env->msr_lbr_depth) {
3635                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, env->msr_lbr_ctl);
3636                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, env->msr_lbr_depth);
3637 
3638                 for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) {
3639                     if (!env->lbr_records[i].from) {
3640                         continue;
3641                     }
3642                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i,
3643                                       env->lbr_records[i].from);
3644                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i,
3645                                       env->lbr_records[i].to);
3646                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i,
3647                                       env->lbr_records[i].info);
3648                 }
3649             }
3650         }
3651 
3652         /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
3653          *       kvm_put_msr_feature_control. */
3654     }
3655 
3656     if (env->mcg_cap) {
3657         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status);
3658         kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl);
3659         if (has_msr_mcg_ext_ctl) {
3660             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
3661         }
3662         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3663             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]);
3664         }
3665     }
3666 
3667     return kvm_buf_set_msrs(cpu);
3668 }
3669 
3670 
3671 static int kvm_get_xsave(X86CPU *cpu)
3672 {
3673     CPUX86State *env = &cpu->env;
3674     void *xsave = env->xsave_buf;
3675     int type, ret;
3676 
3677     type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE;
3678     ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave);
3679     if (ret < 0) {
3680         return ret;
3681     }
3682     x86_cpu_xrstor_all_areas(cpu, xsave, env->xsave_buf_len);
3683 
3684     return 0;
3685 }
3686 
3687 static int kvm_get_xcrs(X86CPU *cpu)
3688 {
3689     CPUX86State *env = &cpu->env;
3690     int i, ret;
3691     struct kvm_xcrs xcrs;
3692 
3693     if (!has_xcrs) {
3694         return 0;
3695     }
3696 
3697     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
3698     if (ret < 0) {
3699         return ret;
3700     }
3701 
3702     for (i = 0; i < xcrs.nr_xcrs; i++) {
3703         /* Only support xcr0 now */
3704         if (xcrs.xcrs[i].xcr == 0) {
3705             env->xcr0 = xcrs.xcrs[i].value;
3706             break;
3707         }
3708     }
3709     return 0;
3710 }
3711 
3712 static int kvm_get_sregs(X86CPU *cpu)
3713 {
3714     CPUX86State *env = &cpu->env;
3715     struct kvm_sregs sregs;
3716     int ret;
3717 
3718     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
3719     if (ret < 0) {
3720         return ret;
3721     }
3722 
3723     /*
3724      * The interrupt_bitmap is ignored because KVM_GET_SREGS is
3725      * always preceded by KVM_GET_VCPU_EVENTS.
3726      */
3727 
3728     get_seg(&env->segs[R_CS], &sregs.cs);
3729     get_seg(&env->segs[R_DS], &sregs.ds);
3730     get_seg(&env->segs[R_ES], &sregs.es);
3731     get_seg(&env->segs[R_FS], &sregs.fs);
3732     get_seg(&env->segs[R_GS], &sregs.gs);
3733     get_seg(&env->segs[R_SS], &sregs.ss);
3734 
3735     get_seg(&env->tr, &sregs.tr);
3736     get_seg(&env->ldt, &sregs.ldt);
3737 
3738     env->idt.limit = sregs.idt.limit;
3739     env->idt.base = sregs.idt.base;
3740     env->gdt.limit = sregs.gdt.limit;
3741     env->gdt.base = sregs.gdt.base;
3742 
3743     env->cr[0] = sregs.cr0;
3744     env->cr[2] = sregs.cr2;
3745     env->cr[3] = sregs.cr3;
3746     env->cr[4] = sregs.cr4;
3747 
3748     env->efer = sregs.efer;
3749     if (sev_es_enabled() && env->efer & MSR_EFER_LME &&
3750         env->cr[0] & CR0_PG_MASK) {
3751         env->efer |= MSR_EFER_LMA;
3752     }
3753 
3754     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3755     x86_update_hflags(env);
3756 
3757     return 0;
3758 }
3759 
3760 static int kvm_get_sregs2(X86CPU *cpu)
3761 {
3762     CPUX86State *env = &cpu->env;
3763     struct kvm_sregs2 sregs;
3764     int i, ret;
3765 
3766     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
3767     if (ret < 0) {
3768         return ret;
3769     }
3770 
3771     get_seg(&env->segs[R_CS], &sregs.cs);
3772     get_seg(&env->segs[R_DS], &sregs.ds);
3773     get_seg(&env->segs[R_ES], &sregs.es);
3774     get_seg(&env->segs[R_FS], &sregs.fs);
3775     get_seg(&env->segs[R_GS], &sregs.gs);
3776     get_seg(&env->segs[R_SS], &sregs.ss);
3777 
3778     get_seg(&env->tr, &sregs.tr);
3779     get_seg(&env->ldt, &sregs.ldt);
3780 
3781     env->idt.limit = sregs.idt.limit;
3782     env->idt.base = sregs.idt.base;
3783     env->gdt.limit = sregs.gdt.limit;
3784     env->gdt.base = sregs.gdt.base;
3785 
3786     env->cr[0] = sregs.cr0;
3787     env->cr[2] = sregs.cr2;
3788     env->cr[3] = sregs.cr3;
3789     env->cr[4] = sregs.cr4;
3790 
3791     env->efer = sregs.efer;
3792     if (sev_es_enabled() && env->efer & MSR_EFER_LME &&
3793         env->cr[0] & CR0_PG_MASK) {
3794         env->efer |= MSR_EFER_LMA;
3795     }
3796 
3797     env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
3798 
3799     if (env->pdptrs_valid) {
3800         for (i = 0; i < 4; i++) {
3801             env->pdptrs[i] = sregs.pdptrs[i];
3802         }
3803     }
3804 
3805     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3806     x86_update_hflags(env);
3807 
3808     return 0;
3809 }
3810 
3811 static int kvm_get_msrs(X86CPU *cpu)
3812 {
3813     CPUX86State *env = &cpu->env;
3814     struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
3815     int ret, i;
3816     uint64_t mtrr_top_bits;
3817 
3818     kvm_msr_buf_reset(cpu);
3819 
3820     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0);
3821     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0);
3822     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0);
3823     kvm_msr_entry_add(cpu, MSR_PAT, 0);
3824     if (has_msr_star) {
3825         kvm_msr_entry_add(cpu, MSR_STAR, 0);
3826     }
3827     if (has_msr_hsave_pa) {
3828         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0);
3829     }
3830     if (has_msr_tsc_aux) {
3831         kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0);
3832     }
3833     if (has_msr_tsc_adjust) {
3834         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0);
3835     }
3836     if (has_msr_tsc_deadline) {
3837         kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0);
3838     }
3839     if (has_msr_misc_enable) {
3840         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0);
3841     }
3842     if (has_msr_smbase) {
3843         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0);
3844     }
3845     if (has_msr_smi_count) {
3846         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0);
3847     }
3848     if (has_msr_feature_control) {
3849         kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
3850     }
3851     if (has_msr_pkrs) {
3852         kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0);
3853     }
3854     if (has_msr_bndcfgs) {
3855         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
3856     }
3857     if (has_msr_xss) {
3858         kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0);
3859     }
3860     if (has_msr_umwait) {
3861         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0);
3862     }
3863     if (has_msr_spec_ctrl) {
3864         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0);
3865     }
3866     if (has_tsc_scale_msr) {
3867         kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, 0);
3868     }
3869 
3870     if (has_msr_tsx_ctrl) {
3871         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0);
3872     }
3873     if (has_msr_virt_ssbd) {
3874         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
3875     }
3876     if (!env->tsc_valid) {
3877         kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
3878         env->tsc_valid = !runstate_is_running();
3879     }
3880 
3881 #ifdef TARGET_X86_64
3882     if (lm_capable_kernel) {
3883         kvm_msr_entry_add(cpu, MSR_CSTAR, 0);
3884         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
3885         kvm_msr_entry_add(cpu, MSR_FMASK, 0);
3886         kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
3887         if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) {
3888             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, 0);
3889             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, 0);
3890             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, 0);
3891             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, 0);
3892             kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, 0);
3893             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, 0);
3894             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, 0);
3895             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, 0);
3896             kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, 0);
3897         }
3898     }
3899 #endif
3900     kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
3901     kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
3902     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3903         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0);
3904     }
3905     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3906         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
3907     }
3908     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3909         kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
3910     }
3911     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3912         kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
3913     }
3914     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3915         kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
3916     }
3917     if (has_architectural_pmu_version > 0) {
3918         if (has_architectural_pmu_version > 1) {
3919             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3920             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3921             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0);
3922             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0);
3923         }
3924         for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3925             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0);
3926         }
3927         for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3928             kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0);
3929             kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0);
3930         }
3931     }
3932 
3933     if (env->mcg_cap) {
3934         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
3935         kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
3936         if (has_msr_mcg_ext_ctl) {
3937             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0);
3938         }
3939         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3940             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0);
3941         }
3942     }
3943 
3944     if (has_msr_hv_hypercall) {
3945         kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0);
3946         kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0);
3947     }
3948     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3949         kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0);
3950     }
3951     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3952         kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0);
3953     }
3954     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3955         kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
3956         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
3957         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0);
3958     }
3959     if (has_msr_hv_syndbg_options) {
3960         kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, 0);
3961     }
3962     if (has_msr_hv_crash) {
3963         int j;
3964 
3965         for (j = 0; j < HV_CRASH_PARAMS; j++) {
3966             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0);
3967         }
3968     }
3969     if (has_msr_hv_runtime) {
3970         kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0);
3971     }
3972     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3973         uint32_t msr;
3974 
3975         kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0);
3976         kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0);
3977         kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0);
3978         for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) {
3979             kvm_msr_entry_add(cpu, msr, 0);
3980         }
3981     }
3982     if (has_msr_hv_stimer) {
3983         uint32_t msr;
3984 
3985         for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT;
3986              msr++) {
3987             kvm_msr_entry_add(cpu, msr, 0);
3988         }
3989     }
3990     if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3991         kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0);
3992         kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0);
3993         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0);
3994         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0);
3995         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0);
3996         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0);
3997         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0);
3998         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0);
3999         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0);
4000         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0);
4001         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0);
4002         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0);
4003         for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
4004             kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0);
4005             kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0);
4006         }
4007     }
4008 
4009     if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
4010         int addr_num =
4011             kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7;
4012 
4013         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0);
4014         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0);
4015         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0);
4016         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0);
4017         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0);
4018         for (i = 0; i < addr_num; i++) {
4019             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0);
4020         }
4021     }
4022 
4023     if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
4024         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 0);
4025         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 0);
4026         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 0);
4027         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0);
4028     }
4029 
4030     if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
4031         kvm_msr_entry_add(cpu, MSR_IA32_XFD, 0);
4032         kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 0);
4033     }
4034 
4035     if (kvm_enabled() && cpu->enable_pmu &&
4036         (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) {
4037         uint64_t depth;
4038 
4039         ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth);
4040         if (ret == 1 && depth == ARCH_LBR_NR_ENTRIES) {
4041             kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, 0);
4042             kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, 0);
4043 
4044             for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) {
4045                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i, 0);
4046                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i, 0);
4047                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i, 0);
4048             }
4049         }
4050     }
4051 
4052     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
4053     if (ret < 0) {
4054         return ret;
4055     }
4056 
4057     if (ret < cpu->kvm_msr_buf->nmsrs) {
4058         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
4059         error_report("error: failed to get MSR 0x%" PRIx32,
4060                      (uint32_t)e->index);
4061     }
4062 
4063     assert(ret == cpu->kvm_msr_buf->nmsrs);
4064     /*
4065      * MTRR masks: Each mask consists of 5 parts
4066      * a  10..0: must be zero
4067      * b  11   : valid bit
4068      * c n-1.12: actual mask bits
4069      * d  51..n: reserved must be zero
4070      * e  63.52: reserved must be zero
4071      *
4072      * 'n' is the number of physical bits supported by the CPU and is
4073      * apparently always <= 52.   We know our 'n' but don't know what
4074      * the destinations 'n' is; it might be smaller, in which case
4075      * it masks (c) on loading. It might be larger, in which case
     * we fill 'd' so that d..c is consistent irrespective of the 'n'
4077      * we're migrating to.
4078      */
4079 
4080     if (cpu->fill_mtrr_mask) {
4081         QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52);
4082         assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS);
4083         mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits);
4084     } else {
4085         mtrr_top_bits = 0;
4086     }
4087 
4088     for (i = 0; i < ret; i++) {
4089         uint32_t index = msrs[i].index;
4090         switch (index) {
4091         case MSR_IA32_SYSENTER_CS:
4092             env->sysenter_cs = msrs[i].data;
4093             break;
4094         case MSR_IA32_SYSENTER_ESP:
4095             env->sysenter_esp = msrs[i].data;
4096             break;
4097         case MSR_IA32_SYSENTER_EIP:
4098             env->sysenter_eip = msrs[i].data;
4099             break;
4100         case MSR_PAT:
4101             env->pat = msrs[i].data;
4102             break;
4103         case MSR_STAR:
4104             env->star = msrs[i].data;
4105             break;
4106 #ifdef TARGET_X86_64
4107         case MSR_CSTAR:
4108             env->cstar = msrs[i].data;
4109             break;
4110         case MSR_KERNELGSBASE:
4111             env->kernelgsbase = msrs[i].data;
4112             break;
4113         case MSR_FMASK:
4114             env->fmask = msrs[i].data;
4115             break;
4116         case MSR_LSTAR:
4117             env->lstar = msrs[i].data;
4118             break;
4119         case MSR_IA32_FRED_RSP0:
4120             env->fred_rsp0 = msrs[i].data;
4121             break;
4122         case MSR_IA32_FRED_RSP1:
4123             env->fred_rsp1 = msrs[i].data;
4124             break;
4125         case MSR_IA32_FRED_RSP2:
4126             env->fred_rsp2 = msrs[i].data;
4127             break;
4128         case MSR_IA32_FRED_RSP3:
4129             env->fred_rsp3 = msrs[i].data;
4130             break;
4131         case MSR_IA32_FRED_STKLVLS:
4132             env->fred_stklvls = msrs[i].data;
4133             break;
4134         case MSR_IA32_FRED_SSP1:
4135             env->fred_ssp1 = msrs[i].data;
4136             break;
4137         case MSR_IA32_FRED_SSP2:
4138             env->fred_ssp2 = msrs[i].data;
4139             break;
4140         case MSR_IA32_FRED_SSP3:
4141             env->fred_ssp3 = msrs[i].data;
4142             break;
4143         case MSR_IA32_FRED_CONFIG:
4144             env->fred_config = msrs[i].data;
4145             break;
4146 #endif
4147         case MSR_IA32_TSC:
4148             env->tsc = msrs[i].data;
4149             break;
4150         case MSR_TSC_AUX:
4151             env->tsc_aux = msrs[i].data;
4152             break;
4153         case MSR_TSC_ADJUST:
4154             env->tsc_adjust = msrs[i].data;
4155             break;
4156         case MSR_IA32_TSCDEADLINE:
4157             env->tsc_deadline = msrs[i].data;
4158             break;
4159         case MSR_VM_HSAVE_PA:
4160             env->vm_hsave = msrs[i].data;
4161             break;
4162         case MSR_KVM_SYSTEM_TIME:
4163             env->system_time_msr = msrs[i].data;
4164             break;
4165         case MSR_KVM_WALL_CLOCK:
4166             env->wall_clock_msr = msrs[i].data;
4167             break;
4168         case MSR_MCG_STATUS:
4169             env->mcg_status = msrs[i].data;
4170             break;
4171         case MSR_MCG_CTL:
4172             env->mcg_ctl = msrs[i].data;
4173             break;
4174         case MSR_MCG_EXT_CTL:
4175             env->mcg_ext_ctl = msrs[i].data;
4176             break;
4177         case MSR_IA32_MISC_ENABLE:
4178             env->msr_ia32_misc_enable = msrs[i].data;
4179             break;
4180         case MSR_IA32_SMBASE:
4181             env->smbase = msrs[i].data;
4182             break;
4183         case MSR_SMI_COUNT:
4184             env->msr_smi_count = msrs[i].data;
4185             break;
4186         case MSR_IA32_FEATURE_CONTROL:
4187             env->msr_ia32_feature_control = msrs[i].data;
4188             break;
4189         case MSR_IA32_BNDCFGS:
4190             env->msr_bndcfgs = msrs[i].data;
4191             break;
4192         case MSR_IA32_XSS:
4193             env->xss = msrs[i].data;
4194             break;
4195         case MSR_IA32_UMWAIT_CONTROL:
4196             env->umwait = msrs[i].data;
4197             break;
4198         case MSR_IA32_PKRS:
4199             env->pkrs = msrs[i].data;
4200             break;
4201         default:
4202             if (msrs[i].index >= MSR_MC0_CTL &&
4203                 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
4204                 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
4205             }
4206             break;
4207         case MSR_KVM_ASYNC_PF_EN:
4208             env->async_pf_en_msr = msrs[i].data;
4209             break;
4210         case MSR_KVM_ASYNC_PF_INT:
4211             env->async_pf_int_msr = msrs[i].data;
4212             break;
4213         case MSR_KVM_PV_EOI_EN:
4214             env->pv_eoi_en_msr = msrs[i].data;
4215             break;
4216         case MSR_KVM_STEAL_TIME:
4217             env->steal_time_msr = msrs[i].data;
4218             break;
4219         case MSR_KVM_POLL_CONTROL: {
4220             env->poll_control_msr = msrs[i].data;
4221             break;
4222         }
4223         case MSR_CORE_PERF_FIXED_CTR_CTRL:
4224             env->msr_fixed_ctr_ctrl = msrs[i].data;
4225             break;
4226         case MSR_CORE_PERF_GLOBAL_CTRL:
4227             env->msr_global_ctrl = msrs[i].data;
4228             break;
4229         case MSR_CORE_PERF_GLOBAL_STATUS:
4230             env->msr_global_status = msrs[i].data;
4231             break;
4232         case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
4233             env->msr_global_ovf_ctrl = msrs[i].data;
4234             break;
4235         case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
4236             env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
4237             break;
4238         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
4239             env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
4240             break;
4241         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
4242             env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
4243             break;
4244         case HV_X64_MSR_HYPERCALL:
4245             env->msr_hv_hypercall = msrs[i].data;
4246             break;
4247         case HV_X64_MSR_GUEST_OS_ID:
4248             env->msr_hv_guest_os_id = msrs[i].data;
4249             break;
4250         case HV_X64_MSR_APIC_ASSIST_PAGE:
4251             env->msr_hv_vapic = msrs[i].data;
4252             break;
4253         case HV_X64_MSR_REFERENCE_TSC:
4254             env->msr_hv_tsc = msrs[i].data;
4255             break;
4256         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
4257             env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data;
4258             break;
4259         case HV_X64_MSR_VP_RUNTIME:
4260             env->msr_hv_runtime = msrs[i].data;
4261             break;
4262         case HV_X64_MSR_SCONTROL:
4263             env->msr_hv_synic_control = msrs[i].data;
4264             break;
4265         case HV_X64_MSR_SIEFP:
4266             env->msr_hv_synic_evt_page = msrs[i].data;
4267             break;
4268         case HV_X64_MSR_SIMP:
4269             env->msr_hv_synic_msg_page = msrs[i].data;
4270             break;
4271         case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
4272             env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data;
4273             break;
4274         case HV_X64_MSR_STIMER0_CONFIG:
4275         case HV_X64_MSR_STIMER1_CONFIG:
4276         case HV_X64_MSR_STIMER2_CONFIG:
4277         case HV_X64_MSR_STIMER3_CONFIG:
4278             env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] =
4279                                 msrs[i].data;
4280             break;
4281         case HV_X64_MSR_STIMER0_COUNT:
4282         case HV_X64_MSR_STIMER1_COUNT:
4283         case HV_X64_MSR_STIMER2_COUNT:
4284         case HV_X64_MSR_STIMER3_COUNT:
4285             env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] =
4286                                 msrs[i].data;
4287             break;
4288         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
4289             env->msr_hv_reenlightenment_control = msrs[i].data;
4290             break;
4291         case HV_X64_MSR_TSC_EMULATION_CONTROL:
4292             env->msr_hv_tsc_emulation_control = msrs[i].data;
4293             break;
4294         case HV_X64_MSR_TSC_EMULATION_STATUS:
4295             env->msr_hv_tsc_emulation_status = msrs[i].data;
4296             break;
4297         case HV_X64_MSR_SYNDBG_OPTIONS:
4298             env->msr_hv_syndbg_options = msrs[i].data;
4299             break;
4300         case MSR_MTRRdefType:
4301             env->mtrr_deftype = msrs[i].data;
4302             break;
4303         case MSR_MTRRfix64K_00000:
4304             env->mtrr_fixed[0] = msrs[i].data;
4305             break;
4306         case MSR_MTRRfix16K_80000:
4307             env->mtrr_fixed[1] = msrs[i].data;
4308             break;
4309         case MSR_MTRRfix16K_A0000:
4310             env->mtrr_fixed[2] = msrs[i].data;
4311             break;
4312         case MSR_MTRRfix4K_C0000:
4313             env->mtrr_fixed[3] = msrs[i].data;
4314             break;
4315         case MSR_MTRRfix4K_C8000:
4316             env->mtrr_fixed[4] = msrs[i].data;
4317             break;
4318         case MSR_MTRRfix4K_D0000:
4319             env->mtrr_fixed[5] = msrs[i].data;
4320             break;
4321         case MSR_MTRRfix4K_D8000:
4322             env->mtrr_fixed[6] = msrs[i].data;
4323             break;
4324         case MSR_MTRRfix4K_E0000:
4325             env->mtrr_fixed[7] = msrs[i].data;
4326             break;
4327         case MSR_MTRRfix4K_E8000:
4328             env->mtrr_fixed[8] = msrs[i].data;
4329             break;
4330         case MSR_MTRRfix4K_F0000:
4331             env->mtrr_fixed[9] = msrs[i].data;
4332             break;
4333         case MSR_MTRRfix4K_F8000:
4334             env->mtrr_fixed[10] = msrs[i].data;
4335             break;
4336         case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
4337             if (index & 1) {
4338                 env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
4339                                                                mtrr_top_bits;
4340             } else {
4341                 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
4342             }
4343             break;
4344         case MSR_IA32_SPEC_CTRL:
4345             env->spec_ctrl = msrs[i].data;
4346             break;
4347         case MSR_AMD64_TSC_RATIO:
4348             env->amd_tsc_scale_msr = msrs[i].data;
4349             break;
4350         case MSR_IA32_TSX_CTRL:
4351             env->tsx_ctrl = msrs[i].data;
4352             break;
4353         case MSR_VIRT_SSBD:
4354             env->virt_ssbd = msrs[i].data;
4355             break;
4356         case MSR_IA32_RTIT_CTL:
4357             env->msr_rtit_ctrl = msrs[i].data;
4358             break;
4359         case MSR_IA32_RTIT_STATUS:
4360             env->msr_rtit_status = msrs[i].data;
4361             break;
4362         case MSR_IA32_RTIT_OUTPUT_BASE:
4363             env->msr_rtit_output_base = msrs[i].data;
4364             break;
4365         case MSR_IA32_RTIT_OUTPUT_MASK:
4366             env->msr_rtit_output_mask = msrs[i].data;
4367             break;
4368         case MSR_IA32_RTIT_CR3_MATCH:
4369             env->msr_rtit_cr3_match = msrs[i].data;
4370             break;
4371         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
4372             env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data;
4373             break;
4374         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
4375             env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] =
4376                            msrs[i].data;
4377             break;
4378         case MSR_IA32_XFD:
4379             env->msr_xfd = msrs[i].data;
4380             break;
4381         case MSR_IA32_XFD_ERR:
4382             env->msr_xfd_err = msrs[i].data;
4383             break;
4384         case MSR_ARCH_LBR_CTL:
4385             env->msr_lbr_ctl = msrs[i].data;
4386             break;
4387         case MSR_ARCH_LBR_DEPTH:
4388             env->msr_lbr_depth = msrs[i].data;
4389             break;
4390         case MSR_ARCH_LBR_FROM_0 ... MSR_ARCH_LBR_FROM_0 + 31:
4391             env->lbr_records[index - MSR_ARCH_LBR_FROM_0].from = msrs[i].data;
4392             break;
4393         case MSR_ARCH_LBR_TO_0 ... MSR_ARCH_LBR_TO_0 + 31:
4394             env->lbr_records[index - MSR_ARCH_LBR_TO_0].to = msrs[i].data;
4395             break;
4396         case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31:
4397             env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data;
4398             break;
4399         }
4400     }
4401 
4402     return 0;
4403 }
4404 
4405 static int kvm_put_mp_state(X86CPU *cpu)
4406 {
4407     struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
4408 
4409     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
4410 }
4411 
4412 static int kvm_get_mp_state(X86CPU *cpu)
4413 {
4414     CPUState *cs = CPU(cpu);
4415     CPUX86State *env = &cpu->env;
4416     struct kvm_mp_state mp_state;
4417     int ret;
4418 
4419     ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
4420     if (ret < 0) {
4421         return ret;
4422     }
4423     env->mp_state = mp_state.mp_state;
4424     if (kvm_irqchip_in_kernel()) {
4425         cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
4426     }
4427     return 0;
4428 }
4429 
4430 static int kvm_get_apic(X86CPU *cpu)
4431 {
4432     DeviceState *apic = cpu->apic_state;
4433     struct kvm_lapic_state kapic;
4434     int ret;
4435 
4436     if (apic && kvm_irqchip_in_kernel()) {
4437         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
4438         if (ret < 0) {
4439             return ret;
4440         }
4441 
4442         kvm_get_apic_state(apic, &kapic);
4443     }
4444     return 0;
4445 }
4446 
4447 static int kvm_put_vcpu_events(X86CPU *cpu, int level)
4448 {
4449     CPUState *cs = CPU(cpu);
4450     CPUX86State *env = &cpu->env;
4451     struct kvm_vcpu_events events = {};
4452 
4453     events.flags = 0;
4454 
4455     if (has_exception_payload) {
4456         events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4457         events.exception.pending = env->exception_pending;
4458         events.exception_has_payload = env->exception_has_payload;
4459         events.exception_payload = env->exception_payload;
4460     }
4461     events.exception.nr = env->exception_nr;
4462     events.exception.injected = env->exception_injected;
4463     events.exception.has_error_code = env->has_error_code;
4464     events.exception.error_code = env->error_code;
4465 
4466     events.interrupt.injected = (env->interrupt_injected >= 0);
4467     events.interrupt.nr = env->interrupt_injected;
4468     events.interrupt.soft = env->soft_interrupt;
4469 
4470     events.nmi.injected = env->nmi_injected;
4471     events.nmi.pending = env->nmi_pending;
4472     events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
4473 
4474     events.sipi_vector = env->sipi_vector;
4475 
4476     if (has_msr_smbase) {
4477         events.flags |= KVM_VCPUEVENT_VALID_SMM;
4478         events.smi.smm = !!(env->hflags & HF_SMM_MASK);
4479         events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK);
4480         if (kvm_irqchip_in_kernel()) {
4481             /* As soon as these are moved to the kernel, remove them
4482              * from cs->interrupt_request.
4483              */
4484             events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
4485             events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
4486             cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
4487         } else {
4488             /* Keep these in cs->interrupt_request.  */
4489             events.smi.pending = 0;
4490             events.smi.latched_init = 0;
4491         }
4492     }
4493 
4494     if (level >= KVM_PUT_RESET_STATE) {
4495         events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
4496         if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
4497             events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR;
4498         }
4499     }
4500 
4501     if (has_triple_fault_event) {
4502         events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
4503         events.triple_fault.pending = env->triple_fault_pending;
4504     }
4505 
4506     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
4507 }
4508 
4509 static int kvm_get_vcpu_events(X86CPU *cpu)
4510 {
4511     CPUX86State *env = &cpu->env;
4512     struct kvm_vcpu_events events;
4513     int ret;
4514 
4515     memset(&events, 0, sizeof(events));
4516     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
4517     if (ret < 0) {
4518        return ret;
4519     }
4520 
4521     if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4522         env->exception_pending = events.exception.pending;
4523         env->exception_has_payload = events.exception_has_payload;
4524         env->exception_payload = events.exception_payload;
4525     } else {
4526         env->exception_pending = 0;
4527         env->exception_has_payload = false;
4528     }
4529     env->exception_injected = events.exception.injected;
4530     env->exception_nr =
4531         (env->exception_pending || env->exception_injected) ?
4532         events.exception.nr : -1;
4533     env->has_error_code = events.exception.has_error_code;
4534     env->error_code = events.exception.error_code;
4535 
4536     env->interrupt_injected =
4537         events.interrupt.injected ? events.interrupt.nr : -1;
4538     env->soft_interrupt = events.interrupt.soft;
4539 
4540     env->nmi_injected = events.nmi.injected;
4541     env->nmi_pending = events.nmi.pending;
4542     if (events.nmi.masked) {
4543         env->hflags2 |= HF2_NMI_MASK;
4544     } else {
4545         env->hflags2 &= ~HF2_NMI_MASK;
4546     }
4547 
4548     if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
4549         if (events.smi.smm) {
4550             env->hflags |= HF_SMM_MASK;
4551         } else {
4552             env->hflags &= ~HF_SMM_MASK;
4553         }
4554         if (events.smi.pending) {
4555             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
4556         } else {
4557             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
4558         }
4559         if (events.smi.smm_inside_nmi) {
4560             env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
4561         } else {
4562             env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
4563         }
4564         if (events.smi.latched_init) {
4565             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4566         } else {
4567             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4568         }
4569     }
4570 
4571     if (events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
4572         env->triple_fault_pending = events.triple_fault.pending;
4573     }
4574 
4575     env->sipi_vector = events.sipi_vector;
4576 
4577     return 0;
4578 }
4579 
4580 static int kvm_put_debugregs(X86CPU *cpu)
4581 {
4582     CPUX86State *env = &cpu->env;
4583     struct kvm_debugregs dbgregs;
4584     int i;
4585 
4586     memset(&dbgregs, 0, sizeof(dbgregs));
4587     for (i = 0; i < 4; i++) {
4588         dbgregs.db[i] = env->dr[i];
4589     }
4590     dbgregs.dr6 = env->dr[6];
4591     dbgregs.dr7 = env->dr[7];
4592     dbgregs.flags = 0;
4593 
4594     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
4595 }
4596 
4597 static int kvm_get_debugregs(X86CPU *cpu)
4598 {
4599     CPUX86State *env = &cpu->env;
4600     struct kvm_debugregs dbgregs;
4601     int i, ret;
4602 
4603     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
4604     if (ret < 0) {
4605         return ret;
4606     }
4607     for (i = 0; i < 4; i++) {
4608         env->dr[i] = dbgregs.db[i];
4609     }
4610     env->dr[4] = env->dr[6] = dbgregs.dr6;
4611     env->dr[5] = env->dr[7] = dbgregs.dr7;
4612 
4613     return 0;
4614 }
4615 
4616 static int kvm_put_nested_state(X86CPU *cpu)
4617 {
4618     CPUX86State *env = &cpu->env;
4619     int max_nested_state_len = kvm_max_nested_state_length();
4620 
4621     if (!env->nested_state) {
4622         return 0;
4623     }
4624 
4625     /*
4626      * Copy flags that are affected by reset from env->hflags and env->hflags2.
4627      */
4628     if (env->hflags & HF_GUEST_MASK) {
4629         env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
4630     } else {
4631         env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
4632     }
4633 
4634     /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */
4635     if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) {
4636         env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
4637     } else {
4638         env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
4639     }
4640 
4641     assert(env->nested_state->size <= max_nested_state_len);
4642     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
4643 }
4644 
4645 static int kvm_get_nested_state(X86CPU *cpu)
4646 {
4647     CPUX86State *env = &cpu->env;
4648     int max_nested_state_len = kvm_max_nested_state_length();
4649     int ret;
4650 
4651     if (!env->nested_state) {
4652         return 0;
4653     }
4654 
4655     /*
4656      * It is possible that migration restored a smaller size into
4657      * nested_state->hdr.size than what our kernel support.
4658      * We preserve migration origin nested_state->hdr.size for
4659      * call to KVM_SET_NESTED_STATE but wish that our next call
4660      * to KVM_GET_NESTED_STATE will use max size our kernel support.
4661      */
4662     env->nested_state->size = max_nested_state_len;
4663 
4664     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
4665     if (ret < 0) {
4666         return ret;
4667     }
4668 
4669     /*
4670      * Copy flags that are affected by reset to env->hflags and env->hflags2.
4671      */
4672     if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
4673         env->hflags |= HF_GUEST_MASK;
4674     } else {
4675         env->hflags &= ~HF_GUEST_MASK;
4676     }
4677 
4678     /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */
4679     if (cpu_has_svm(env)) {
4680         if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) {
4681             env->hflags2 |= HF2_GIF_MASK;
4682         } else {
4683             env->hflags2 &= ~HF2_GIF_MASK;
4684         }
4685     }
4686 
4687     return ret;
4688 }
4689 
/*
 * Write QEMU's cached x86 vCPU state into KVM.
 *
 * @cpu:   the vCPU; it must be stopped or be the calling thread's own vCPU
 *         (asserted below).
 * @level: one of the KVM_PUT_* levels.  Steps guarded by
 *         "level >= KVM_PUT_RESET_STATE" or "level == KVM_PUT_FULL_STATE"
 *         are skipped for lower levels.
 *
 * The individual put helpers are called in a fixed order (see the comments
 * on each step); the first one that fails aborts the sequence.
 *
 * Returns 0 on success, or the negative value returned by the failing helper.
 */
int kvm_arch_put_registers(CPUState *cpu, int level)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    int ret;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Put MSR_IA32_FEATURE_CONTROL first, this ensures the VM gets out of VMX
     * root operation upon vCPU reset. kvm_put_msr_feature_control() should also
     * precede kvm_put_nested_state() when 'real' nested state is set.
     */
    if (level >= KVM_PUT_RESET_STATE) {
        ret = kvm_put_msr_feature_control(x86_cpu);
        if (ret < 0) {
            return ret;
        }
    }

    /* must be before kvm_put_nested_state so that EFER.SVME is set */
    ret = has_sregs2 ? kvm_put_sregs2(x86_cpu) : kvm_put_sregs(x86_cpu);
    if (ret < 0) {
        return ret;
    }

    if (level >= KVM_PUT_RESET_STATE) {
        ret = kvm_put_nested_state(x86_cpu);
        if (ret < 0) {
            return ret;
        }
    }

    if (level == KVM_PUT_FULL_STATE) {
        /* We don't check for kvm_arch_set_tsc_khz() errors here,
         * because TSC frequency mismatch shouldn't abort migration,
         * unless the user explicitly asked for a more strict TSC
         * setting (e.g. using an explicit "tsc-freq" option).
         */
        kvm_arch_set_tsc_khz(cpu);
    }

#ifdef CONFIG_XEN_EMU
    /* Xen emulation state is only written on a full-state put. */
    if (xen_mode == XEN_EMULATE && level == KVM_PUT_FULL_STATE) {
        ret = kvm_put_xen_state(cpu);
        if (ret < 0) {
            return ret;
        }
    }
#endif

    /* Second argument 1 selects the "put" direction of kvm_getput_regs(). */
    ret = kvm_getput_regs(x86_cpu, 1);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_xsave(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_xcrs(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_msrs(x86_cpu, level);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_vcpu_events(x86_cpu, level);
    if (ret < 0) {
        return ret;
    }
    /* The MP state is only written for reset-level puts and above. */
    if (level >= KVM_PUT_RESET_STATE) {
        ret = kvm_put_mp_state(x86_cpu);
        if (ret < 0) {
            return ret;
        }
    }

    ret = kvm_put_tscdeadline_msr(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_debugregs(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    return 0;
}
4777 
/*
 * Read the x86 vCPU state from KVM into QEMU's cached copy.
 *
 * @cs: the vCPU; it must be stopped or be the calling thread's own vCPU
 *      (asserted below).
 *
 * The get helpers run in a fixed order and the first failure stops the
 * sequence.  cpu_sync_bndcs_hflags() is called on the way out on both the
 * success and the failure path.
 *
 * Returns 0 on success, or the negative value returned by the failing helper.
 */
int kvm_arch_get_registers(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    int ret;

    assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));

    ret = kvm_get_vcpu_events(cpu);
    if (ret < 0) {
        goto out;
    }
    /*
     * KVM_GET_MPSTATE can modify CS and RIP, call it before
     * KVM_GET_REGS and KVM_GET_SREGS.
     */
    ret = kvm_get_mp_state(cpu);
    if (ret < 0) {
        goto out;
    }
    /* Second argument 0 selects the "get" direction of kvm_getput_regs(). */
    ret = kvm_getput_regs(cpu, 0);
    if (ret < 0) {
        goto out;
    }
    ret = kvm_get_xsave(cpu);
    if (ret < 0) {
        goto out;
    }
    ret = kvm_get_xcrs(cpu);
    if (ret < 0) {
        goto out;
    }
    ret = has_sregs2 ? kvm_get_sregs2(cpu) : kvm_get_sregs(cpu);
    if (ret < 0) {
        goto out;
    }
    ret = kvm_get_msrs(cpu);
    if (ret < 0) {
        goto out;
    }
    ret = kvm_get_apic(cpu);
    if (ret < 0) {
        goto out;
    }
    ret = kvm_get_debugregs(cpu);
    if (ret < 0) {
        goto out;
    }
    ret = kvm_get_nested_state(cpu);
    if (ret < 0) {
        goto out;
    }
#ifdef CONFIG_XEN_EMU
    if (xen_mode == XEN_EMULATE) {
        ret = kvm_get_xen_state(cs);
        if (ret < 0) {
            goto out;
        }
    }
#endif
    /* Helpers may return positive values on success; normalise to 0. */
    ret = 0;
 out:
    cpu_sync_bndcs_hflags(&cpu->env);
    return ret;
}
4842 
/*
 * Per-architecture hook that prepares a vCPU and its kvm_run area for the
 * next guest entry.
 *
 * It forwards pending NMI/SMI requests to the kernel, sets cpu->exit_request
 * for pending INIT (outside SMM) or TPR-access requests, and - when the PIC
 * is not in the kernel - injects a pending hard interrupt if the guest can
 * take it, requests an interrupt-window exit otherwise, and loads the TPR
 * into run->cr8.
 *
 * Injection failures are reported on stderr and otherwise ignored.
 */
void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int ret;

    /* Inject NMI and/or SMI */
    if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            /* The request bit is cleared under the BQL. */
            bql_lock();
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            bql_unlock();
            DPRINTF("injected NMI\n");
            ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
            if (ret < 0) {
                fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
                        strerror(-ret));
            }
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            /* The request bit is cleared under the BQL. */
            bql_lock();
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
            bql_unlock();
            DPRINTF("injected SMI\n");
            ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
            if (ret < 0) {
                fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
                        strerror(-ret));
            }
        }
    }

    /*
     * Without an in-kernel PIC the BQL is held from here until the matching
     * bql_unlock() at the end of the userspace-PIC branch below.
     */
    if (!kvm_pic_in_kernel()) {
        bql_lock();
    }

    /* Force the VCPU out of its inner loop to process any INIT requests
     * or (for userspace APIC, but it is cheap to combine the checks here)
     * pending TPR access reports.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    if (!kvm_pic_in_kernel()) {
        /* Try to inject an interrupt if the guest can accept it */
        if (run->ready_for_interrupt_injection &&
            (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
            (env->eflags & IF_MASK)) {
            int irq;

            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
            irq = cpu_get_pic_interrupt(env);
            /* A negative irq means there is nothing to inject. */
            if (irq >= 0) {
                struct kvm_interrupt intr;

                intr.irq = irq;
                DPRINTF("injected interrupt %d\n", irq);
                ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
                if (ret < 0) {
                    fprintf(stderr,
                            "KVM: injection failed, interrupt lost (%s)\n",
                            strerror(-ret));
                }
            }
        }

        /* If we have an interrupt but the guest is not ready to receive an
         * interrupt, request an interrupt window exit.  This will
         * cause a return to userspace as soon as the guest is ready to
         * receive interrupts. */
        if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
            run->request_interrupt_window = 1;
        } else {
            run->request_interrupt_window = 0;
        }

        DPRINTF("setting tpr\n");
        run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);

        bql_unlock();
    }
}
4932 
4933 static void kvm_rate_limit_on_bus_lock(void)
4934 {
4935     uint64_t delay_ns = ratelimit_calculate_delay(&bus_lock_ratelimit_ctrl, 1);
4936 
4937     if (delay_ns) {
4938         g_usleep(delay_ns / SCALE_US);
4939     }
4940 }
4941 
4942 MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
4943 {
4944     X86CPU *x86_cpu = X86_CPU(cpu);
4945     CPUX86State *env = &x86_cpu->env;
4946 
4947     if (run->flags & KVM_RUN_X86_SMM) {
4948         env->hflags |= HF_SMM_MASK;
4949     } else {
4950         env->hflags &= ~HF_SMM_MASK;
4951     }
4952     if (run->if_flag) {
4953         env->eflags |= IF_MASK;
4954     } else {
4955         env->eflags &= ~IF_MASK;
4956     }
4957     if (run->flags & KVM_RUN_X86_BUS_LOCK) {
4958         kvm_rate_limit_on_bus_lock();
4959     }
4960 
4961 #ifdef CONFIG_XEN_EMU
4962     /*
4963      * If the callback is asserted as a GSI (or PCI INTx) then check if
4964      * vcpu_info->evtchn_upcall_pending has been cleared, and deassert
4965      * the callback IRQ if so. Ideally we could hook into the PIC/IOAPIC
4966      * EOI and only resample then, exactly how the VFIO eventfd pairs
4967      * are designed to work for level triggered interrupts.
4968      */
4969     if (x86_cpu->env.xen_callback_asserted) {
4970         kvm_xen_maybe_deassert_callback(cpu);
4971     }
4972 #endif
4973 
4974     /* We need to protect the apic state against concurrent accesses from
4975      * different threads in case the userspace irqchip is used. */
4976     if (!kvm_irqchip_in_kernel()) {
4977         bql_lock();
4978     }
4979     cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
4980     cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
4981     if (!kvm_irqchip_in_kernel()) {
4982         bql_unlock();
4983     }
4984     return cpu_get_mem_attrs(env);
4985 }
4986 
/*
 * Handle interrupt requests that have to be processed outside the guest:
 * machine-check, INIT, and - with a userspace irqchip only - APIC polling,
 * wake-up from halt, SIPI and TPR-access reports.
 *
 * Returns 0 when the irqchip is in the kernel (or after requesting a system
 * reset for a machine check on top of a double fault); otherwise returns
 * cs->halted.
 */
int kvm_arch_process_async_events(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
        assert(env->mcg_cap);

        cs->interrupt_request &= ~CPU_INTERRUPT_MCE;

        /* Refresh the cached state before inspecting env->exception_nr. */
        kvm_cpu_synchronize_state(cs);

        if (env->exception_nr == EXCP08_DBLE) {
            /* this means triple fault */
            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
            cs->exit_request = 1;
            return 0;
        }
        kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
        env->has_error_code = 0;

        /* Make sure a halted vCPU resumes so it can take the exception. */
        cs->halted = 0;
        if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
            env->mp_state = KVM_MP_STATE_RUNNABLE;
        }
    }

    /* INIT is not processed while the vCPU is in SMM. */
    if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        kvm_cpu_synchronize_state(cs);
        do_cpu_init(cpu);
    }

    /* Everything below is only needed for a userspace irqchip. */
    if (kvm_irqchip_in_kernel()) {
        return 0;
    }

    if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
        cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(cpu->apic_state);
    }
    /* A deliverable hard interrupt or an NMI takes the vCPU out of halt. */
    if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
        cs->halted = 0;
    }
    if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
        kvm_cpu_synchronize_state(cs);
        do_cpu_sipi(cpu);
    }
    if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
        cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
        kvm_cpu_synchronize_state(cs);
        apic_handle_tpr_access_report(cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }

    return cs->halted;
}
5047 
5048 static int kvm_handle_halt(X86CPU *cpu)
5049 {
5050     CPUState *cs = CPU(cpu);
5051     CPUX86State *env = &cpu->env;
5052 
5053     if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
5054           (env->eflags & IF_MASK)) &&
5055         !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
5056         cs->halted = 1;
5057         return EXCP_HLT;
5058     }
5059 
5060     return 0;
5061 }
5062 
5063 static int kvm_handle_tpr_access(X86CPU *cpu)
5064 {
5065     CPUState *cs = CPU(cpu);
5066     struct kvm_run *run = cs->kvm_run;
5067 
5068     apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip,
5069                                   run->tpr_access.is_write ? TPR_ACCESS_WRITE
5070                                                            : TPR_ACCESS_READ);
5071     return 1;
5072 }
5073 
5074 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
5075 {
5076     static const uint8_t int3 = 0xcc;
5077 
5078     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
5079         cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) {
5080         return -EINVAL;
5081     }
5082     return 0;
5083 }
5084 
5085 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
5086 {
5087     uint8_t int3;
5088 
5089     if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0)) {
5090         return -EINVAL;
5091     }
5092     if (int3 != 0xcc) {
5093         return 0;
5094     }
5095     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
5096         return -EINVAL;
5097     }
5098     return 0;
5099 }
5100 
/*
 * Table of hardware breakpoints and watchpoints requested through
 * kvm_arch_insert_hw_breakpoint().  It has four slots; the entries in
 * [0, nb_hw_breakpoint) are the ones in use.
 */
static struct {
    target_ulong addr; /* guest address of the breakpoint/watchpoint */
    int len;           /* length in bytes (1, 2, 4 or 8) */
    int type;          /* GDB_BREAKPOINT_HW or one of GDB_WATCHPOINT_* */
} hw_breakpoint[4];

/* Number of occupied entries at the front of hw_breakpoint[]. */
static int nb_hw_breakpoint;
5108 
5109 static int find_hw_breakpoint(target_ulong addr, int len, int type)
5110 {
5111     int n;
5112 
5113     for (n = 0; n < nb_hw_breakpoint; n++) {
5114         if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
5115             (hw_breakpoint[n].len == len || len == -1)) {
5116             return n;
5117         }
5118     }
5119     return -1;
5120 }
5121 
5122 int kvm_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type)
5123 {
5124     switch (type) {
5125     case GDB_BREAKPOINT_HW:
5126         len = 1;
5127         break;
5128     case GDB_WATCHPOINT_WRITE:
5129     case GDB_WATCHPOINT_ACCESS:
5130         switch (len) {
5131         case 1:
5132             break;
5133         case 2:
5134         case 4:
5135         case 8:
5136             if (addr & (len - 1)) {
5137                 return -EINVAL;
5138             }
5139             break;
5140         default:
5141             return -EINVAL;
5142         }
5143         break;
5144     default:
5145         return -ENOSYS;
5146     }
5147 
5148     if (nb_hw_breakpoint == 4) {
5149         return -ENOBUFS;
5150     }
5151     if (find_hw_breakpoint(addr, len, type) >= 0) {
5152         return -EEXIST;
5153     }
5154     hw_breakpoint[nb_hw_breakpoint].addr = addr;
5155     hw_breakpoint[nb_hw_breakpoint].len = len;
5156     hw_breakpoint[nb_hw_breakpoint].type = type;
5157     nb_hw_breakpoint++;
5158 
5159     return 0;
5160 }
5161 
5162 int kvm_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type)
5163 {
5164     int n;
5165 
5166     n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
5167     if (n < 0) {
5168         return -ENOENT;
5169     }
5170     nb_hw_breakpoint--;
5171     hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];
5172 
5173     return 0;
5174 }
5175 
/*
 * Drop every registered hardware breakpoint/watchpoint by resetting the
 * count of used hw_breakpoint[] entries; the entries themselves are not
 * cleared.
 */
void kvm_arch_remove_all_hw_breakpoints(void)
{
    nb_hw_breakpoint = 0;
}
5180 
5181 static CPUWatchpoint hw_watchpoint;
5182 
5183 static int kvm_handle_debug(X86CPU *cpu,
5184                             struct kvm_debug_exit_arch *arch_info)
5185 {
5186     CPUState *cs = CPU(cpu);
5187     CPUX86State *env = &cpu->env;
5188     int ret = 0;
5189     int n;
5190 
5191     if (arch_info->exception == EXCP01_DB) {
5192         if (arch_info->dr6 & DR6_BS) {
5193             if (cs->singlestep_enabled) {
5194                 ret = EXCP_DEBUG;
5195             }
5196         } else {
5197             for (n = 0; n < 4; n++) {
5198                 if (arch_info->dr6 & (1 << n)) {
5199                     switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
5200                     case 0x0:
5201                         ret = EXCP_DEBUG;
5202                         break;
5203                     case 0x1:
5204                         ret = EXCP_DEBUG;
5205                         cs->watchpoint_hit = &hw_watchpoint;
5206                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
5207                         hw_watchpoint.flags = BP_MEM_WRITE;
5208                         break;
5209                     case 0x3:
5210                         ret = EXCP_DEBUG;
5211                         cs->watchpoint_hit = &hw_watchpoint;
5212                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
5213                         hw_watchpoint.flags = BP_MEM_ACCESS;
5214                         break;
5215                     }
5216                 }
5217             }
5218         }
5219     } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
5220         ret = EXCP_DEBUG;
5221     }
5222     if (ret == 0) {
5223         cpu_synchronize_state(cs);
5224         assert(env->exception_nr == -1);
5225 
5226         /* pass to guest */
5227         kvm_queue_exception(env, arch_info->exception,
5228                             arch_info->exception == EXCP01_DB,
5229                             arch_info->dr6);
5230         env->has_error_code = 0;
5231     }
5232 
5233     return ret;
5234 }
5235 
5236 void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
5237 {
5238     const uint8_t type_code[] = {
5239         [GDB_BREAKPOINT_HW] = 0x0,
5240         [GDB_WATCHPOINT_WRITE] = 0x1,
5241         [GDB_WATCHPOINT_ACCESS] = 0x3
5242     };
5243     const uint8_t len_code[] = {
5244         [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
5245     };
5246     int n;
5247 
5248     if (kvm_sw_breakpoints_active(cpu)) {
5249         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
5250     }
5251     if (nb_hw_breakpoint > 0) {
5252         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
5253         dbg->arch.debugreg[7] = 0x0600;
5254         for (n = 0; n < nb_hw_breakpoint; n++) {
5255             dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
5256             dbg->arch.debugreg[7] |= (2 << (n * 2)) |
5257                 (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
5258                 ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
5259         }
5260     }
5261 }
5262 
5263 static bool kvm_install_msr_filters(KVMState *s)
5264 {
5265     uint64_t zero = 0;
5266     struct kvm_msr_filter filter = {
5267         .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
5268     };
5269     int r, i, j = 0;
5270 
5271     for (i = 0; i < KVM_MSR_FILTER_MAX_RANGES; i++) {
5272         KVMMSRHandlers *handler = &msr_handlers[i];
5273         if (handler->msr) {
5274             struct kvm_msr_filter_range *range = &filter.ranges[j++];
5275 
5276             *range = (struct kvm_msr_filter_range) {
5277                 .flags = 0,
5278                 .nmsrs = 1,
5279                 .base = handler->msr,
5280                 .bitmap = (__u8 *)&zero,
5281             };
5282 
5283             if (handler->rdmsr) {
5284                 range->flags |= KVM_MSR_FILTER_READ;
5285             }
5286 
5287             if (handler->wrmsr) {
5288                 range->flags |= KVM_MSR_FILTER_WRITE;
5289             }
5290         }
5291     }
5292 
5293     r = kvm_vm_ioctl(s, KVM_X86_SET_MSR_FILTER, &filter);
5294     if (r) {
5295         return false;
5296     }
5297 
5298     return true;
5299 }
5300 
5301 bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
5302                     QEMUWRMSRHandler *wrmsr)
5303 {
5304     int i;
5305 
5306     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
5307         if (!msr_handlers[i].msr) {
5308             msr_handlers[i] = (KVMMSRHandlers) {
5309                 .msr = msr,
5310                 .rdmsr = rdmsr,
5311                 .wrmsr = wrmsr,
5312             };
5313 
5314             if (!kvm_install_msr_filters(s)) {
5315                 msr_handlers[i] = (KVMMSRHandlers) { };
5316                 return false;
5317             }
5318 
5319             return true;
5320         }
5321     }
5322 
5323     return false;
5324 }
5325 
5326 static int kvm_handle_rdmsr(X86CPU *cpu, struct kvm_run *run)
5327 {
5328     int i;
5329     bool r;
5330 
5331     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
5332         KVMMSRHandlers *handler = &msr_handlers[i];
5333         if (run->msr.index == handler->msr) {
5334             if (handler->rdmsr) {
5335                 r = handler->rdmsr(cpu, handler->msr,
5336                                    (uint64_t *)&run->msr.data);
5337                 run->msr.error = r ? 0 : 1;
5338                 return 0;
5339             }
5340         }
5341     }
5342 
5343     assert(false);
5344 }
5345 
5346 static int kvm_handle_wrmsr(X86CPU *cpu, struct kvm_run *run)
5347 {
5348     int i;
5349     bool r;
5350 
5351     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
5352         KVMMSRHandlers *handler = &msr_handlers[i];
5353         if (run->msr.index == handler->msr) {
5354             if (handler->wrmsr) {
5355                 r = handler->wrmsr(cpu, handler->msr, run->msr.data);
5356                 run->msr.error = r ? 0 : 1;
5357                 return 0;
5358             }
5359         }
5360     }
5361 
5362     assert(false);
5363 }
5364 
5365 static bool has_sgx_provisioning;
5366 
5367 static bool __kvm_enable_sgx_provisioning(KVMState *s)
5368 {
5369     int fd, ret;
5370 
5371     if (!kvm_vm_check_extension(s, KVM_CAP_SGX_ATTRIBUTE)) {
5372         return false;
5373     }
5374 
5375     fd = qemu_open_old("/dev/sgx_provision", O_RDONLY);
5376     if (fd < 0) {
5377         return false;
5378     }
5379 
5380     ret = kvm_vm_enable_cap(s, KVM_CAP_SGX_ATTRIBUTE, 0, fd);
5381     if (ret) {
5382         error_report("Could not enable SGX PROVISIONKEY: %s", strerror(-ret));
5383         exit(1);
5384     }
5385     close(fd);
5386     return true;
5387 }
5388 
5389 bool kvm_enable_sgx_provisioning(KVMState *s)
5390 {
5391     return MEMORIZE(__kvm_enable_sgx_provisioning(s), has_sgx_provisioning);
5392 }
5393 
5394 static bool host_supports_vmx(void)
5395 {
5396     uint32_t ecx, unused;
5397 
5398     host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
5399     return ecx & CPUID_EXT_VMX;
5400 }
5401 
5402 /*
5403  * Currently the handling here only supports use of KVM_HC_MAP_GPA_RANGE
5404  * to service guest-initiated memory attribute update requests so that
5405  * KVM_SET_MEMORY_ATTRIBUTES can update whether or not a page should be
5406  * backed by the private memory pool provided by guest_memfd, and as such
5407  * is only applicable to guest_memfd-backed guests (e.g. SNP/TDX).
5408  *
 * Other use-cases for KVM_HC_MAP_GPA_RANGE, such as for SEV live
5410  * migration, are not implemented here currently.
5411  *
5412  * For the guest_memfd use-case, these exits will generally be synthesized
5413  * by KVM based on platform-specific hypercalls, like GHCB requests in the
 * case of SEV-SNP, and not issued directly within the guest through the
5415  * KVM_HC_MAP_GPA_RANGE hypercall. So in this case, KVM_HC_MAP_GPA_RANGE is
5416  * not actually advertised to guests via the KVM CPUID feature bit, as
5417  * opposed to SEV live migration where it would be. Since it is unlikely the
5418  * SEV live migration use-case would be useful for guest-memfd backed guests,
5419  * because private/shared page tracking is already provided through other
5420  * means, these 2 use-cases should be treated as being mutually-exclusive.
5421  */
5422 static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
5423 {
5424     uint64_t gpa, size, attributes;
5425 
5426     if (!machine_require_guest_memfd(current_machine))
5427         return -EINVAL;
5428 
5429     gpa = run->hypercall.args[0];
5430     size = run->hypercall.args[1] * TARGET_PAGE_SIZE;
5431     attributes = run->hypercall.args[2];
5432 
5433     trace_kvm_hc_map_gpa_range(gpa, size, attributes, run->hypercall.flags);
5434 
5435     return kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
5436 }
5437 
5438 static int kvm_handle_hypercall(struct kvm_run *run)
5439 {
5440     if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE)
5441         return kvm_handle_hc_map_gpa_range(run);
5442 
5443     return -EINVAL;
5444 }
5445 
5446 #define VMX_INVALID_GUEST_STATE 0x80000021
5447 
5448 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
5449 {
5450     X86CPU *cpu = X86_CPU(cs);
5451     uint64_t code;
5452     int ret;
5453     bool ctx_invalid;
5454     KVMState *state;
5455 
5456     switch (run->exit_reason) {
5457     case KVM_EXIT_HLT:
5458         DPRINTF("handle_hlt\n");
5459         bql_lock();
5460         ret = kvm_handle_halt(cpu);
5461         bql_unlock();
5462         break;
5463     case KVM_EXIT_SET_TPR:
5464         ret = 0;
5465         break;
5466     case KVM_EXIT_TPR_ACCESS:
5467         bql_lock();
5468         ret = kvm_handle_tpr_access(cpu);
5469         bql_unlock();
5470         break;
5471     case KVM_EXIT_FAIL_ENTRY:
5472         code = run->fail_entry.hardware_entry_failure_reason;
5473         fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
5474                 code);
5475         if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
5476             fprintf(stderr,
5477                     "\nIf you're running a guest on an Intel machine without "
5478                         "unrestricted mode\n"
5479                     "support, the failure can be most likely due to the guest "
5480                         "entering an invalid\n"
5481                     "state for Intel VT. For example, the guest maybe running "
5482                         "in big real mode\n"
5483                     "which is not supported on less recent Intel processors."
5484                         "\n\n");
5485         }
5486         ret = -1;
5487         break;
5488     case KVM_EXIT_EXCEPTION:
5489         fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
5490                 run->ex.exception, run->ex.error_code);
5491         ret = -1;
5492         break;
5493     case KVM_EXIT_DEBUG:
5494         DPRINTF("kvm_exit_debug\n");
5495         bql_lock();
5496         ret = kvm_handle_debug(cpu, &run->debug.arch);
5497         bql_unlock();
5498         break;
5499     case KVM_EXIT_HYPERV:
5500         ret = kvm_hv_handle_exit(cpu, &run->hyperv);
5501         break;
5502     case KVM_EXIT_IOAPIC_EOI:
5503         ioapic_eoi_broadcast(run->eoi.vector);
5504         ret = 0;
5505         break;
5506     case KVM_EXIT_X86_BUS_LOCK:
5507         /* already handled in kvm_arch_post_run */
5508         ret = 0;
5509         break;
5510     case KVM_EXIT_NOTIFY:
5511         ctx_invalid = !!(run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID);
5512         state = KVM_STATE(current_accel());
5513         if (ctx_invalid ||
5514             state->notify_vmexit == NOTIFY_VMEXIT_OPTION_INTERNAL_ERROR) {
5515             warn_report("KVM internal error: Encountered a notify exit "
5516                         "with invalid context in guest.");
5517             ret = -1;
5518         } else {
5519             warn_report_once("KVM: Encountered a notify exit with valid "
5520                              "context in guest. "
5521                              "The guest could be misbehaving.");
5522             ret = 0;
5523         }
5524         break;
5525     case KVM_EXIT_X86_RDMSR:
5526         /* We only enable MSR filtering, any other exit is bogus */
5527         assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER);
5528         ret = kvm_handle_rdmsr(cpu, run);
5529         break;
5530     case KVM_EXIT_X86_WRMSR:
5531         /* We only enable MSR filtering, any other exit is bogus */
5532         assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER);
5533         ret = kvm_handle_wrmsr(cpu, run);
5534         break;
5535 #ifdef CONFIG_XEN_EMU
5536     case KVM_EXIT_XEN:
5537         ret = kvm_xen_handle_exit(cpu, &run->xen);
5538         break;
5539 #endif
5540     case KVM_EXIT_HYPERCALL:
5541         ret = kvm_handle_hypercall(run);
5542         break;
5543     default:
5544         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
5545         ret = -1;
5546         break;
5547     }
5548 
5549     return ret;
5550 }
5551 
5552 bool kvm_arch_stop_on_emulation_error(CPUState *cs)
5553 {
5554     X86CPU *cpu = X86_CPU(cs);
5555     CPUX86State *env = &cpu->env;
5556 
5557     kvm_cpu_synchronize_state(cs);
5558     return !(env->cr[0] & CR0_PE_MASK) ||
5559            ((env->segs[R_CS].selector  & 3) != 3);
5560 }
5561 
5562 void kvm_arch_init_irq_routing(KVMState *s)
5563 {
5564     /* We know at this point that we're using the in-kernel
5565      * irqchip, so we can use irqfds, and on x86 we know
5566      * we can use msi via irqfd and GSI routing.
5567      */
5568     kvm_msi_via_irqfd_allowed = true;
5569     kvm_gsi_routing_allowed = true;
5570 
5571     if (kvm_irqchip_is_split()) {
5572         KVMRouteChange c = kvm_irqchip_begin_route_changes(s);
5573         int i;
5574 
5575         /* If the ioapic is in QEMU and the lapics are in KVM, reserve
5576            MSI routes for signaling interrupts to the local apics. */
5577         for (i = 0; i < IOAPIC_NUM_PINS; i++) {
5578             if (kvm_irqchip_add_msi_route(&c, 0, NULL) < 0) {
5579                 error_report("Could not enable split IRQ mode.");
5580                 exit(1);
5581             }
5582         }
5583         kvm_irqchip_commit_route_changes(&c);
5584     }
5585 }
5586 
5587 int kvm_arch_irqchip_create(KVMState *s)
5588 {
5589     int ret;
5590     if (kvm_kernel_irqchip_split()) {
5591         ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24);
5592         if (ret) {
5593             error_report("Could not enable split irqchip mode: %s",
5594                          strerror(-ret));
5595             exit(1);
5596         } else {
5597             DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n");
5598             kvm_split_irqchip = true;
5599             return 1;
5600         }
5601     } else {
5602         return 0;
5603     }
5604 }
5605 
5606 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)
5607 {
5608     CPUX86State *env;
5609     uint64_t ext_id;
5610 
5611     if (!first_cpu) {
5612         return address;
5613     }
5614     env = &X86_CPU(first_cpu)->env;
5615     if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) {
5616         return address;
5617     }
5618 
5619     /*
5620      * If the remappable format bit is set, or the upper bits are
5621      * already set in address_hi, or the low extended bits aren't
5622      * there anyway, do nothing.
5623      */
5624     ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT);
5625     if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) {
5626         return address;
5627     }
5628 
5629     address &= ~ext_id;
5630     address |= ext_id << 35;
5631     return address;
5632 }
5633 
5634 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
5635                              uint64_t address, uint32_t data, PCIDevice *dev)
5636 {
5637     X86IOMMUState *iommu = x86_iommu_get_default();
5638 
5639     if (iommu) {
5640         X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu);
5641 
5642         if (class->int_remap) {
5643             int ret;
5644             MSIMessage src, dst;
5645 
5646             src.address = route->u.msi.address_hi;
5647             src.address <<= VTD_MSI_ADDR_HI_SHIFT;
5648             src.address |= route->u.msi.address_lo;
5649             src.data = route->u.msi.data;
5650 
5651             ret = class->int_remap(iommu, &src, &dst, dev ?     \
5652                                    pci_requester_id(dev) :      \
5653                                    X86_IOMMU_SID_INVALID);
5654             if (ret) {
5655                 trace_kvm_x86_fixup_msi_error(route->gsi);
5656                 return 1;
5657             }
5658 
5659             /*
5660              * Handled untranslated compatibility format interrupt with
5661              * extended destination ID in the low bits 11-5. */
5662             dst.address = kvm_swizzle_msi_ext_dest_id(dst.address);
5663 
5664             route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
5665             route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
5666             route->u.msi.data = dst.data;
5667             return 0;
5668         }
5669     }
5670 
5671 #ifdef CONFIG_XEN_EMU
5672     if (xen_mode == XEN_EMULATE) {
5673         int handled = xen_evtchn_translate_pirq_msi(route, address, data);
5674 
5675         /*
5676          * If it was a PIRQ and successfully routed (handled == 0) or it was
5677          * an error (handled < 0), return. If it wasn't a PIRQ, keep going.
5678          */
5679         if (handled <= 0) {
5680             return handled;
5681         }
5682     }
5683 #endif
5684 
5685     address = kvm_swizzle_msi_ext_dest_id(address);
5686     route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT;
5687     route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK;
5688     return 0;
5689 }
5690 
5691 typedef struct MSIRouteEntry MSIRouteEntry;
5692 
5693 struct MSIRouteEntry {
5694     PCIDevice *dev;             /* Device pointer */
5695     int vector;                 /* MSI/MSIX vector index */
5696     int virq;                   /* Virtual IRQ index */
5697     QLIST_ENTRY(MSIRouteEntry) list;
5698 };
5699 
5700 /* List of used GSI routes */
5701 static QLIST_HEAD(, MSIRouteEntry) msi_route_list = \
5702     QLIST_HEAD_INITIALIZER(msi_route_list);
5703 
5704 void kvm_update_msi_routes_all(void *private, bool global,
5705                                uint32_t index, uint32_t mask)
5706 {
5707     int cnt = 0, vector;
5708     MSIRouteEntry *entry;
5709     MSIMessage msg;
5710     PCIDevice *dev;
5711 
5712     /* TODO: explicit route update */
5713     QLIST_FOREACH(entry, &msi_route_list, list) {
5714         cnt++;
5715         vector = entry->vector;
5716         dev = entry->dev;
5717         if (msix_enabled(dev) && !msix_is_masked(dev, vector)) {
5718             msg = msix_get_message(dev, vector);
5719         } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) {
5720             msg = msi_get_message(dev, vector);
5721         } else {
5722             /*
5723              * Either MSI/MSIX is disabled for the device, or the
5724              * specific message was masked out.  Skip this one.
5725              */
5726             continue;
5727         }
5728         kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
5729     }
5730     kvm_irqchip_commit_routes(kvm_state);
5731     trace_kvm_x86_update_msi_routes(cnt);
5732 }
5733 
5734 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
5735                                 int vector, PCIDevice *dev)
5736 {
5737     static bool notify_list_inited = false;
5738     MSIRouteEntry *entry;
5739 
5740     if (!dev) {
5741         /* These are (possibly) IOAPIC routes only used for split
5742          * kernel irqchip mode, while what we are housekeeping are
5743          * PCI devices only. */
5744         return 0;
5745     }
5746 
5747     entry = g_new0(MSIRouteEntry, 1);
5748     entry->dev = dev;
5749     entry->vector = vector;
5750     entry->virq = route->gsi;
5751     QLIST_INSERT_HEAD(&msi_route_list, entry, list);
5752 
5753     trace_kvm_x86_add_msi_route(route->gsi);
5754 
5755     if (!notify_list_inited) {
5756         /* For the first time we do add route, add ourselves into
5757          * IOMMU's IEC notify list if needed. */
5758         X86IOMMUState *iommu = x86_iommu_get_default();
5759         if (iommu) {
5760             x86_iommu_iec_register_notifier(iommu,
5761                                             kvm_update_msi_routes_all,
5762                                             NULL);
5763         }
5764         notify_list_inited = true;
5765     }
5766     return 0;
5767 }
5768 
5769 int kvm_arch_release_virq_post(int virq)
5770 {
5771     MSIRouteEntry *entry, *next;
5772     QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) {
5773         if (entry->virq == virq) {
5774             trace_kvm_x86_remove_msi_route(virq);
5775             QLIST_REMOVE(entry, list);
5776             g_free(entry);
5777             break;
5778         }
5779     }
5780     return 0;
5781 }
5782 
/*
 * Unconditionally aborts; @data is ignored.  Presumably this hook is
 * never expected to be called on x86 -- confirm against the callers.
 */
int kvm_arch_msi_data_to_gsi(uint32_t data)
{
    abort();
}
5787 
5788 bool kvm_has_waitpkg(void)
5789 {
5790     return has_msr_umwait;
5791 }
5792 
5793 #define ARCH_REQ_XCOMP_GUEST_PERM       0x1025
5794 
5795 void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
5796 {
5797     KVMState *s = kvm_state;
5798     uint64_t supported;
5799 
5800     mask &= XSTATE_DYNAMIC_MASK;
5801     if (!mask) {
5802         return;
5803     }
5804     /*
5805      * Just ignore bits that are not in CPUID[EAX=0xD,ECX=0].
5806      * ARCH_REQ_XCOMP_GUEST_PERM would fail, and QEMU has warned
5807      * about them already because they are not supported features.
5808      */
5809     supported = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
5810     supported |= (uint64_t)kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EDX) << 32;
5811     mask &= supported;
5812 
5813     while (mask) {
5814         int bit = ctz64(mask);
5815         int rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
5816         if (rc) {
5817             /*
5818              * Older kernel version (<5.17) do not support
5819              * ARCH_REQ_XCOMP_GUEST_PERM, but also do not return
5820              * any dynamic feature from kvm_arch_get_supported_cpuid.
5821              */
5822             warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure "
5823                         "for feature bit %d", bit);
5824         }
5825         mask &= ~BIT_ULL(bit);
5826     }
5827 }
5828 
5829 static int kvm_arch_get_notify_vmexit(Object *obj, Error **errp)
5830 {
5831     KVMState *s = KVM_STATE(obj);
5832     return s->notify_vmexit;
5833 }
5834 
5835 static void kvm_arch_set_notify_vmexit(Object *obj, int value, Error **errp)
5836 {
5837     KVMState *s = KVM_STATE(obj);
5838 
5839     if (s->fd != -1) {
5840         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
5841         return;
5842     }
5843 
5844     s->notify_vmexit = value;
5845 }
5846 
5847 static void kvm_arch_get_notify_window(Object *obj, Visitor *v,
5848                                        const char *name, void *opaque,
5849                                        Error **errp)
5850 {
5851     KVMState *s = KVM_STATE(obj);
5852     uint32_t value = s->notify_window;
5853 
5854     visit_type_uint32(v, name, &value, errp);
5855 }
5856 
5857 static void kvm_arch_set_notify_window(Object *obj, Visitor *v,
5858                                        const char *name, void *opaque,
5859                                        Error **errp)
5860 {
5861     KVMState *s = KVM_STATE(obj);
5862     uint32_t value;
5863 
5864     if (s->fd != -1) {
5865         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
5866         return;
5867     }
5868 
5869     if (!visit_type_uint32(v, name, &value, errp)) {
5870         return;
5871     }
5872 
5873     s->notify_window = value;
5874 }
5875 
5876 static void kvm_arch_get_xen_version(Object *obj, Visitor *v,
5877                                      const char *name, void *opaque,
5878                                      Error **errp)
5879 {
5880     KVMState *s = KVM_STATE(obj);
5881     uint32_t value = s->xen_version;
5882 
5883     visit_type_uint32(v, name, &value, errp);
5884 }
5885 
5886 static void kvm_arch_set_xen_version(Object *obj, Visitor *v,
5887                                      const char *name, void *opaque,
5888                                      Error **errp)
5889 {
5890     KVMState *s = KVM_STATE(obj);
5891     Error *error = NULL;
5892     uint32_t value;
5893 
5894     visit_type_uint32(v, name, &value, &error);
5895     if (error) {
5896         error_propagate(errp, error);
5897         return;
5898     }
5899 
5900     s->xen_version = value;
5901     if (value && xen_mode == XEN_DISABLED) {
5902         xen_mode = XEN_EMULATE;
5903     }
5904 }
5905 
5906 static void kvm_arch_get_xen_gnttab_max_frames(Object *obj, Visitor *v,
5907                                                const char *name, void *opaque,
5908                                                Error **errp)
5909 {
5910     KVMState *s = KVM_STATE(obj);
5911     uint16_t value = s->xen_gnttab_max_frames;
5912 
5913     visit_type_uint16(v, name, &value, errp);
5914 }
5915 
5916 static void kvm_arch_set_xen_gnttab_max_frames(Object *obj, Visitor *v,
5917                                                const char *name, void *opaque,
5918                                                Error **errp)
5919 {
5920     KVMState *s = KVM_STATE(obj);
5921     Error *error = NULL;
5922     uint16_t value;
5923 
5924     visit_type_uint16(v, name, &value, &error);
5925     if (error) {
5926         error_propagate(errp, error);
5927         return;
5928     }
5929 
5930     s->xen_gnttab_max_frames = value;
5931 }
5932 
5933 static void kvm_arch_get_xen_evtchn_max_pirq(Object *obj, Visitor *v,
5934                                              const char *name, void *opaque,
5935                                              Error **errp)
5936 {
5937     KVMState *s = KVM_STATE(obj);
5938     uint16_t value = s->xen_evtchn_max_pirq;
5939 
5940     visit_type_uint16(v, name, &value, errp);
5941 }
5942 
5943 static void kvm_arch_set_xen_evtchn_max_pirq(Object *obj, Visitor *v,
5944                                              const char *name, void *opaque,
5945                                              Error **errp)
5946 {
5947     KVMState *s = KVM_STATE(obj);
5948     Error *error = NULL;
5949     uint16_t value;
5950 
5951     visit_type_uint16(v, name, &value, &error);
5952     if (error) {
5953         error_propagate(errp, error);
5954         return;
5955     }
5956 
5957     s->xen_evtchn_max_pirq = value;
5958 }
5959 
5960 void kvm_arch_accel_class_init(ObjectClass *oc)
5961 {
5962     object_class_property_add_enum(oc, "notify-vmexit", "NotifyVMexitOption",
5963                                    &NotifyVmexitOption_lookup,
5964                                    kvm_arch_get_notify_vmexit,
5965                                    kvm_arch_set_notify_vmexit);
5966     object_class_property_set_description(oc, "notify-vmexit",
5967                                           "Enable notify VM exit");
5968 
5969     object_class_property_add(oc, "notify-window", "uint32",
5970                               kvm_arch_get_notify_window,
5971                               kvm_arch_set_notify_window,
5972                               NULL, NULL);
5973     object_class_property_set_description(oc, "notify-window",
5974                                           "Clock cycles without an event window "
5975                                           "after which a notification VM exit occurs");
5976 
5977     object_class_property_add(oc, "xen-version", "uint32",
5978                               kvm_arch_get_xen_version,
5979                               kvm_arch_set_xen_version,
5980                               NULL, NULL);
5981     object_class_property_set_description(oc, "xen-version",
5982                                           "Xen version to be emulated "
5983                                           "(in XENVER_version form "
5984                                           "e.g. 0x4000a for 4.10)");
5985 
5986     object_class_property_add(oc, "xen-gnttab-max-frames", "uint16",
5987                               kvm_arch_get_xen_gnttab_max_frames,
5988                               kvm_arch_set_xen_gnttab_max_frames,
5989                               NULL, NULL);
5990     object_class_property_set_description(oc, "xen-gnttab-max-frames",
5991                                           "Maximum number of grant table frames");
5992 
5993     object_class_property_add(oc, "xen-evtchn-max-pirq", "uint16",
5994                               kvm_arch_get_xen_evtchn_max_pirq,
5995                               kvm_arch_set_xen_evtchn_max_pirq,
5996                               NULL, NULL);
5997     object_class_property_set_description(oc, "xen-evtchn-max-pirq",
5998                                           "Maximum number of Xen PIRQs");
5999 }
6000 
6001 void kvm_set_max_apic_id(uint32_t max_apic_id)
6002 {
6003     kvm_vm_enable_cap(kvm_state, KVM_CAP_MAX_VCPU_ID, 0, max_apic_id);
6004 }
6005