xref: /openbmc/qemu/target/i386/kvm/kvm.c (revision 940e43aa30e0f793bd18b79221296cdf17724018)
1 /*
2  * QEMU KVM support
3  *
4  * Copyright (C) 2006-2008 Qumranet Technologies
5  * Copyright IBM, Corp. 2008
6  *
7  * Authors:
8  *  Anthony Liguori   <aliguori@us.ibm.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
11  * See the COPYING file in the top-level directory.
12  *
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qapi/qapi-events-run-state.h"
17 #include "qapi/error.h"
18 #include <sys/ioctl.h>
19 #include <sys/utsname.h>
20 
21 #include <linux/kvm.h>
22 #include "standard-headers/asm-x86/kvm_para.h"
23 
24 #include "cpu.h"
25 #include "sysemu/sysemu.h"
26 #include "sysemu/hw_accel.h"
27 #include "sysemu/kvm_int.h"
28 #include "sysemu/runstate.h"
29 #include "kvm_i386.h"
30 #include "hyperv.h"
31 #include "hyperv-proto.h"
32 
33 #include "exec/gdbstub.h"
34 #include "qemu/host-utils.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/config-file.h"
37 #include "qemu/error-report.h"
38 #include "hw/i386/x86.h"
39 #include "hw/i386/apic.h"
40 #include "hw/i386/apic_internal.h"
41 #include "hw/i386/apic-msidef.h"
42 #include "hw/i386/intel_iommu.h"
43 #include "hw/i386/x86-iommu.h"
44 #include "hw/i386/e820_memory_layout.h"
45 
46 #include "hw/pci/pci.h"
47 #include "hw/pci/msi.h"
48 #include "hw/pci/msix.h"
49 #include "migration/blocker.h"
50 #include "exec/memattrs.h"
51 #include "trace.h"
52 
/* Define DEBUG_KVM to route DPRINTF() output to stderr. */
//#define DEBUG_KVM

#if !defined(DEBUG_KVM)
/* Tracing off: DPRINTF() expands to an empty statement. */
#define DPRINTF(fmt, ...) \
    do { } while (0)
#else
/* Tracing on: DPRINTF() forwards its arguments to fprintf(stderr, ...). */
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#endif
62 
/*
 * APIC bus timing, mirrored from arch/x86/kvm/lapic.h: the bus frequency
 * is derived from the cycle length in nanoseconds.
 */
#define KVM_APIC_BUS_CYCLE_NS       1
#define KVM_APIC_BUS_FREQUENCY      (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)

/* MSR indices for the KVM wall clock and system time. */
#define MSR_KVM_WALL_CLOCK  0x11
#define MSR_KVM_SYSTEM_TIME 0x12

/*
 * Size of an MSR transfer buffer.  4096 bytes are enough for the 8-byte
 * kvm_msrs header followed by 255 kvm_msr_entry structs.
 */
#define MSR_BUF_SIZE 4096
73 
74 static void kvm_init_msrs(X86CPU *cpu);
75 
76 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
77     KVM_CAP_INFO(SET_TSS_ADDR),
78     KVM_CAP_INFO(EXT_CPUID),
79     KVM_CAP_INFO(MP_STATE),
80     KVM_CAP_LAST_INFO
81 };
82 
/*
 * File-scope host capability state.  All of it is zero-initialised here;
 * the code that fills it in lives outside this part of the file.
 * NOTE(review): the has_msr_* flags presumably record whether the host
 * kernel exposes the corresponding MSR -- confirm against the probing code.
 */

/* General x86 MSR availability flags. */
static bool has_msr_star;
static bool has_msr_hsave_pa;
static bool has_msr_tsc_aux;
static bool has_msr_tsc_adjust;
static bool has_msr_tsc_deadline;
static bool has_msr_feature_control;
static bool has_msr_misc_enable;
static bool has_msr_smbase;
static bool has_msr_bndcfgs;
static bool has_msr_xss;
static bool has_msr_umwait;
static bool has_msr_spec_ctrl;
static bool has_msr_tsx_ctrl;
static bool has_msr_virt_ssbd;
static bool has_msr_smi_count;
static bool has_msr_arch_capabs;
static bool has_msr_core_capabs;
static bool has_msr_vmx_vmfunc;
static bool has_msr_ucode_rev;
static bool has_msr_vmx_procbased_ctls2;
static bool has_msr_perf_capabs;
static bool has_msr_mcg_ext_ctl;

/* Hyper-V MSR availability flags. */
static bool has_msr_hv_hypercall;
static bool has_msr_hv_crash;
static bool has_msr_hv_reset;
static bool has_msr_hv_vpindex;
static bool hv_vpindex_settable;
static bool has_msr_hv_runtime;
static bool has_msr_hv_synic;
static bool has_msr_hv_stimer;
static bool has_msr_hv_frequencies;
static bool has_msr_hv_reenlightenment;

/* Architectural PMU description. */
static uint32_t has_architectural_pmu_version;
static uint32_t num_architectural_pmu_gp_counters;
static uint32_t num_architectural_pmu_fixed_counters;

/* Integer-valued capability state. */
static int lm_capable_kernel;
static int has_xsave;
static int has_xcrs;
static int has_pit_state2;
static int has_exception_payload;

/* Cached results of KVM queries. */
static struct kvm_cpuid2 *cpuid_cache;
static struct kvm_msr_list *kvm_feature_msrs;
129 
130 int kvm_has_pit_state2(void)
131 {
132     return has_pit_state2;
133 }
134 
135 bool kvm_has_smm(void)
136 {
137     return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
138 }
139 
140 bool kvm_has_adjust_clock_stable(void)
141 {
142     int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
143 
144     return (ret == KVM_CLOCK_TSC_STABLE);
145 }
146 
147 bool kvm_has_adjust_clock(void)
148 {
149     return kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
150 }
151 
152 bool kvm_has_exception_payload(void)
153 {
154     return has_exception_payload;
155 }
156 
157 static bool kvm_x2apic_api_set_flags(uint64_t flags)
158 {
159     KVMState *s = KVM_STATE(current_accel());
160 
161     return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
162 }
163 
/*
 * Evaluate 'fn' at most once per expansion site and store its value in
 * '_result'.
 *
 * The first time control reaches the expansion, 'fn' is evaluated and
 * assigned to '_result'; that assignment is the value of the whole
 * expression.  On every later pass the macro executes 'return _result;',
 * i.e. it returns from the *enclosing function* without evaluating 'fn'
 * again.  It can therefore only be used inside a function whose return type
 * accepts '_result'.
 */
#define MEMORIZE(fn, _result) \
    ({ \
        static bool _already_run; \
        \
        if (_already_run) { \
            return _result; \
        } \
        _already_run = true; \
        _result = fn; \
    })
174 
/* Result stored by kvm_enable_x2apic(); false until that function has run. */
static bool has_x2apic_api;

/* Return the stored x2APIC API result without probing KVM. */
bool kvm_has_x2apic_api(void)
{
    const bool enabled = has_x2apic_api;

    return enabled;
}
181 
182 bool kvm_enable_x2apic(void)
183 {
184     return MEMORIZE(
185              kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
186                                       KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
187              has_x2apic_api);
188 }
189 
190 bool kvm_hv_vpindex_settable(void)
191 {
192     return hv_vpindex_settable;
193 }
194 
195 static int kvm_get_tsc(CPUState *cs)
196 {
197     X86CPU *cpu = X86_CPU(cs);
198     CPUX86State *env = &cpu->env;
199     struct {
200         struct kvm_msrs info;
201         struct kvm_msr_entry entries[1];
202     } msr_data = {};
203     int ret;
204 
205     if (env->tsc_valid) {
206         return 0;
207     }
208 
209     memset(&msr_data, 0, sizeof(msr_data));
210     msr_data.info.nmsrs = 1;
211     msr_data.entries[0].index = MSR_IA32_TSC;
212     env->tsc_valid = !runstate_is_running();
213 
214     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
215     if (ret < 0) {
216         return ret;
217     }
218 
219     assert(ret == 1);
220     env->tsc = msr_data.entries[0].data;
221     return 0;
222 }
223 
224 static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
225 {
226     kvm_get_tsc(cpu);
227 }
228 
229 void kvm_synchronize_all_tsc(void)
230 {
231     CPUState *cpu;
232 
233     if (kvm_enabled()) {
234         CPU_FOREACH(cpu) {
235             run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
236         }
237     }
238 }
239 
240 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
241 {
242     struct kvm_cpuid2 *cpuid;
243     int r, size;
244 
245     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
246     cpuid = g_malloc0(size);
247     cpuid->nent = max;
248     r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
249     if (r == 0 && cpuid->nent >= max) {
250         r = -E2BIG;
251     }
252     if (r < 0) {
253         if (r == -E2BIG) {
254             g_free(cpuid);
255             return NULL;
256         } else {
257             fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
258                     strerror(-r));
259             exit(1);
260         }
261     }
262     return cpuid;
263 }
264 
265 /* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
266  * for all entries.
267  */
268 static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
269 {
270     struct kvm_cpuid2 *cpuid;
271     int max = 1;
272 
273     if (cpuid_cache != NULL) {
274         return cpuid_cache;
275     }
276     while ((cpuid = try_get_cpuid(s, max)) == NULL) {
277         max *= 2;
278     }
279     cpuid_cache = cpuid;
280     return cpuid;
281 }
282 
283 static bool host_tsx_broken(void)
284 {
285     int family, model, stepping;\
286     char vendor[CPUID_VENDOR_SZ + 1];
287 
288     host_vendor_fms(vendor, &family, &model, &stepping);
289 
290     /* Check if we are running on a Haswell host known to have broken TSX */
291     return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
292            (family == 6) &&
293            ((model == 63 && stepping < 4) ||
294             model == 60 || model == 69 || model == 70);
295 }
296 
297 /* Returns the value for a specific register on the cpuid entry
298  */
299 static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
300 {
301     uint32_t ret = 0;
302     switch (reg) {
303     case R_EAX:
304         ret = entry->eax;
305         break;
306     case R_EBX:
307         ret = entry->ebx;
308         break;
309     case R_ECX:
310         ret = entry->ecx;
311         break;
312     case R_EDX:
313         ret = entry->edx;
314         break;
315     }
316     return ret;
317 }
318 
319 /* Find matching entry for function/index on kvm_cpuid2 struct
320  */
321 static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
322                                                  uint32_t function,
323                                                  uint32_t index)
324 {
325     int i;
326     for (i = 0; i < cpuid->nent; ++i) {
327         if (cpuid->entries[i].function == function &&
328             cpuid->entries[i].index == index) {
329             return &cpuid->entries[i];
330         }
331     }
332     /* not found: */
333     return NULL;
334 }
335 
336 uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
337                                       uint32_t index, int reg)
338 {
339     struct kvm_cpuid2 *cpuid;
340     uint32_t ret = 0;
341     uint32_t cpuid_1_edx;
342 
343     cpuid = get_supported_cpuid(s);
344 
345     struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
346     if (entry) {
347         ret = cpuid_entry_get_reg(entry, reg);
348     }
349 
350     /* Fixups for the data returned by KVM, below */
351 
352     if (function == 1 && reg == R_EDX) {
353         /* KVM before 2.6.30 misreports the following features */
354         ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
355     } else if (function == 1 && reg == R_ECX) {
356         /* We can set the hypervisor flag, even if KVM does not return it on
357          * GET_SUPPORTED_CPUID
358          */
359         ret |= CPUID_EXT_HYPERVISOR;
360         /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
361          * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
362          * and the irqchip is in the kernel.
363          */
364         if (kvm_irqchip_in_kernel() &&
365                 kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
366             ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
367         }
368 
369         /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
370          * without the in-kernel irqchip
371          */
372         if (!kvm_irqchip_in_kernel()) {
373             ret &= ~CPUID_EXT_X2APIC;
374         }
375 
376         if (enable_cpu_pm) {
377             int disable_exits = kvm_check_extension(s,
378                                                     KVM_CAP_X86_DISABLE_EXITS);
379 
380             if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
381                 ret |= CPUID_EXT_MONITOR;
382             }
383         }
384     } else if (function == 6 && reg == R_EAX) {
385         ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
386     } else if (function == 7 && index == 0 && reg == R_EBX) {
387         if (host_tsx_broken()) {
388             ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
389         }
390     } else if (function == 7 && index == 0 && reg == R_EDX) {
391         /*
392          * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
393          * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
394          * returned by KVM_GET_MSR_INDEX_LIST.
395          */
396         if (!has_msr_arch_capabs) {
397             ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
398         }
399     } else if (function == 0x80000001 && reg == R_ECX) {
400         /*
401          * It's safe to enable TOPOEXT even if it's not returned by
402          * GET_SUPPORTED_CPUID.  Unconditionally enabling TOPOEXT here allows
403          * us to keep CPU models including TOPOEXT runnable on older kernels.
404          */
405         ret |= CPUID_EXT3_TOPOEXT;
406     } else if (function == 0x80000001 && reg == R_EDX) {
407         /* On Intel, kvm returns cpuid according to the Intel spec,
408          * so add missing bits according to the AMD spec:
409          */
410         cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
411         ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
412     } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
413         /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
414          * be enabled without the in-kernel irqchip
415          */
416         if (!kvm_irqchip_in_kernel()) {
417             ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
418         }
419         if (kvm_irqchip_is_split()) {
420             ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID;
421         }
422     } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
423         ret |= 1U << KVM_HINTS_REALTIME;
424     }
425 
426     return ret;
427 }
428 
429 uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
430 {
431     struct {
432         struct kvm_msrs info;
433         struct kvm_msr_entry entries[1];
434     } msr_data = {};
435     uint64_t value;
436     uint32_t ret, can_be_one, must_be_one;
437 
438     if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */
439         return 0;
440     }
441 
442     /* Check if requested MSR is supported feature MSR */
443     int i;
444     for (i = 0; i < kvm_feature_msrs->nmsrs; i++)
445         if (kvm_feature_msrs->indices[i] == index) {
446             break;
447         }
448     if (i == kvm_feature_msrs->nmsrs) {
449         return 0; /* if the feature MSR is not supported, simply return 0 */
450     }
451 
452     msr_data.info.nmsrs = 1;
453     msr_data.entries[0].index = index;
454 
455     ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data);
456     if (ret != 1) {
457         error_report("KVM get MSR (index=0x%x) feature failed, %s",
458             index, strerror(-ret));
459         exit(1);
460     }
461 
462     value = msr_data.entries[0].data;
463     switch (index) {
464     case MSR_IA32_VMX_PROCBASED_CTLS2:
465         if (!has_msr_vmx_procbased_ctls2) {
466             /* KVM forgot to add these bits for some time, do this ourselves. */
467             if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) &
468                 CPUID_XSAVE_XSAVES) {
469                 value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32;
470             }
471             if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) &
472                 CPUID_EXT_RDRAND) {
473                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32;
474             }
475             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
476                 CPUID_7_0_EBX_INVPCID) {
477                 value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32;
478             }
479             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
480                 CPUID_7_0_EBX_RDSEED) {
481                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32;
482             }
483             if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) &
484                 CPUID_EXT2_RDTSCP) {
485                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32;
486             }
487         }
488         /* fall through */
489     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
490     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
491     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
492     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
493         /*
494          * Return true for bits that can be one, but do not have to be one.
495          * The SDM tells us which bits could have a "must be one" setting,
496          * so we can do the opposite transformation in make_vmx_msr_value.
497          */
498         must_be_one = (uint32_t)value;
499         can_be_one = (uint32_t)(value >> 32);
500         return can_be_one & ~must_be_one;
501 
502     default:
503         return value;
504     }
505 }
506 
507 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
508                                      int *max_banks)
509 {
510     int r;
511 
512     r = kvm_check_extension(s, KVM_CAP_MCE);
513     if (r > 0) {
514         *max_banks = r;
515         return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
516     }
517     return -ENOSYS;
518 }
519 
520 static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
521 {
522     CPUState *cs = CPU(cpu);
523     CPUX86State *env = &cpu->env;
524     uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
525                       MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
526     uint64_t mcg_status = MCG_STATUS_MCIP;
527     int flags = 0;
528 
529     if (code == BUS_MCEERR_AR) {
530         status |= MCI_STATUS_AR | 0x134;
531         mcg_status |= MCG_STATUS_EIPV;
532     } else {
533         status |= 0xc0;
534         mcg_status |= MCG_STATUS_RIPV;
535     }
536 
537     flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
538     /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the
539      * guest kernel back into env->mcg_ext_ctl.
540      */
541     cpu_synchronize_state(cs);
542     if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
543         mcg_status |= MCG_STATUS_LMCE;
544         flags = 0;
545     }
546 
547     cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
548                        (MCM_ADDR_PHYS << 6) | 0xc, flags);
549 }
550 
551 static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar)
552 {
553     MemoryFailureFlags mff = {.action_required = ar, .recursive = false};
554 
555     qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action,
556                                    &mff);
557 }
558 
559 static void hardware_memory_error(void *host_addr)
560 {
561     emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true);
562     error_report("QEMU got Hardware memory error at addr %p", host_addr);
563     exit(1);
564 }
565 
566 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
567 {
568     X86CPU *cpu = X86_CPU(c);
569     CPUX86State *env = &cpu->env;
570     ram_addr_t ram_addr;
571     hwaddr paddr;
572 
573     /* If we get an action required MCE, it has been injected by KVM
574      * while the VM was running.  An action optional MCE instead should
575      * be coming from the main thread, which qemu_init_sigbus identifies
576      * as the "early kill" thread.
577      */
578     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
579 
580     if ((env->mcg_cap & MCG_SER_P) && addr) {
581         ram_addr = qemu_ram_addr_from_host(addr);
582         if (ram_addr != RAM_ADDR_INVALID &&
583             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
584             kvm_hwpoison_page_add(ram_addr);
585             kvm_mce_inject(cpu, paddr, code);
586 
587             /*
588              * Use different logging severity based on error type.
589              * If there is additional MCE reporting on the hypervisor, QEMU VA
590              * could be another source to identify the PA and MCE details.
591              */
592             if (code == BUS_MCEERR_AR) {
593                 error_report("Guest MCE Memory Error at QEMU addr %p and "
594                     "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
595                     addr, paddr, "BUS_MCEERR_AR");
596             } else {
597                  warn_report("Guest MCE Memory Error at QEMU addr %p and "
598                      "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
599                      addr, paddr, "BUS_MCEERR_AO");
600             }
601 
602             return;
603         }
604 
605         if (code == BUS_MCEERR_AO) {
606             warn_report("Hardware memory error at addr %p of type %s "
607                 "for memory used by QEMU itself instead of guest system!",
608                  addr, "BUS_MCEERR_AO");
609         }
610     }
611 
612     if (code == BUS_MCEERR_AR) {
613         hardware_memory_error(addr);
614     }
615 
616     /* Hope we are lucky for AO MCE, just notify a event */
617     emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
618 }
619 
620 static void kvm_reset_exception(CPUX86State *env)
621 {
622     env->exception_nr = -1;
623     env->exception_pending = 0;
624     env->exception_injected = 0;
625     env->exception_has_payload = false;
626     env->exception_payload = 0;
627 }
628 
629 static void kvm_queue_exception(CPUX86State *env,
630                                 int32_t exception_nr,
631                                 uint8_t exception_has_payload,
632                                 uint64_t exception_payload)
633 {
634     assert(env->exception_nr == -1);
635     assert(!env->exception_pending);
636     assert(!env->exception_injected);
637     assert(!env->exception_has_payload);
638 
639     env->exception_nr = exception_nr;
640 
641     if (has_exception_payload) {
642         env->exception_pending = 1;
643 
644         env->exception_has_payload = exception_has_payload;
645         env->exception_payload = exception_payload;
646     } else {
647         env->exception_injected = 1;
648 
649         if (exception_nr == EXCP01_DB) {
650             assert(exception_has_payload);
651             env->dr[6] = exception_payload;
652         } else if (exception_nr == EXCP0E_PAGE) {
653             assert(exception_has_payload);
654             env->cr[2] = exception_payload;
655         } else {
656             assert(!exception_has_payload);
657         }
658     }
659 }
660 
661 static int kvm_inject_mce_oldstyle(X86CPU *cpu)
662 {
663     CPUX86State *env = &cpu->env;
664 
665     if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
666         unsigned int bank, bank_num = env->mcg_cap & 0xff;
667         struct kvm_x86_mce mce;
668 
669         kvm_reset_exception(env);
670 
671         /*
672          * There must be at least one bank in use if an MCE is pending.
673          * Find it and use its values for the event injection.
674          */
675         for (bank = 0; bank < bank_num; bank++) {
676             if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
677                 break;
678             }
679         }
680         assert(bank < bank_num);
681 
682         mce.bank = bank;
683         mce.status = env->mce_banks[bank * 4 + 1];
684         mce.mcg_status = env->mcg_status;
685         mce.addr = env->mce_banks[bank * 4 + 2];
686         mce.misc = env->mce_banks[bank * 4 + 3];
687 
688         return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
689     }
690     return 0;
691 }
692 
693 static void cpu_update_state(void *opaque, int running, RunState state)
694 {
695     CPUX86State *env = opaque;
696 
697     if (running) {
698         env->tsc_valid = false;
699     }
700 }
701 
702 unsigned long kvm_arch_vcpu_id(CPUState *cs)
703 {
704     X86CPU *cpu = X86_CPU(cs);
705     return cpu->apic_id;
706 }
707 
/* Fallback definition, used only when no header has provided one. */
#if !defined(KVM_CPUID_SIGNATURE_NEXT)
#define KVM_CPUID_SIGNATURE_NEXT                0x40000100
#endif
711 
712 static bool hyperv_enabled(X86CPU *cpu)
713 {
714     CPUState *cs = CPU(cpu);
715     return kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0 &&
716         ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) ||
717          cpu->hyperv_features || cpu->hyperv_passthrough);
718 }
719 
/*
 * Check whether target_freq is within conservative
 * ntp correctable bounds (250ppm) of freq.
 *
 * The bounds are inclusive.  The 250ppm margin is computed in 64-bit
 * arithmetic: in plain int, freq * 250 overflows (undefined behaviour)
 * once freq exceeds INT_MAX / 250, i.e. about 8.59 million.
 */
static inline bool freq_within_bounds(int freq, int target_freq)
{
    const int64_t margin = (int64_t)freq * 250 / 1000000;
    const int64_t max_freq = (int64_t)freq + margin;
    const int64_t min_freq = (int64_t)freq - margin;

    return target_freq >= min_freq && target_freq <= max_freq;
}
735 
736 static int kvm_arch_set_tsc_khz(CPUState *cs)
737 {
738     X86CPU *cpu = X86_CPU(cs);
739     CPUX86State *env = &cpu->env;
740     int r, cur_freq;
741     bool set_ioctl = false;
742 
743     if (!env->tsc_khz) {
744         return 0;
745     }
746 
747     cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
748                kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
749 
750     /*
751      * If TSC scaling is supported, attempt to set TSC frequency.
752      */
753     if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
754         set_ioctl = true;
755     }
756 
757     /*
758      * If desired TSC frequency is within bounds of NTP correction,
759      * attempt to set TSC frequency.
760      */
761     if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) {
762         set_ioctl = true;
763     }
764 
765     r = set_ioctl ?
766         kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
767         -ENOTSUP;
768 
769     if (r < 0) {
770         /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
771          * TSC frequency doesn't match the one we want.
772          */
773         cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
774                    kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
775                    -ENOTSUP;
776         if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
777             warn_report("TSC frequency mismatch between "
778                         "VM (%" PRId64 " kHz) and host (%d kHz), "
779                         "and TSC scaling unavailable",
780                         env->tsc_khz, cur_freq);
781             return r;
782         }
783     }
784 
785     return 0;
786 }
787 
788 static bool tsc_is_stable_and_known(CPUX86State *env)
789 {
790     if (!env->tsc_khz) {
791         return false;
792     }
793     return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
794         || env->user_tsc_khz;
795 }
796 
797 static struct {
798     const char *desc;
799     struct {
800         uint32_t fw;
801         uint32_t bits;
802     } flags[2];
803     uint64_t dependencies;
804 } kvm_hyperv_properties[] = {
805     [HYPERV_FEAT_RELAXED] = {
806         .desc = "relaxed timing (hv-relaxed)",
807         .flags = {
808             {.fw = FEAT_HYPERV_EAX,
809              .bits = HV_HYPERCALL_AVAILABLE},
810             {.fw = FEAT_HV_RECOMM_EAX,
811              .bits = HV_RELAXED_TIMING_RECOMMENDED}
812         }
813     },
814     [HYPERV_FEAT_VAPIC] = {
815         .desc = "virtual APIC (hv-vapic)",
816         .flags = {
817             {.fw = FEAT_HYPERV_EAX,
818              .bits = HV_HYPERCALL_AVAILABLE | HV_APIC_ACCESS_AVAILABLE},
819             {.fw = FEAT_HV_RECOMM_EAX,
820              .bits = HV_APIC_ACCESS_RECOMMENDED}
821         }
822     },
823     [HYPERV_FEAT_TIME] = {
824         .desc = "clocksources (hv-time)",
825         .flags = {
826             {.fw = FEAT_HYPERV_EAX,
827              .bits = HV_HYPERCALL_AVAILABLE | HV_TIME_REF_COUNT_AVAILABLE |
828              HV_REFERENCE_TSC_AVAILABLE}
829         }
830     },
831     [HYPERV_FEAT_CRASH] = {
832         .desc = "crash MSRs (hv-crash)",
833         .flags = {
834             {.fw = FEAT_HYPERV_EDX,
835              .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
836         }
837     },
838     [HYPERV_FEAT_RESET] = {
839         .desc = "reset MSR (hv-reset)",
840         .flags = {
841             {.fw = FEAT_HYPERV_EAX,
842              .bits = HV_RESET_AVAILABLE}
843         }
844     },
845     [HYPERV_FEAT_VPINDEX] = {
846         .desc = "VP_INDEX MSR (hv-vpindex)",
847         .flags = {
848             {.fw = FEAT_HYPERV_EAX,
849              .bits = HV_VP_INDEX_AVAILABLE}
850         }
851     },
852     [HYPERV_FEAT_RUNTIME] = {
853         .desc = "VP_RUNTIME MSR (hv-runtime)",
854         .flags = {
855             {.fw = FEAT_HYPERV_EAX,
856              .bits = HV_VP_RUNTIME_AVAILABLE}
857         }
858     },
859     [HYPERV_FEAT_SYNIC] = {
860         .desc = "synthetic interrupt controller (hv-synic)",
861         .flags = {
862             {.fw = FEAT_HYPERV_EAX,
863              .bits = HV_SYNIC_AVAILABLE}
864         }
865     },
866     [HYPERV_FEAT_STIMER] = {
867         .desc = "synthetic timers (hv-stimer)",
868         .flags = {
869             {.fw = FEAT_HYPERV_EAX,
870              .bits = HV_SYNTIMERS_AVAILABLE}
871         },
872         .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
873     },
874     [HYPERV_FEAT_FREQUENCIES] = {
875         .desc = "frequency MSRs (hv-frequencies)",
876         .flags = {
877             {.fw = FEAT_HYPERV_EAX,
878              .bits = HV_ACCESS_FREQUENCY_MSRS},
879             {.fw = FEAT_HYPERV_EDX,
880              .bits = HV_FREQUENCY_MSRS_AVAILABLE}
881         }
882     },
883     [HYPERV_FEAT_REENLIGHTENMENT] = {
884         .desc = "reenlightenment MSRs (hv-reenlightenment)",
885         .flags = {
886             {.fw = FEAT_HYPERV_EAX,
887              .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
888         }
889     },
890     [HYPERV_FEAT_TLBFLUSH] = {
891         .desc = "paravirtualized TLB flush (hv-tlbflush)",
892         .flags = {
893             {.fw = FEAT_HV_RECOMM_EAX,
894              .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
895              HV_EX_PROCESSOR_MASKS_RECOMMENDED}
896         },
897         .dependencies = BIT(HYPERV_FEAT_VPINDEX)
898     },
899     [HYPERV_FEAT_EVMCS] = {
900         .desc = "enlightened VMCS (hv-evmcs)",
901         .flags = {
902             {.fw = FEAT_HV_RECOMM_EAX,
903              .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
904         },
905         .dependencies = BIT(HYPERV_FEAT_VAPIC)
906     },
907     [HYPERV_FEAT_IPI] = {
908         .desc = "paravirtualized IPI (hv-ipi)",
909         .flags = {
910             {.fw = FEAT_HV_RECOMM_EAX,
911              .bits = HV_CLUSTER_IPI_RECOMMENDED |
912              HV_EX_PROCESSOR_MASKS_RECOMMENDED}
913         },
914         .dependencies = BIT(HYPERV_FEAT_VPINDEX)
915     },
916     [HYPERV_FEAT_STIMER_DIRECT] = {
917         .desc = "direct mode synthetic timers (hv-stimer-direct)",
918         .flags = {
919             {.fw = FEAT_HYPERV_EDX,
920              .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
921         },
922         .dependencies = BIT(HYPERV_FEAT_STIMER)
923     },
924 };
925 
926 static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max)
927 {
928     struct kvm_cpuid2 *cpuid;
929     int r, size;
930 
931     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
932     cpuid = g_malloc0(size);
933     cpuid->nent = max;
934 
935     r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
936     if (r == 0 && cpuid->nent >= max) {
937         r = -E2BIG;
938     }
939     if (r < 0) {
940         if (r == -E2BIG) {
941             g_free(cpuid);
942             return NULL;
943         } else {
944             fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n",
945                     strerror(-r));
946             exit(1);
947         }
948     }
949     return cpuid;
950 }
951 
952 /*
953  * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough
954  * for all entries.
955  */
956 static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs)
957 {
958     struct kvm_cpuid2 *cpuid;
959     int max = 7; /* 0x40000000..0x40000005, 0x4000000A */
960 
961     /*
962      * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with
963      * -E2BIG, however, it doesn't report back the right size. Keep increasing
964      * it and re-trying until we succeed.
965      */
966     while ((cpuid = try_get_hv_cpuid(cs, max)) == NULL) {
967         max++;
968     }
969     return cpuid;
970 }
971 
972 /*
973  * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature
974  * leaves from KVM_CAP_HYPERV* and present MSRs data.
975  */
976 static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs)
977 {
978     X86CPU *cpu = X86_CPU(cs);
979     struct kvm_cpuid2 *cpuid;
980     struct kvm_cpuid_entry2 *entry_feat, *entry_recomm;
981 
982     /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */
983     cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries));
984     cpuid->nent = 2;
985 
986     /* HV_CPUID_VENDOR_AND_MAX_FUNCTIONS */
987     entry_feat = &cpuid->entries[0];
988     entry_feat->function = HV_CPUID_FEATURES;
989 
990     entry_recomm = &cpuid->entries[1];
991     entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO;
992     entry_recomm->ebx = cpu->hyperv_spinlock_attempts;
993 
994     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) {
995         entry_feat->eax |= HV_HYPERCALL_AVAILABLE;
996         entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE;
997         entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
998         entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED;
999         entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED;
1000     }
1001 
1002     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
1003         entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE;
1004         entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE;
1005     }
1006 
1007     if (has_msr_hv_frequencies) {
1008         entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
1009         entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE;
1010     }
1011 
1012     if (has_msr_hv_crash) {
1013         entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE;
1014     }
1015 
1016     if (has_msr_hv_reenlightenment) {
1017         entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
1018     }
1019 
1020     if (has_msr_hv_reset) {
1021         entry_feat->eax |= HV_RESET_AVAILABLE;
1022     }
1023 
1024     if (has_msr_hv_vpindex) {
1025         entry_feat->eax |= HV_VP_INDEX_AVAILABLE;
1026     }
1027 
1028     if (has_msr_hv_runtime) {
1029         entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE;
1030     }
1031 
1032     if (has_msr_hv_synic) {
1033         unsigned int cap = cpu->hyperv_synic_kvm_only ?
1034             KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1035 
1036         if (kvm_check_extension(cs->kvm_state, cap) > 0) {
1037             entry_feat->eax |= HV_SYNIC_AVAILABLE;
1038         }
1039     }
1040 
1041     if (has_msr_hv_stimer) {
1042         entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
1043     }
1044 
1045     if (kvm_check_extension(cs->kvm_state,
1046                             KVM_CAP_HYPERV_TLBFLUSH) > 0) {
1047         entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
1048         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1049     }
1050 
1051     if (kvm_check_extension(cs->kvm_state,
1052                             KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1053         entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1054     }
1055 
1056     if (kvm_check_extension(cs->kvm_state,
1057                             KVM_CAP_HYPERV_SEND_IPI) > 0) {
1058         entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
1059         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1060     }
1061 
1062     return cpuid;
1063 }
1064 
1065 static int hv_cpuid_get_fw(struct kvm_cpuid2 *cpuid, int fw, uint32_t *r)
1066 {
1067     struct kvm_cpuid_entry2 *entry;
1068     uint32_t func;
1069     int reg;
1070 
1071     switch (fw) {
1072     case FEAT_HYPERV_EAX:
1073         reg = R_EAX;
1074         func = HV_CPUID_FEATURES;
1075         break;
1076     case FEAT_HYPERV_EDX:
1077         reg = R_EDX;
1078         func = HV_CPUID_FEATURES;
1079         break;
1080     case FEAT_HV_RECOMM_EAX:
1081         reg = R_EAX;
1082         func = HV_CPUID_ENLIGHTMENT_INFO;
1083         break;
1084     default:
1085         return -EINVAL;
1086     }
1087 
1088     entry = cpuid_find_entry(cpuid, func, 0);
1089     if (!entry) {
1090         return -ENOENT;
1091     }
1092 
1093     switch (reg) {
1094     case R_EAX:
1095         *r = entry->eax;
1096         break;
1097     case R_EDX:
1098         *r = entry->edx;
1099         break;
1100     default:
1101         return -EINVAL;
1102     }
1103 
1104     return 0;
1105 }
1106 
1107 static int hv_cpuid_check_and_set(CPUState *cs, struct kvm_cpuid2 *cpuid,
1108                                   int feature)
1109 {
1110     X86CPU *cpu = X86_CPU(cs);
1111     CPUX86State *env = &cpu->env;
1112     uint32_t r, fw, bits;
1113     uint64_t deps;
1114     int i, dep_feat;
1115 
1116     if (!hyperv_feat_enabled(cpu, feature) && !cpu->hyperv_passthrough) {
1117         return 0;
1118     }
1119 
1120     deps = kvm_hyperv_properties[feature].dependencies;
1121     while (deps) {
1122         dep_feat = ctz64(deps);
1123         if (!(hyperv_feat_enabled(cpu, dep_feat))) {
1124                 fprintf(stderr,
1125                         "Hyper-V %s requires Hyper-V %s\n",
1126                         kvm_hyperv_properties[feature].desc,
1127                         kvm_hyperv_properties[dep_feat].desc);
1128                 return 1;
1129         }
1130         deps &= ~(1ull << dep_feat);
1131     }
1132 
1133     for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
1134         fw = kvm_hyperv_properties[feature].flags[i].fw;
1135         bits = kvm_hyperv_properties[feature].flags[i].bits;
1136 
1137         if (!fw) {
1138             continue;
1139         }
1140 
1141         if (hv_cpuid_get_fw(cpuid, fw, &r) || (r & bits) != bits) {
1142             if (hyperv_feat_enabled(cpu, feature)) {
1143                 fprintf(stderr,
1144                         "Hyper-V %s is not supported by kernel\n",
1145                         kvm_hyperv_properties[feature].desc);
1146                 return 1;
1147             } else {
1148                 return 0;
1149             }
1150         }
1151 
1152         env->features[fw] |= bits;
1153     }
1154 
1155     if (cpu->hyperv_passthrough) {
1156         cpu->hyperv_features |= BIT(feature);
1157     }
1158 
1159     return 0;
1160 }
1161 
1162 /*
1163  * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent in
1164  * case of success, errno < 0 in case of failure and 0 when no Hyper-V
1165  * extentions are enabled.
1166  */
1167 static int hyperv_handle_properties(CPUState *cs,
1168                                     struct kvm_cpuid_entry2 *cpuid_ent)
1169 {
1170     X86CPU *cpu = X86_CPU(cs);
1171     CPUX86State *env = &cpu->env;
1172     struct kvm_cpuid2 *cpuid;
1173     struct kvm_cpuid_entry2 *c;
1174     uint32_t cpuid_i = 0;
1175     int r;
1176 
1177     if (!hyperv_enabled(cpu))
1178         return 0;
1179 
1180     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ||
1181         cpu->hyperv_passthrough) {
1182         uint16_t evmcs_version;
1183 
1184         r = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
1185                                 (uintptr_t)&evmcs_version);
1186 
1187         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) && r) {
1188             fprintf(stderr, "Hyper-V %s is not supported by kernel\n",
1189                     kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
1190             return -ENOSYS;
1191         }
1192 
1193         if (!r) {
1194             env->features[FEAT_HV_RECOMM_EAX] |=
1195                 HV_ENLIGHTENED_VMCS_RECOMMENDED;
1196             env->features[FEAT_HV_NESTED_EAX] = evmcs_version;
1197         }
1198     }
1199 
1200     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
1201         cpuid = get_supported_hv_cpuid(cs);
1202     } else {
1203         cpuid = get_supported_hv_cpuid_legacy(cs);
1204     }
1205 
1206     if (cpu->hyperv_passthrough) {
1207         memcpy(cpuid_ent, &cpuid->entries[0],
1208                cpuid->nent * sizeof(cpuid->entries[0]));
1209 
1210         c = cpuid_find_entry(cpuid, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, 0);
1211         if (c) {
1212             cpu->hyperv_vendor_id[0] = c->ebx;
1213             cpu->hyperv_vendor_id[1] = c->ecx;
1214             cpu->hyperv_vendor_id[2] = c->edx;
1215         }
1216 
1217         c = cpuid_find_entry(cpuid, HV_CPUID_INTERFACE, 0);
1218         if (c) {
1219             cpu->hyperv_interface_id[0] = c->eax;
1220             cpu->hyperv_interface_id[1] = c->ebx;
1221             cpu->hyperv_interface_id[2] = c->ecx;
1222             cpu->hyperv_interface_id[3] = c->edx;
1223         }
1224 
1225         c = cpuid_find_entry(cpuid, HV_CPUID_VERSION, 0);
1226         if (c) {
1227             cpu->hyperv_version_id[0] = c->eax;
1228             cpu->hyperv_version_id[1] = c->ebx;
1229             cpu->hyperv_version_id[2] = c->ecx;
1230             cpu->hyperv_version_id[3] = c->edx;
1231         }
1232 
1233         c = cpuid_find_entry(cpuid, HV_CPUID_FEATURES, 0);
1234         if (c) {
1235             env->features[FEAT_HYPERV_EAX] = c->eax;
1236             env->features[FEAT_HYPERV_EBX] = c->ebx;
1237             env->features[FEAT_HYPERV_EDX] = c->edx;
1238         }
1239 
1240         c = cpuid_find_entry(cpuid, HV_CPUID_IMPLEMENT_LIMITS, 0);
1241         if (c) {
1242             cpu->hv_max_vps = c->eax;
1243             cpu->hyperv_limits[0] = c->ebx;
1244             cpu->hyperv_limits[1] = c->ecx;
1245             cpu->hyperv_limits[2] = c->edx;
1246         }
1247 
1248         c = cpuid_find_entry(cpuid, HV_CPUID_ENLIGHTMENT_INFO, 0);
1249         if (c) {
1250             env->features[FEAT_HV_RECOMM_EAX] = c->eax;
1251 
1252             /* hv-spinlocks may have been overriden */
1253             if (cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) {
1254                 c->ebx = cpu->hyperv_spinlock_attempts;
1255             }
1256         }
1257         c = cpuid_find_entry(cpuid, HV_CPUID_NESTED_FEATURES, 0);
1258         if (c) {
1259             env->features[FEAT_HV_NESTED_EAX] = c->eax;
1260         }
1261     }
1262 
1263     if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) {
1264         env->features[FEAT_HV_RECOMM_EAX] |= HV_NO_NONARCH_CORESHARING;
1265     } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) {
1266         c = cpuid_find_entry(cpuid, HV_CPUID_ENLIGHTMENT_INFO, 0);
1267         if (c) {
1268             env->features[FEAT_HV_RECOMM_EAX] |=
1269                 c->eax & HV_NO_NONARCH_CORESHARING;
1270         }
1271     }
1272 
1273     /* Features */
1274     r = hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_RELAXED);
1275     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_VAPIC);
1276     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_TIME);
1277     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_CRASH);
1278     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_RESET);
1279     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_VPINDEX);
1280     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_RUNTIME);
1281     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_SYNIC);
1282     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_STIMER);
1283     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_FREQUENCIES);
1284     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_REENLIGHTENMENT);
1285     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_TLBFLUSH);
1286     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_EVMCS);
1287     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_IPI);
1288     r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_STIMER_DIRECT);
1289 
1290     /* Additional dependencies not covered by kvm_hyperv_properties[] */
1291     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1292         !cpu->hyperv_synic_kvm_only &&
1293         !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
1294         fprintf(stderr, "Hyper-V %s requires Hyper-V %s\n",
1295                 kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
1296                 kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
1297         r |= 1;
1298     }
1299 
1300     /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
1301     env->features[FEAT_HYPERV_EDX] |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1302 
1303     if (r) {
1304         r = -ENOSYS;
1305         goto free;
1306     }
1307 
1308     if (cpu->hyperv_passthrough) {
1309         /* We already copied all feature words from KVM as is */
1310         r = cpuid->nent;
1311         goto free;
1312     }
1313 
1314     c = &cpuid_ent[cpuid_i++];
1315     c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
1316     c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
1317         HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
1318     c->ebx = cpu->hyperv_vendor_id[0];
1319     c->ecx = cpu->hyperv_vendor_id[1];
1320     c->edx = cpu->hyperv_vendor_id[2];
1321 
1322     c = &cpuid_ent[cpuid_i++];
1323     c->function = HV_CPUID_INTERFACE;
1324     c->eax = cpu->hyperv_interface_id[0];
1325     c->ebx = cpu->hyperv_interface_id[1];
1326     c->ecx = cpu->hyperv_interface_id[2];
1327     c->edx = cpu->hyperv_interface_id[3];
1328 
1329     c = &cpuid_ent[cpuid_i++];
1330     c->function = HV_CPUID_VERSION;
1331     c->eax = cpu->hyperv_version_id[0];
1332     c->ebx = cpu->hyperv_version_id[1];
1333     c->ecx = cpu->hyperv_version_id[2];
1334     c->edx = cpu->hyperv_version_id[3];
1335 
1336     c = &cpuid_ent[cpuid_i++];
1337     c->function = HV_CPUID_FEATURES;
1338     c->eax = env->features[FEAT_HYPERV_EAX];
1339     c->ebx = env->features[FEAT_HYPERV_EBX];
1340     c->edx = env->features[FEAT_HYPERV_EDX];
1341 
1342     c = &cpuid_ent[cpuid_i++];
1343     c->function = HV_CPUID_ENLIGHTMENT_INFO;
1344     c->eax = env->features[FEAT_HV_RECOMM_EAX];
1345     c->ebx = cpu->hyperv_spinlock_attempts;
1346 
1347     c = &cpuid_ent[cpuid_i++];
1348     c->function = HV_CPUID_IMPLEMENT_LIMITS;
1349     c->eax = cpu->hv_max_vps;
1350     c->ebx = cpu->hyperv_limits[0];
1351     c->ecx = cpu->hyperv_limits[1];
1352     c->edx = cpu->hyperv_limits[2];
1353 
1354     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1355         __u32 function;
1356 
1357         /* Create zeroed 0x40000006..0x40000009 leaves */
1358         for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
1359              function < HV_CPUID_NESTED_FEATURES; function++) {
1360             c = &cpuid_ent[cpuid_i++];
1361             c->function = function;
1362         }
1363 
1364         c = &cpuid_ent[cpuid_i++];
1365         c->function = HV_CPUID_NESTED_FEATURES;
1366         c->eax = env->features[FEAT_HV_NESTED_EAX];
1367     }
1368     r = cpuid_i;
1369 
1370 free:
1371     g_free(cpuid);
1372 
1373     return r;
1374 }
1375 
1376 static Error *hv_passthrough_mig_blocker;
1377 static Error *hv_no_nonarch_cs_mig_blocker;
1378 
1379 static int hyperv_init_vcpu(X86CPU *cpu)
1380 {
1381     CPUState *cs = CPU(cpu);
1382     Error *local_err = NULL;
1383     int ret;
1384 
1385     if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
1386         error_setg(&hv_passthrough_mig_blocker,
1387                    "'hv-passthrough' CPU flag prevents migration, use explicit"
1388                    " set of hv-* flags instead");
1389         ret = migrate_add_blocker(hv_passthrough_mig_blocker, &local_err);
1390         if (local_err) {
1391             error_report_err(local_err);
1392             error_free(hv_passthrough_mig_blocker);
1393             return ret;
1394         }
1395     }
1396 
1397     if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO &&
1398         hv_no_nonarch_cs_mig_blocker == NULL) {
1399         error_setg(&hv_no_nonarch_cs_mig_blocker,
1400                    "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration"
1401                    " use explicit 'hv-no-nonarch-coresharing=on' instead (but"
1402                    " make sure SMT is disabled and/or that vCPUs are properly"
1403                    " pinned)");
1404         ret = migrate_add_blocker(hv_no_nonarch_cs_mig_blocker, &local_err);
1405         if (local_err) {
1406             error_report_err(local_err);
1407             error_free(hv_no_nonarch_cs_mig_blocker);
1408             return ret;
1409         }
1410     }
1411 
1412     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
1413         /*
1414          * the kernel doesn't support setting vp_index; assert that its value
1415          * is in sync
1416          */
1417         struct {
1418             struct kvm_msrs info;
1419             struct kvm_msr_entry entries[1];
1420         } msr_data = {
1421             .info.nmsrs = 1,
1422             .entries[0].index = HV_X64_MSR_VP_INDEX,
1423         };
1424 
1425         ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
1426         if (ret < 0) {
1427             return ret;
1428         }
1429         assert(ret == 1);
1430 
1431         if (msr_data.entries[0].data != hyperv_vp_index(CPU(cpu))) {
1432             error_report("kernel's vp_index != QEMU's vp_index");
1433             return -ENXIO;
1434         }
1435     }
1436 
1437     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
1438         uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
1439             KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1440         ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
1441         if (ret < 0) {
1442             error_report("failed to turn on HyperV SynIC in KVM: %s",
1443                          strerror(-ret));
1444             return ret;
1445         }
1446 
1447         if (!cpu->hyperv_synic_kvm_only) {
1448             ret = hyperv_x86_synic_add(cpu);
1449             if (ret < 0) {
1450                 error_report("failed to create HyperV SynIC: %s",
1451                              strerror(-ret));
1452                 return ret;
1453             }
1454         }
1455     }
1456 
1457     return 0;
1458 }
1459 
1460 static Error *invtsc_mig_blocker;
1461 
1462 #define KVM_MAX_CPUID_ENTRIES  100
1463 
1464 int kvm_arch_init_vcpu(CPUState *cs)
1465 {
1466     struct {
1467         struct kvm_cpuid2 cpuid;
1468         struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
1469     } cpuid_data;
1470     /*
1471      * The kernel defines these structs with padding fields so there
1472      * should be no extra padding in our cpuid_data struct.
1473      */
1474     QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
1475                       sizeof(struct kvm_cpuid2) +
1476                       sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
1477 
1478     X86CPU *cpu = X86_CPU(cs);
1479     CPUX86State *env = &cpu->env;
1480     uint32_t limit, i, j, cpuid_i;
1481     uint32_t unused;
1482     struct kvm_cpuid_entry2 *c;
1483     uint32_t signature[3];
1484     int kvm_base = KVM_CPUID_SIGNATURE;
1485     int max_nested_state_len;
1486     int r;
1487     Error *local_err = NULL;
1488 
1489     memset(&cpuid_data, 0, sizeof(cpuid_data));
1490 
1491     cpuid_i = 0;
1492 
1493     r = kvm_arch_set_tsc_khz(cs);
1494     if (r < 0) {
1495         return r;
1496     }
1497 
1498     /* vcpu's TSC frequency is either specified by user, or following
1499      * the value used by KVM if the former is not present. In the
1500      * latter case, we query it from KVM and record in env->tsc_khz,
1501      * so that vcpu's TSC frequency can be migrated later via this field.
1502      */
1503     if (!env->tsc_khz) {
1504         r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
1505             kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
1506             -ENOTSUP;
1507         if (r > 0) {
1508             env->tsc_khz = r;
1509         }
1510     }
1511 
1512     env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
1513 
1514     /* Paravirtualization CPUIDs */
1515     r = hyperv_handle_properties(cs, cpuid_data.entries);
1516     if (r < 0) {
1517         return r;
1518     } else if (r > 0) {
1519         cpuid_i = r;
1520         kvm_base = KVM_CPUID_SIGNATURE_NEXT;
1521         has_msr_hv_hypercall = true;
1522     }
1523 
1524     if (cpu->expose_kvm) {
1525         memcpy(signature, "KVMKVMKVM\0\0\0", 12);
1526         c = &cpuid_data.entries[cpuid_i++];
1527         c->function = KVM_CPUID_SIGNATURE | kvm_base;
1528         c->eax = KVM_CPUID_FEATURES | kvm_base;
1529         c->ebx = signature[0];
1530         c->ecx = signature[1];
1531         c->edx = signature[2];
1532 
1533         c = &cpuid_data.entries[cpuid_i++];
1534         c->function = KVM_CPUID_FEATURES | kvm_base;
1535         c->eax = env->features[FEAT_KVM];
1536         c->edx = env->features[FEAT_KVM_HINTS];
1537     }
1538 
1539     cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
1540 
1541     for (i = 0; i <= limit; i++) {
1542         if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1543             fprintf(stderr, "unsupported level value: 0x%x\n", limit);
1544             abort();
1545         }
1546         c = &cpuid_data.entries[cpuid_i++];
1547 
1548         switch (i) {
1549         case 2: {
1550             /* Keep reading function 2 till all the input is received */
1551             int times;
1552 
1553             c->function = i;
1554             c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
1555                        KVM_CPUID_FLAG_STATE_READ_NEXT;
1556             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1557             times = c->eax & 0xff;
1558 
1559             for (j = 1; j < times; ++j) {
1560                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1561                     fprintf(stderr, "cpuid_data is full, no space for "
1562                             "cpuid(eax:2):eax & 0xf = 0x%x\n", times);
1563                     abort();
1564                 }
1565                 c = &cpuid_data.entries[cpuid_i++];
1566                 c->function = i;
1567                 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
1568                 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1569             }
1570             break;
1571         }
1572         case 0x1f:
1573             if (env->nr_dies < 2) {
1574                 break;
1575             }
1576             /* fallthrough */
1577         case 4:
1578         case 0xb:
1579         case 0xd:
1580             for (j = 0; ; j++) {
1581                 if (i == 0xd && j == 64) {
1582                     break;
1583                 }
1584 
1585                 if (i == 0x1f && j == 64) {
1586                     break;
1587                 }
1588 
1589                 c->function = i;
1590                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1591                 c->index = j;
1592                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1593 
1594                 if (i == 4 && c->eax == 0) {
1595                     break;
1596                 }
1597                 if (i == 0xb && !(c->ecx & 0xff00)) {
1598                     break;
1599                 }
1600                 if (i == 0x1f && !(c->ecx & 0xff00)) {
1601                     break;
1602                 }
1603                 if (i == 0xd && c->eax == 0) {
1604                     continue;
1605                 }
1606                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1607                     fprintf(stderr, "cpuid_data is full, no space for "
1608                             "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1609                     abort();
1610                 }
1611                 c = &cpuid_data.entries[cpuid_i++];
1612             }
1613             break;
1614         case 0x7:
1615         case 0x14: {
1616             uint32_t times;
1617 
1618             c->function = i;
1619             c->index = 0;
1620             c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1621             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1622             times = c->eax;
1623 
1624             for (j = 1; j <= times; ++j) {
1625                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1626                     fprintf(stderr, "cpuid_data is full, no space for "
1627                                 "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1628                     abort();
1629                 }
1630                 c = &cpuid_data.entries[cpuid_i++];
1631                 c->function = i;
1632                 c->index = j;
1633                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1634                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1635             }
1636             break;
1637         }
1638         default:
1639             c->function = i;
1640             c->flags = 0;
1641             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1642             if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
1643                 /*
1644                  * KVM already returns all zeroes if a CPUID entry is missing,
1645                  * so we can omit it and avoid hitting KVM's 80-entry limit.
1646                  */
1647                 cpuid_i--;
1648             }
1649             break;
1650         }
1651     }
1652 
1653     if (limit >= 0x0a) {
1654         uint32_t eax, edx;
1655 
1656         cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);
1657 
1658         has_architectural_pmu_version = eax & 0xff;
1659         if (has_architectural_pmu_version > 0) {
1660             num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;
1661 
1662             /* Shouldn't be more than 32, since that's the number of bits
1663              * available in EBX to tell us _which_ counters are available.
1664              * Play it safe.
1665              */
1666             if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
1667                 num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
1668             }
1669 
1670             if (has_architectural_pmu_version > 1) {
1671                 num_architectural_pmu_fixed_counters = edx & 0x1f;
1672 
1673                 if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
1674                     num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
1675                 }
1676             }
1677         }
1678     }
1679 
1680     cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
1681 
1682     for (i = 0x80000000; i <= limit; i++) {
1683         if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1684             fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
1685             abort();
1686         }
1687         c = &cpuid_data.entries[cpuid_i++];
1688 
1689         switch (i) {
1690         case 0x8000001d:
1691             /* Query for all AMD cache information leaves */
1692             for (j = 0; ; j++) {
1693                 c->function = i;
1694                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1695                 c->index = j;
1696                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1697 
1698                 if (c->eax == 0) {
1699                     break;
1700                 }
1701                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1702                     fprintf(stderr, "cpuid_data is full, no space for "
1703                             "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1704                     abort();
1705                 }
1706                 c = &cpuid_data.entries[cpuid_i++];
1707             }
1708             break;
1709         default:
1710             c->function = i;
1711             c->flags = 0;
1712             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1713             if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
1714                 /*
1715                  * KVM already returns all zeroes if a CPUID entry is missing,
1716                  * so we can omit it and avoid hitting KVM's 80-entry limit.
1717                  */
1718                 cpuid_i--;
1719             }
1720             break;
1721         }
1722     }
1723 
1724     /* Call Centaur's CPUID instructions they are supported. */
1725     if (env->cpuid_xlevel2 > 0) {
1726         cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);
1727 
1728         for (i = 0xC0000000; i <= limit; i++) {
1729             if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1730                 fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
1731                 abort();
1732             }
1733             c = &cpuid_data.entries[cpuid_i++];
1734 
1735             c->function = i;
1736             c->flags = 0;
1737             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1738         }
1739     }
1740 
1741     cpuid_data.cpuid.nent = cpuid_i;
1742 
1743     if (((env->cpuid_version >> 8)&0xF) >= 6
1744         && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
1745            (CPUID_MCE | CPUID_MCA)
1746         && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
1747         uint64_t mcg_cap, unsupported_caps;
1748         int banks;
1749         int ret;
1750 
1751         ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
1752         if (ret < 0) {
1753             fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
1754             return ret;
1755         }
1756 
1757         if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) {
1758             error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)",
1759                          (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
1760             return -ENOTSUP;
1761         }
1762 
1763         unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
1764         if (unsupported_caps) {
1765             if (unsupported_caps & MCG_LMCE_P) {
1766                 error_report("kvm: LMCE not supported");
1767                 return -ENOTSUP;
1768             }
1769             warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64,
1770                         unsupported_caps);
1771         }
1772 
1773         env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
1774         ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
1775         if (ret < 0) {
1776             fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
1777             return ret;
1778         }
1779     }
1780 
1781     cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env);
1782 
1783     c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
1784     if (c) {
1785         has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
1786                                   !!(c->ecx & CPUID_EXT_SMX);
1787     }
1788 
1789     if (env->mcg_cap & MCG_LMCE_P) {
1790         has_msr_mcg_ext_ctl = has_msr_feature_control = true;
1791     }
1792 
1793     if (!env->user_tsc_khz) {
1794         if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
1795             invtsc_mig_blocker == NULL) {
1796             error_setg(&invtsc_mig_blocker,
1797                        "State blocked by non-migratable CPU device"
1798                        " (invtsc flag)");
1799             r = migrate_add_blocker(invtsc_mig_blocker, &local_err);
1800             if (local_err) {
1801                 error_report_err(local_err);
1802                 error_free(invtsc_mig_blocker);
1803                 return r;
1804             }
1805         }
1806     }
1807 
1808     if (cpu->vmware_cpuid_freq
1809         /* Guests depend on 0x40000000 to detect this feature, so only expose
1810          * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */
1811         && cpu->expose_kvm
1812         && kvm_base == KVM_CPUID_SIGNATURE
1813         /* TSC clock must be stable and known for this feature. */
1814         && tsc_is_stable_and_known(env)) {
1815 
1816         c = &cpuid_data.entries[cpuid_i++];
1817         c->function = KVM_CPUID_SIGNATURE | 0x10;
1818         c->eax = env->tsc_khz;
1819         c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */
1820         c->ecx = c->edx = 0;
1821 
1822         c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0);
1823         c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
1824     }
1825 
1826     cpuid_data.cpuid.nent = cpuid_i;
1827 
1828     cpuid_data.cpuid.padding = 0;
1829     r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
1830     if (r) {
1831         goto fail;
1832     }
1833 
1834     if (has_xsave) {
1835         env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
1836         memset(env->xsave_buf, 0, sizeof(struct kvm_xsave));
1837     }
1838 
1839     max_nested_state_len = kvm_max_nested_state_length();
1840     if (max_nested_state_len > 0) {
1841         assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
1842 
1843         if (cpu_has_vmx(env) || cpu_has_svm(env)) {
1844             struct kvm_vmx_nested_state_hdr *vmx_hdr;
1845 
1846             env->nested_state = g_malloc0(max_nested_state_len);
1847             env->nested_state->size = max_nested_state_len;
1848 
1849             if (cpu_has_vmx(env)) {
1850                 env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
1851                 vmx_hdr = &env->nested_state->hdr.vmx;
1852                 vmx_hdr->vmxon_pa = -1ull;
1853                 vmx_hdr->vmcs12_pa = -1ull;
1854             } else {
1855                 env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
1856             }
1857         }
1858     }
1859 
1860     cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
1861 
1862     if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
1863         has_msr_tsc_aux = false;
1864     }
1865 
1866     kvm_init_msrs(cpu);
1867 
1868     r = hyperv_init_vcpu(cpu);
1869     if (r) {
1870         goto fail;
1871     }
1872 
1873     return 0;
1874 
1875  fail:
1876     migrate_del_blocker(invtsc_mig_blocker);
1877 
1878     return r;
1879 }
1880 
1881 int kvm_arch_destroy_vcpu(CPUState *cs)
1882 {
1883     X86CPU *cpu = X86_CPU(cs);
1884     CPUX86State *env = &cpu->env;
1885 
1886     if (cpu->kvm_msr_buf) {
1887         g_free(cpu->kvm_msr_buf);
1888         cpu->kvm_msr_buf = NULL;
1889     }
1890 
1891     if (env->nested_state) {
1892         g_free(env->nested_state);
1893         env->nested_state = NULL;
1894     }
1895 
1896     qemu_del_vm_change_state_handler(cpu->vmsentry);
1897 
1898     return 0;
1899 }
1900 
1901 void kvm_arch_reset_vcpu(X86CPU *cpu)
1902 {
1903     CPUX86State *env = &cpu->env;
1904 
1905     env->xcr0 = 1;
1906     if (kvm_irqchip_in_kernel()) {
1907         env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
1908                                           KVM_MP_STATE_UNINITIALIZED;
1909     } else {
1910         env->mp_state = KVM_MP_STATE_RUNNABLE;
1911     }
1912 
1913     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
1914         int i;
1915         for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
1916             env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
1917         }
1918 
1919         hyperv_x86_synic_reset(cpu);
1920     }
1921     /* enabled by default */
1922     env->poll_control_msr = 1;
1923 }
1924 
1925 void kvm_arch_do_init_vcpu(X86CPU *cpu)
1926 {
1927     CPUX86State *env = &cpu->env;
1928 
1929     /* APs get directly into wait-for-SIPI state.  */
1930     if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
1931         env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
1932     }
1933 }
1934 
1935 static int kvm_get_supported_feature_msrs(KVMState *s)
1936 {
1937     int ret = 0;
1938 
1939     if (kvm_feature_msrs != NULL) {
1940         return 0;
1941     }
1942 
1943     if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) {
1944         return 0;
1945     }
1946 
1947     struct kvm_msr_list msr_list;
1948 
1949     msr_list.nmsrs = 0;
1950     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list);
1951     if (ret < 0 && ret != -E2BIG) {
1952         error_report("Fetch KVM feature MSR list failed: %s",
1953             strerror(-ret));
1954         return ret;
1955     }
1956 
1957     assert(msr_list.nmsrs > 0);
1958     kvm_feature_msrs = (struct kvm_msr_list *) \
1959         g_malloc0(sizeof(msr_list) +
1960                  msr_list.nmsrs * sizeof(msr_list.indices[0]));
1961 
1962     kvm_feature_msrs->nmsrs = msr_list.nmsrs;
1963     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs);
1964 
1965     if (ret < 0) {
1966         error_report("Fetch KVM feature MSR list failed: %s",
1967             strerror(-ret));
1968         g_free(kvm_feature_msrs);
1969         kvm_feature_msrs = NULL;
1970         return ret;
1971     }
1972 
1973     return 0;
1974 }
1975 
1976 static int kvm_get_supported_msrs(KVMState *s)
1977 {
1978     int ret = 0;
1979     struct kvm_msr_list msr_list, *kvm_msr_list;
1980 
1981     /*
1982      *  Obtain MSR list from KVM.  These are the MSRs that we must
1983      *  save/restore.
1984      */
1985     msr_list.nmsrs = 0;
1986     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
1987     if (ret < 0 && ret != -E2BIG) {
1988         return ret;
1989     }
1990     /*
1991      * Old kernel modules had a bug and could write beyond the provided
1992      * memory. Allocate at least a safe amount of 1K.
1993      */
1994     kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
1995                                           msr_list.nmsrs *
1996                                           sizeof(msr_list.indices[0])));
1997 
1998     kvm_msr_list->nmsrs = msr_list.nmsrs;
1999     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
2000     if (ret >= 0) {
2001         int i;
2002 
2003         for (i = 0; i < kvm_msr_list->nmsrs; i++) {
2004             switch (kvm_msr_list->indices[i]) {
2005             case MSR_STAR:
2006                 has_msr_star = true;
2007                 break;
2008             case MSR_VM_HSAVE_PA:
2009                 has_msr_hsave_pa = true;
2010                 break;
2011             case MSR_TSC_AUX:
2012                 has_msr_tsc_aux = true;
2013                 break;
2014             case MSR_TSC_ADJUST:
2015                 has_msr_tsc_adjust = true;
2016                 break;
2017             case MSR_IA32_TSCDEADLINE:
2018                 has_msr_tsc_deadline = true;
2019                 break;
2020             case MSR_IA32_SMBASE:
2021                 has_msr_smbase = true;
2022                 break;
2023             case MSR_SMI_COUNT:
2024                 has_msr_smi_count = true;
2025                 break;
2026             case MSR_IA32_MISC_ENABLE:
2027                 has_msr_misc_enable = true;
2028                 break;
2029             case MSR_IA32_BNDCFGS:
2030                 has_msr_bndcfgs = true;
2031                 break;
2032             case MSR_IA32_XSS:
2033                 has_msr_xss = true;
2034                 break;
2035             case MSR_IA32_UMWAIT_CONTROL:
2036                 has_msr_umwait = true;
2037                 break;
2038             case HV_X64_MSR_CRASH_CTL:
2039                 has_msr_hv_crash = true;
2040                 break;
2041             case HV_X64_MSR_RESET:
2042                 has_msr_hv_reset = true;
2043                 break;
2044             case HV_X64_MSR_VP_INDEX:
2045                 has_msr_hv_vpindex = true;
2046                 break;
2047             case HV_X64_MSR_VP_RUNTIME:
2048                 has_msr_hv_runtime = true;
2049                 break;
2050             case HV_X64_MSR_SCONTROL:
2051                 has_msr_hv_synic = true;
2052                 break;
2053             case HV_X64_MSR_STIMER0_CONFIG:
2054                 has_msr_hv_stimer = true;
2055                 break;
2056             case HV_X64_MSR_TSC_FREQUENCY:
2057                 has_msr_hv_frequencies = true;
2058                 break;
2059             case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2060                 has_msr_hv_reenlightenment = true;
2061                 break;
2062             case MSR_IA32_SPEC_CTRL:
2063                 has_msr_spec_ctrl = true;
2064                 break;
2065             case MSR_IA32_TSX_CTRL:
2066                 has_msr_tsx_ctrl = true;
2067                 break;
2068             case MSR_VIRT_SSBD:
2069                 has_msr_virt_ssbd = true;
2070                 break;
2071             case MSR_IA32_ARCH_CAPABILITIES:
2072                 has_msr_arch_capabs = true;
2073                 break;
2074             case MSR_IA32_CORE_CAPABILITY:
2075                 has_msr_core_capabs = true;
2076                 break;
2077             case MSR_IA32_PERF_CAPABILITIES:
2078                 has_msr_perf_capabs = true;
2079                 break;
2080             case MSR_IA32_VMX_VMFUNC:
2081                 has_msr_vmx_vmfunc = true;
2082                 break;
2083             case MSR_IA32_UCODE_REV:
2084                 has_msr_ucode_rev = true;
2085                 break;
2086             case MSR_IA32_VMX_PROCBASED_CTLS2:
2087                 has_msr_vmx_procbased_ctls2 = true;
2088                 break;
2089             }
2090         }
2091     }
2092 
2093     g_free(kvm_msr_list);
2094 
2095     return ret;
2096 }
2097 
2098 static Notifier smram_machine_done;
2099 static KVMMemoryListener smram_listener;
2100 static AddressSpace smram_address_space;
2101 static MemoryRegion smram_as_root;
2102 static MemoryRegion smram_as_mem;
2103 
2104 static void register_smram_listener(Notifier *n, void *unused)
2105 {
2106     MemoryRegion *smram =
2107         (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
2108 
2109     /* Outer container... */
2110     memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
2111     memory_region_set_enabled(&smram_as_root, true);
2112 
2113     /* ... with two regions inside: normal system memory with low
2114      * priority, and...
2115      */
2116     memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
2117                              get_system_memory(), 0, ~0ull);
2118     memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
2119     memory_region_set_enabled(&smram_as_mem, true);
2120 
2121     if (smram) {
2122         /* ... SMRAM with higher priority */
2123         memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
2124         memory_region_set_enabled(smram, true);
2125     }
2126 
2127     address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
2128     kvm_memory_listener_register(kvm_state, &smram_listener,
2129                                  &smram_address_space, 1);
2130 }
2131 
2132 int kvm_arch_init(MachineState *ms, KVMState *s)
2133 {
2134     uint64_t identity_base = 0xfffbc000;
2135     uint64_t shadow_mem;
2136     int ret;
2137     struct utsname utsname;
2138 
2139     if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
2140         error_report("kvm: KVM_CAP_IRQ_ROUTING not supported by KVM");
2141         return -ENOTSUP;
2142     }
2143 
2144     has_xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
2145     has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
2146     has_pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
2147 
2148     hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
2149 
2150     has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
2151     if (has_exception_payload) {
2152         ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
2153         if (ret < 0) {
2154             error_report("kvm: Failed to enable exception payload cap: %s",
2155                          strerror(-ret));
2156             return ret;
2157         }
2158     }
2159 
2160     ret = kvm_get_supported_msrs(s);
2161     if (ret < 0) {
2162         return ret;
2163     }
2164 
2165     kvm_get_supported_feature_msrs(s);
2166 
2167     uname(&utsname);
2168     lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
2169 
2170     /*
2171      * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
2172      * In order to use vm86 mode, an EPT identity map and a TSS  are needed.
2173      * Since these must be part of guest physical memory, we need to allocate
2174      * them, both by setting their start addresses in the kernel and by
2175      * creating a corresponding e820 entry. We need 4 pages before the BIOS.
2176      *
2177      * Older KVM versions may not support setting the identity map base. In
2178      * that case we need to stick with the default, i.e. a 256K maximum BIOS
2179      * size.
2180      */
2181     if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
2182         /* Allows up to 16M BIOSes. */
2183         identity_base = 0xfeffc000;
2184 
2185         ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
2186         if (ret < 0) {
2187             return ret;
2188         }
2189     }
2190 
2191     /* Set TSS base one page after EPT identity map. */
2192     ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
2193     if (ret < 0) {
2194         return ret;
2195     }
2196 
2197     /* Tell fw_cfg to notify the BIOS to reserve the range. */
2198     ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
2199     if (ret < 0) {
2200         fprintf(stderr, "e820_add_entry() table is full\n");
2201         return ret;
2202     }
2203 
2204     shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort);
2205     if (shadow_mem != -1) {
2206         shadow_mem /= 4096;
2207         ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
2208         if (ret < 0) {
2209             return ret;
2210         }
2211     }
2212 
2213     if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
2214         object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
2215         x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
2216         smram_machine_done.notify = register_smram_listener;
2217         qemu_add_machine_init_done_notifier(&smram_machine_done);
2218     }
2219 
2220     if (enable_cpu_pm) {
2221         int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
2222         int ret;
2223 
2224 /* Work around for kernel header with a typo. TODO: fix header and drop. */
2225 #if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
2226 #define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
2227 #endif
2228         if (disable_exits) {
2229             disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
2230                               KVM_X86_DISABLE_EXITS_HLT |
2231                               KVM_X86_DISABLE_EXITS_PAUSE |
2232                               KVM_X86_DISABLE_EXITS_CSTATE);
2233         }
2234 
2235         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
2236                                 disable_exits);
2237         if (ret < 0) {
2238             error_report("kvm: guest stopping CPU not supported: %s",
2239                          strerror(-ret));
2240         }
2241     }
2242 
2243     return 0;
2244 }
2245 
2246 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2247 {
2248     lhs->selector = rhs->selector;
2249     lhs->base = rhs->base;
2250     lhs->limit = rhs->limit;
2251     lhs->type = 3;
2252     lhs->present = 1;
2253     lhs->dpl = 3;
2254     lhs->db = 0;
2255     lhs->s = 1;
2256     lhs->l = 0;
2257     lhs->g = 0;
2258     lhs->avl = 0;
2259     lhs->unusable = 0;
2260 }
2261 
2262 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2263 {
2264     unsigned flags = rhs->flags;
2265     lhs->selector = rhs->selector;
2266     lhs->base = rhs->base;
2267     lhs->limit = rhs->limit;
2268     lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
2269     lhs->present = (flags & DESC_P_MASK) != 0;
2270     lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
2271     lhs->db = (flags >> DESC_B_SHIFT) & 1;
2272     lhs->s = (flags & DESC_S_MASK) != 0;
2273     lhs->l = (flags >> DESC_L_SHIFT) & 1;
2274     lhs->g = (flags & DESC_G_MASK) != 0;
2275     lhs->avl = (flags & DESC_AVL_MASK) != 0;
2276     lhs->unusable = !lhs->present;
2277     lhs->padding = 0;
2278 }
2279 
2280 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
2281 {
2282     lhs->selector = rhs->selector;
2283     lhs->base = rhs->base;
2284     lhs->limit = rhs->limit;
2285     lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
2286                  ((rhs->present && !rhs->unusable) * DESC_P_MASK) |
2287                  (rhs->dpl << DESC_DPL_SHIFT) |
2288                  (rhs->db << DESC_B_SHIFT) |
2289                  (rhs->s * DESC_S_MASK) |
2290                  (rhs->l << DESC_L_SHIFT) |
2291                  (rhs->g * DESC_G_MASK) |
2292                  (rhs->avl * DESC_AVL_MASK);
2293 }
2294 
2295 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
2296 {
2297     if (set) {
2298         *kvm_reg = *qemu_reg;
2299     } else {
2300         *qemu_reg = *kvm_reg;
2301     }
2302 }
2303 
2304 static int kvm_getput_regs(X86CPU *cpu, int set)
2305 {
2306     CPUX86State *env = &cpu->env;
2307     struct kvm_regs regs;
2308     int ret = 0;
2309 
2310     if (!set) {
2311         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
2312         if (ret < 0) {
2313             return ret;
2314         }
2315     }
2316 
2317     kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
2318     kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
2319     kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
2320     kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
2321     kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
2322     kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
2323     kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
2324     kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
2325 #ifdef TARGET_X86_64
2326     kvm_getput_reg(&regs.r8, &env->regs[8], set);
2327     kvm_getput_reg(&regs.r9, &env->regs[9], set);
2328     kvm_getput_reg(&regs.r10, &env->regs[10], set);
2329     kvm_getput_reg(&regs.r11, &env->regs[11], set);
2330     kvm_getput_reg(&regs.r12, &env->regs[12], set);
2331     kvm_getput_reg(&regs.r13, &env->regs[13], set);
2332     kvm_getput_reg(&regs.r14, &env->regs[14], set);
2333     kvm_getput_reg(&regs.r15, &env->regs[15], set);
2334 #endif
2335 
2336     kvm_getput_reg(&regs.rflags, &env->eflags, set);
2337     kvm_getput_reg(&regs.rip, &env->eip, set);
2338 
2339     if (set) {
2340         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
2341     }
2342 
2343     return ret;
2344 }
2345 
2346 static int kvm_put_fpu(X86CPU *cpu)
2347 {
2348     CPUX86State *env = &cpu->env;
2349     struct kvm_fpu fpu;
2350     int i;
2351 
2352     memset(&fpu, 0, sizeof fpu);
2353     fpu.fsw = env->fpus & ~(7 << 11);
2354     fpu.fsw |= (env->fpstt & 7) << 11;
2355     fpu.fcw = env->fpuc;
2356     fpu.last_opcode = env->fpop;
2357     fpu.last_ip = env->fpip;
2358     fpu.last_dp = env->fpdp;
2359     for (i = 0; i < 8; ++i) {
2360         fpu.ftwx |= (!env->fptags[i]) << i;
2361     }
2362     memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
2363     for (i = 0; i < CPU_NB_REGS; i++) {
2364         stq_p(&fpu.xmm[i][0], env->xmm_regs[i].ZMM_Q(0));
2365         stq_p(&fpu.xmm[i][8], env->xmm_regs[i].ZMM_Q(1));
2366     }
2367     fpu.mxcsr = env->mxcsr;
2368 
2369     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
2370 }
2371 
2372 #define XSAVE_FCW_FSW     0
2373 #define XSAVE_FTW_FOP     1
2374 #define XSAVE_CWD_RIP     2
2375 #define XSAVE_CWD_RDP     4
2376 #define XSAVE_MXCSR       6
2377 #define XSAVE_ST_SPACE    8
2378 #define XSAVE_XMM_SPACE   40
2379 #define XSAVE_XSTATE_BV   128
2380 #define XSAVE_YMMH_SPACE  144
2381 #define XSAVE_BNDREGS     240
2382 #define XSAVE_BNDCSR      256
2383 #define XSAVE_OPMASK      272
2384 #define XSAVE_ZMM_Hi256   288
2385 #define XSAVE_Hi16_ZMM    416
2386 #define XSAVE_PKRU        672
2387 
2388 #define XSAVE_BYTE_OFFSET(word_offset) \
2389     ((word_offset) * sizeof_field(struct kvm_xsave, region[0]))
2390 
2391 #define ASSERT_OFFSET(word_offset, field) \
2392     QEMU_BUILD_BUG_ON(XSAVE_BYTE_OFFSET(word_offset) != \
2393                       offsetof(X86XSaveArea, field))
2394 
2395 ASSERT_OFFSET(XSAVE_FCW_FSW, legacy.fcw);
2396 ASSERT_OFFSET(XSAVE_FTW_FOP, legacy.ftw);
2397 ASSERT_OFFSET(XSAVE_CWD_RIP, legacy.fpip);
2398 ASSERT_OFFSET(XSAVE_CWD_RDP, legacy.fpdp);
2399 ASSERT_OFFSET(XSAVE_MXCSR, legacy.mxcsr);
2400 ASSERT_OFFSET(XSAVE_ST_SPACE, legacy.fpregs);
2401 ASSERT_OFFSET(XSAVE_XMM_SPACE, legacy.xmm_regs);
2402 ASSERT_OFFSET(XSAVE_XSTATE_BV, header.xstate_bv);
2403 ASSERT_OFFSET(XSAVE_YMMH_SPACE, avx_state);
2404 ASSERT_OFFSET(XSAVE_BNDREGS, bndreg_state);
2405 ASSERT_OFFSET(XSAVE_BNDCSR, bndcsr_state);
2406 ASSERT_OFFSET(XSAVE_OPMASK, opmask_state);
2407 ASSERT_OFFSET(XSAVE_ZMM_Hi256, zmm_hi256_state);
2408 ASSERT_OFFSET(XSAVE_Hi16_ZMM, hi16_zmm_state);
2409 ASSERT_OFFSET(XSAVE_PKRU, pkru_state);
2410 
2411 static int kvm_put_xsave(X86CPU *cpu)
2412 {
2413     CPUX86State *env = &cpu->env;
2414     X86XSaveArea *xsave = env->xsave_buf;
2415 
2416     if (!has_xsave) {
2417         return kvm_put_fpu(cpu);
2418     }
2419     x86_cpu_xsave_all_areas(cpu, xsave);
2420 
2421     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
2422 }
2423 
2424 static int kvm_put_xcrs(X86CPU *cpu)
2425 {
2426     CPUX86State *env = &cpu->env;
2427     struct kvm_xcrs xcrs = {};
2428 
2429     if (!has_xcrs) {
2430         return 0;
2431     }
2432 
2433     xcrs.nr_xcrs = 1;
2434     xcrs.flags = 0;
2435     xcrs.xcrs[0].xcr = 0;
2436     xcrs.xcrs[0].value = env->xcr0;
2437     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
2438 }
2439 
2440 static int kvm_put_sregs(X86CPU *cpu)
2441 {
2442     CPUX86State *env = &cpu->env;
2443     struct kvm_sregs sregs;
2444 
2445     memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
2446     if (env->interrupt_injected >= 0) {
2447         sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
2448                 (uint64_t)1 << (env->interrupt_injected % 64);
2449     }
2450 
2451     if ((env->eflags & VM_MASK)) {
2452         set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2453         set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2454         set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2455         set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2456         set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2457         set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2458     } else {
2459         set_seg(&sregs.cs, &env->segs[R_CS]);
2460         set_seg(&sregs.ds, &env->segs[R_DS]);
2461         set_seg(&sregs.es, &env->segs[R_ES]);
2462         set_seg(&sregs.fs, &env->segs[R_FS]);
2463         set_seg(&sregs.gs, &env->segs[R_GS]);
2464         set_seg(&sregs.ss, &env->segs[R_SS]);
2465     }
2466 
2467     set_seg(&sregs.tr, &env->tr);
2468     set_seg(&sregs.ldt, &env->ldt);
2469 
2470     sregs.idt.limit = env->idt.limit;
2471     sregs.idt.base = env->idt.base;
2472     memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
2473     sregs.gdt.limit = env->gdt.limit;
2474     sregs.gdt.base = env->gdt.base;
2475     memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
2476 
2477     sregs.cr0 = env->cr[0];
2478     sregs.cr2 = env->cr[2];
2479     sregs.cr3 = env->cr[3];
2480     sregs.cr4 = env->cr[4];
2481 
2482     sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
2483     sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
2484 
2485     sregs.efer = env->efer;
2486 
2487     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
2488 }
2489 
2490 static void kvm_msr_buf_reset(X86CPU *cpu)
2491 {
2492     memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
2493 }
2494 
2495 static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
2496 {
2497     struct kvm_msrs *msrs = cpu->kvm_msr_buf;
2498     void *limit = ((void *)msrs) + MSR_BUF_SIZE;
2499     struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
2500 
2501     assert((void *)(entry + 1) <= limit);
2502 
2503     entry->index = index;
2504     entry->reserved = 0;
2505     entry->data = value;
2506     msrs->nmsrs++;
2507 }
2508 
2509 static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value)
2510 {
2511     kvm_msr_buf_reset(cpu);
2512     kvm_msr_entry_add(cpu, index, value);
2513 
2514     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
2515 }
2516 
2517 void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
2518 {
2519     int ret;
2520 
2521     ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
2522     assert(ret == 1);
2523 }
2524 
2525 static int kvm_put_tscdeadline_msr(X86CPU *cpu)
2526 {
2527     CPUX86State *env = &cpu->env;
2528     int ret;
2529 
2530     if (!has_msr_tsc_deadline) {
2531         return 0;
2532     }
2533 
2534     ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
2535     if (ret < 0) {
2536         return ret;
2537     }
2538 
2539     assert(ret == 1);
2540     return 0;
2541 }
2542 
2543 /*
2544  * Provide a separate write service for the feature control MSR in order to
2545  * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
2546  * before writing any other state because forcibly leaving nested mode
2547  * invalidates the VCPU state.
2548  */
2549 static int kvm_put_msr_feature_control(X86CPU *cpu)
2550 {
2551     int ret;
2552 
2553     if (!has_msr_feature_control) {
2554         return 0;
2555     }
2556 
2557     ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL,
2558                           cpu->env.msr_ia32_feature_control);
2559     if (ret < 0) {
2560         return ret;
2561     }
2562 
2563     assert(ret == 1);
2564     return 0;
2565 }
2566 
2567 static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features)
2568 {
2569     uint32_t default1, can_be_one, can_be_zero;
2570     uint32_t must_be_one;
2571 
2572     switch (index) {
2573     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2574         default1 = 0x00000016;
2575         break;
2576     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2577         default1 = 0x0401e172;
2578         break;
2579     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2580         default1 = 0x000011ff;
2581         break;
2582     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2583         default1 = 0x00036dff;
2584         break;
2585     case MSR_IA32_VMX_PROCBASED_CTLS2:
2586         default1 = 0;
2587         break;
2588     default:
2589         abort();
2590     }
2591 
2592     /* If a feature bit is set, the control can be either set or clear.
2593      * Otherwise the value is limited to either 0 or 1 by default1.
2594      */
2595     can_be_one = features | default1;
2596     can_be_zero = features | ~default1;
2597     must_be_one = ~can_be_zero;
2598 
2599     /*
2600      * Bit 0:31 -> 0 if the control bit can be zero (i.e. 1 if it must be one).
2601      * Bit 32:63 -> 1 if the control bit can be one.
2602      */
2603     return must_be_one | (((uint64_t)can_be_one) << 32);
2604 }
2605 
2606 #define VMCS12_MAX_FIELD_INDEX (0x17)
2607 
2608 static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
2609 {
2610     uint64_t kvm_vmx_basic =
2611         kvm_arch_get_supported_msr_feature(kvm_state,
2612                                            MSR_IA32_VMX_BASIC);
2613 
2614     if (!kvm_vmx_basic) {
2615         /* If the kernel doesn't support VMX feature (kvm_intel.nested=0),
2616          * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail.
2617          */
2618         return;
2619     }
2620 
2621     uint64_t kvm_vmx_misc =
2622         kvm_arch_get_supported_msr_feature(kvm_state,
2623                                            MSR_IA32_VMX_MISC);
2624     uint64_t kvm_vmx_ept_vpid =
2625         kvm_arch_get_supported_msr_feature(kvm_state,
2626                                            MSR_IA32_VMX_EPT_VPID_CAP);
2627 
2628     /*
2629      * If the guest is 64-bit, a value of 1 is allowed for the host address
2630      * space size vmexit control.
2631      */
2632     uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM
2633         ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0;
2634 
2635     /*
2636      * Bits 0-30, 32-44 and 50-53 come from the host.  KVM should
2637      * not change them for backwards compatibility.
2638      */
2639     uint64_t fixed_vmx_basic = kvm_vmx_basic &
2640         (MSR_VMX_BASIC_VMCS_REVISION_MASK |
2641          MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK |
2642          MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK);
2643 
2644     /*
2645      * Same for bits 0-4 and 25-27.  Bits 16-24 (CR3 target count) can
2646      * change in the future but are always zero for now, clear them to be
2647      * future proof.  Bits 32-63 in theory could change, though KVM does
2648      * not support dual-monitor treatment and probably never will; mask
2649      * them out as well.
2650      */
2651     uint64_t fixed_vmx_misc = kvm_vmx_misc &
2652         (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK |
2653          MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK);
2654 
2655     /*
2656      * EPT memory types should not change either, so we do not bother
2657      * adding features for them.
2658      */
2659     uint64_t fixed_vmx_ept_mask =
2660             (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ?
2661              MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0);
2662     uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask;
2663 
2664     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
2665                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
2666                                          f[FEAT_VMX_PROCBASED_CTLS]));
2667     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS,
2668                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS,
2669                                          f[FEAT_VMX_PINBASED_CTLS]));
2670     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS,
2671                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS,
2672                                          f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit);
2673     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS,
2674                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS,
2675                                          f[FEAT_VMX_ENTRY_CTLS]));
2676     kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2,
2677                       make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2,
2678                                          f[FEAT_VMX_SECONDARY_CTLS]));
2679     kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP,
2680                       f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid);
2681     kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC,
2682                       f[FEAT_VMX_BASIC] | fixed_vmx_basic);
2683     kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC,
2684                       f[FEAT_VMX_MISC] | fixed_vmx_misc);
2685     if (has_msr_vmx_vmfunc) {
2686         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]);
2687     }
2688 
2689     /*
2690      * Just to be safe, write these with constant values.  The CRn_FIXED1
2691      * MSRs are generated by KVM based on the vCPU's CPUID.
2692      */
2693     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0,
2694                       CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK);
2695     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0,
2696                       CR4_VMXE_MASK);
2697     kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM,
2698                       VMCS12_MAX_FIELD_INDEX << 1);
2699 }
2700 
2701 static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f)
2702 {
2703     uint64_t kvm_perf_cap =
2704         kvm_arch_get_supported_msr_feature(kvm_state,
2705                                            MSR_IA32_PERF_CAPABILITIES);
2706 
2707     if (kvm_perf_cap) {
2708         kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES,
2709                         kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]);
2710     }
2711 }
2712 
2713 static int kvm_buf_set_msrs(X86CPU *cpu)
2714 {
2715     int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
2716     if (ret < 0) {
2717         return ret;
2718     }
2719 
2720     if (ret < cpu->kvm_msr_buf->nmsrs) {
2721         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
2722         error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
2723                      (uint32_t)e->index, (uint64_t)e->data);
2724     }
2725 
2726     assert(ret == cpu->kvm_msr_buf->nmsrs);
2727     return 0;
2728 }
2729 
2730 static void kvm_init_msrs(X86CPU *cpu)
2731 {
2732     CPUX86State *env = &cpu->env;
2733 
2734     kvm_msr_buf_reset(cpu);
2735     if (has_msr_arch_capabs) {
2736         kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
2737                           env->features[FEAT_ARCH_CAPABILITIES]);
2738     }
2739 
2740     if (has_msr_core_capabs) {
2741         kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
2742                           env->features[FEAT_CORE_CAPABILITY]);
2743     }
2744 
2745     if (has_msr_perf_capabs && cpu->enable_pmu) {
2746         kvm_msr_entry_add_perf(cpu, env->features);
2747     }
2748 
2749     if (has_msr_ucode_rev) {
2750         kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
2751     }
2752 
2753     /*
2754      * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
2755      * all kernels with MSR features should have them.
2756      */
2757     if (kvm_feature_msrs && cpu_has_vmx(env)) {
2758         kvm_msr_entry_add_vmx(cpu, env->features);
2759     }
2760 
2761     assert(kvm_buf_set_msrs(cpu) == 0);
2762 }
2763 
2764 static int kvm_put_msrs(X86CPU *cpu, int level)
2765 {
2766     CPUX86State *env = &cpu->env;
2767     int i;
2768 
2769     kvm_msr_buf_reset(cpu);
2770 
2771     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
2772     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
2773     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
2774     kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
2775     if (has_msr_star) {
2776         kvm_msr_entry_add(cpu, MSR_STAR, env->star);
2777     }
2778     if (has_msr_hsave_pa) {
2779         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
2780     }
2781     if (has_msr_tsc_aux) {
2782         kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
2783     }
2784     if (has_msr_tsc_adjust) {
2785         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
2786     }
2787     if (has_msr_misc_enable) {
2788         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
2789                           env->msr_ia32_misc_enable);
2790     }
2791     if (has_msr_smbase) {
2792         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
2793     }
2794     if (has_msr_smi_count) {
2795         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
2796     }
2797     if (has_msr_bndcfgs) {
2798         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
2799     }
2800     if (has_msr_xss) {
2801         kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
2802     }
2803     if (has_msr_umwait) {
2804         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait);
2805     }
2806     if (has_msr_spec_ctrl) {
2807         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl);
2808     }
2809     if (has_msr_tsx_ctrl) {
2810         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl);
2811     }
2812     if (has_msr_virt_ssbd) {
2813         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
2814     }
2815 
2816 #ifdef TARGET_X86_64
2817     if (lm_capable_kernel) {
2818         kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
2819         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
2820         kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
2821         kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
2822     }
2823 #endif
2824 
2825     /*
2826      * The following MSRs have side effects on the guest or are too heavy
2827      * for normal writeback. Limit them to reset or full state updates.
2828      */
2829     if (level >= KVM_PUT_RESET_STATE) {
2830         kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
2831         kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
2832         kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
2833         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
2834             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr);
2835         }
2836         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
2837             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
2838         }
2839         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
2840             kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
2841         }
2842         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
2843             kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
2844         }
2845 
2846         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
2847             kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
2848         }
2849 
2850         if (has_architectural_pmu_version > 0) {
2851             if (has_architectural_pmu_version > 1) {
2852                 /* Stop the counter.  */
2853                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
2854                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
2855             }
2856 
2857             /* Set the counter values.  */
2858             for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
2859                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i,
2860                                   env->msr_fixed_counters[i]);
2861             }
2862             for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
2863                 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i,
2864                                   env->msr_gp_counters[i]);
2865                 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i,
2866                                   env->msr_gp_evtsel[i]);
2867             }
2868             if (has_architectural_pmu_version > 1) {
2869                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS,
2870                                   env->msr_global_status);
2871                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
2872                                   env->msr_global_ovf_ctrl);
2873 
2874                 /* Now start the PMU.  */
2875                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL,
2876                                   env->msr_fixed_ctr_ctrl);
2877                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL,
2878                                   env->msr_global_ctrl);
2879             }
2880         }
2881         /*
2882          * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
2883          * only sync them to KVM on the first cpu
2884          */
2885         if (current_cpu == first_cpu) {
2886             if (has_msr_hv_hypercall) {
2887                 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID,
2888                                   env->msr_hv_guest_os_id);
2889                 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL,
2890                                   env->msr_hv_hypercall);
2891             }
2892             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
2893                 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC,
2894                                   env->msr_hv_tsc);
2895             }
2896             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
2897                 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL,
2898                                   env->msr_hv_reenlightenment_control);
2899                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL,
2900                                   env->msr_hv_tsc_emulation_control);
2901                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
2902                                   env->msr_hv_tsc_emulation_status);
2903             }
2904         }
2905         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
2906             kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
2907                               env->msr_hv_vapic);
2908         }
2909         if (has_msr_hv_crash) {
2910             int j;
2911 
2912             for (j = 0; j < HV_CRASH_PARAMS; j++)
2913                 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j,
2914                                   env->msr_hv_crash_params[j]);
2915 
2916             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY);
2917         }
2918         if (has_msr_hv_runtime) {
2919             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime);
2920         }
2921         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)
2922             && hv_vpindex_settable) {
2923             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX,
2924                               hyperv_vp_index(CPU(cpu)));
2925         }
2926         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
2927             int j;
2928 
2929             kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION);
2930 
2931             kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL,
2932                               env->msr_hv_synic_control);
2933             kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP,
2934                               env->msr_hv_synic_evt_page);
2935             kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP,
2936                               env->msr_hv_synic_msg_page);
2937 
2938             for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) {
2939                 kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j,
2940                                   env->msr_hv_synic_sint[j]);
2941             }
2942         }
2943         if (has_msr_hv_stimer) {
2944             int j;
2945 
2946             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) {
2947                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2,
2948                                 env->msr_hv_stimer_config[j]);
2949             }
2950 
2951             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) {
2952                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2,
2953                                 env->msr_hv_stimer_count[j]);
2954             }
2955         }
2956         if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
2957             uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);
2958 
2959             kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
2960             kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
2961             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
2962             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
2963             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
2964             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
2965             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
2966             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
2967             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
2968             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
2969             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
2970             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
2971             for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
2972                 /* The CPU GPs if we write to a bit above the physical limit of
2973                  * the host CPU (and KVM emulates that)
2974                  */
2975                 uint64_t mask = env->mtrr_var[i].mask;
2976                 mask &= phys_mask;
2977 
2978                 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
2979                                   env->mtrr_var[i].base);
2980                 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
2981             }
2982         }
2983         if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
2984             int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
2985                                                     0x14, 1, R_EAX) & 0x7;
2986 
2987             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
2988                             env->msr_rtit_ctrl);
2989             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
2990                             env->msr_rtit_status);
2991             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
2992                             env->msr_rtit_output_base);
2993             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
2994                             env->msr_rtit_output_mask);
2995             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
2996                             env->msr_rtit_cr3_match);
2997             for (i = 0; i < addr_num; i++) {
2998                 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
2999                             env->msr_rtit_addrs[i]);
3000             }
3001         }
3002 
3003         /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
3004          *       kvm_put_msr_feature_control. */
3005     }
3006 
3007     if (env->mcg_cap) {
3008         int i;
3009 
3010         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status);
3011         kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl);
3012         if (has_msr_mcg_ext_ctl) {
3013             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
3014         }
3015         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3016             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]);
3017         }
3018     }
3019 
3020     return kvm_buf_set_msrs(cpu);
3021 }
3022 
3023 
3024 static int kvm_get_fpu(X86CPU *cpu)
3025 {
3026     CPUX86State *env = &cpu->env;
3027     struct kvm_fpu fpu;
3028     int i, ret;
3029 
3030     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
3031     if (ret < 0) {
3032         return ret;
3033     }
3034 
3035     env->fpstt = (fpu.fsw >> 11) & 7;
3036     env->fpus = fpu.fsw;
3037     env->fpuc = fpu.fcw;
3038     env->fpop = fpu.last_opcode;
3039     env->fpip = fpu.last_ip;
3040     env->fpdp = fpu.last_dp;
3041     for (i = 0; i < 8; ++i) {
3042         env->fptags[i] = !((fpu.ftwx >> i) & 1);
3043     }
3044     memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
3045     for (i = 0; i < CPU_NB_REGS; i++) {
3046         env->xmm_regs[i].ZMM_Q(0) = ldq_p(&fpu.xmm[i][0]);
3047         env->xmm_regs[i].ZMM_Q(1) = ldq_p(&fpu.xmm[i][8]);
3048     }
3049     env->mxcsr = fpu.mxcsr;
3050 
3051     return 0;
3052 }
3053 
3054 static int kvm_get_xsave(X86CPU *cpu)
3055 {
3056     CPUX86State *env = &cpu->env;
3057     X86XSaveArea *xsave = env->xsave_buf;
3058     int ret;
3059 
3060     if (!has_xsave) {
3061         return kvm_get_fpu(cpu);
3062     }
3063 
3064     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave);
3065     if (ret < 0) {
3066         return ret;
3067     }
3068     x86_cpu_xrstor_all_areas(cpu, xsave);
3069 
3070     return 0;
3071 }
3072 
3073 static int kvm_get_xcrs(X86CPU *cpu)
3074 {
3075     CPUX86State *env = &cpu->env;
3076     int i, ret;
3077     struct kvm_xcrs xcrs;
3078 
3079     if (!has_xcrs) {
3080         return 0;
3081     }
3082 
3083     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
3084     if (ret < 0) {
3085         return ret;
3086     }
3087 
3088     for (i = 0; i < xcrs.nr_xcrs; i++) {
3089         /* Only support xcr0 now */
3090         if (xcrs.xcrs[i].xcr == 0) {
3091             env->xcr0 = xcrs.xcrs[i].value;
3092             break;
3093         }
3094     }
3095     return 0;
3096 }
3097 
3098 static int kvm_get_sregs(X86CPU *cpu)
3099 {
3100     CPUX86State *env = &cpu->env;
3101     struct kvm_sregs sregs;
3102     int bit, i, ret;
3103 
3104     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
3105     if (ret < 0) {
3106         return ret;
3107     }
3108 
3109     /* There can only be one pending IRQ set in the bitmap at a time, so try
3110        to find it and save its number instead (-1 for none). */
3111     env->interrupt_injected = -1;
3112     for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
3113         if (sregs.interrupt_bitmap[i]) {
3114             bit = ctz64(sregs.interrupt_bitmap[i]);
3115             env->interrupt_injected = i * 64 + bit;
3116             break;
3117         }
3118     }
3119 
3120     get_seg(&env->segs[R_CS], &sregs.cs);
3121     get_seg(&env->segs[R_DS], &sregs.ds);
3122     get_seg(&env->segs[R_ES], &sregs.es);
3123     get_seg(&env->segs[R_FS], &sregs.fs);
3124     get_seg(&env->segs[R_GS], &sregs.gs);
3125     get_seg(&env->segs[R_SS], &sregs.ss);
3126 
3127     get_seg(&env->tr, &sregs.tr);
3128     get_seg(&env->ldt, &sregs.ldt);
3129 
3130     env->idt.limit = sregs.idt.limit;
3131     env->idt.base = sregs.idt.base;
3132     env->gdt.limit = sregs.gdt.limit;
3133     env->gdt.base = sregs.gdt.base;
3134 
3135     env->cr[0] = sregs.cr0;
3136     env->cr[2] = sregs.cr2;
3137     env->cr[3] = sregs.cr3;
3138     env->cr[4] = sregs.cr4;
3139 
3140     env->efer = sregs.efer;
3141 
3142     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3143     x86_update_hflags(env);
3144 
3145     return 0;
3146 }
3147 
3148 static int kvm_get_msrs(X86CPU *cpu)
3149 {
3150     CPUX86State *env = &cpu->env;
3151     struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
3152     int ret, i;
3153     uint64_t mtrr_top_bits;
3154 
3155     kvm_msr_buf_reset(cpu);
3156 
3157     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0);
3158     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0);
3159     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0);
3160     kvm_msr_entry_add(cpu, MSR_PAT, 0);
3161     if (has_msr_star) {
3162         kvm_msr_entry_add(cpu, MSR_STAR, 0);
3163     }
3164     if (has_msr_hsave_pa) {
3165         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0);
3166     }
3167     if (has_msr_tsc_aux) {
3168         kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0);
3169     }
3170     if (has_msr_tsc_adjust) {
3171         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0);
3172     }
3173     if (has_msr_tsc_deadline) {
3174         kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0);
3175     }
3176     if (has_msr_misc_enable) {
3177         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0);
3178     }
3179     if (has_msr_smbase) {
3180         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0);
3181     }
3182     if (has_msr_smi_count) {
3183         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0);
3184     }
3185     if (has_msr_feature_control) {
3186         kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
3187     }
3188     if (has_msr_bndcfgs) {
3189         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
3190     }
3191     if (has_msr_xss) {
3192         kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0);
3193     }
3194     if (has_msr_umwait) {
3195         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0);
3196     }
3197     if (has_msr_spec_ctrl) {
3198         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0);
3199     }
3200     if (has_msr_tsx_ctrl) {
3201         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0);
3202     }
3203     if (has_msr_virt_ssbd) {
3204         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
3205     }
3206     if (!env->tsc_valid) {
3207         kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
3208         env->tsc_valid = !runstate_is_running();
3209     }
3210 
3211 #ifdef TARGET_X86_64
3212     if (lm_capable_kernel) {
3213         kvm_msr_entry_add(cpu, MSR_CSTAR, 0);
3214         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
3215         kvm_msr_entry_add(cpu, MSR_FMASK, 0);
3216         kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
3217     }
3218 #endif
3219     kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
3220     kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
3221     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3222         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0);
3223     }
3224     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3225         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
3226     }
3227     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3228         kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
3229     }
3230     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3231         kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
3232     }
3233     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3234         kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
3235     }
3236     if (has_architectural_pmu_version > 0) {
3237         if (has_architectural_pmu_version > 1) {
3238             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3239             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3240             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0);
3241             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0);
3242         }
3243         for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3244             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0);
3245         }
3246         for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3247             kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0);
3248             kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0);
3249         }
3250     }
3251 
3252     if (env->mcg_cap) {
3253         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
3254         kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
3255         if (has_msr_mcg_ext_ctl) {
3256             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0);
3257         }
3258         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3259             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0);
3260         }
3261     }
3262 
3263     if (has_msr_hv_hypercall) {
3264         kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0);
3265         kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0);
3266     }
3267     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3268         kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0);
3269     }
3270     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3271         kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0);
3272     }
3273     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3274         kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
3275         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
3276         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0);
3277     }
3278     if (has_msr_hv_crash) {
3279         int j;
3280 
3281         for (j = 0; j < HV_CRASH_PARAMS; j++) {
3282             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0);
3283         }
3284     }
3285     if (has_msr_hv_runtime) {
3286         kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0);
3287     }
3288     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3289         uint32_t msr;
3290 
3291         kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0);
3292         kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0);
3293         kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0);
3294         for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) {
3295             kvm_msr_entry_add(cpu, msr, 0);
3296         }
3297     }
3298     if (has_msr_hv_stimer) {
3299         uint32_t msr;
3300 
3301         for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT;
3302              msr++) {
3303             kvm_msr_entry_add(cpu, msr, 0);
3304         }
3305     }
3306     if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3307         kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0);
3308         kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0);
3309         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0);
3310         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0);
3311         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0);
3312         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0);
3313         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0);
3314         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0);
3315         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0);
3316         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0);
3317         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0);
3318         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0);
3319         for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
3320             kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0);
3321             kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0);
3322         }
3323     }
3324 
3325     if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
3326         int addr_num =
3327             kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7;
3328 
3329         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0);
3330         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0);
3331         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0);
3332         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0);
3333         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0);
3334         for (i = 0; i < addr_num; i++) {
3335             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0);
3336         }
3337     }
3338 
3339     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
3340     if (ret < 0) {
3341         return ret;
3342     }
3343 
3344     if (ret < cpu->kvm_msr_buf->nmsrs) {
3345         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
3346         error_report("error: failed to get MSR 0x%" PRIx32,
3347                      (uint32_t)e->index);
3348     }
3349 
3350     assert(ret == cpu->kvm_msr_buf->nmsrs);
3351     /*
3352      * MTRR masks: Each mask consists of 5 parts
3353      * a  10..0: must be zero
3354      * b  11   : valid bit
3355      * c n-1.12: actual mask bits
3356      * d  51..n: reserved must be zero
3357      * e  63.52: reserved must be zero
3358      *
3359      * 'n' is the number of physical bits supported by the CPU and is
3360      * apparently always <= 52.   We know our 'n' but don't know what
3361      * the destination's 'n' is; it might be smaller, in which case
3362      * it masks (c) on loading. It might be larger, in which case
3363      * we fill 'd' so that d..c is consistent irrespective of the 'n'
3364      * we're migrating to.
3365      */
3366 
3367     if (cpu->fill_mtrr_mask) {
3368         QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52);
3369         assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS);
3370         mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits);
3371     } else {
3372         mtrr_top_bits = 0;
3373     }
3374 
3375     for (i = 0; i < ret; i++) {
3376         uint32_t index = msrs[i].index;
3377         switch (index) {
3378         case MSR_IA32_SYSENTER_CS:
3379             env->sysenter_cs = msrs[i].data;
3380             break;
3381         case MSR_IA32_SYSENTER_ESP:
3382             env->sysenter_esp = msrs[i].data;
3383             break;
3384         case MSR_IA32_SYSENTER_EIP:
3385             env->sysenter_eip = msrs[i].data;
3386             break;
3387         case MSR_PAT:
3388             env->pat = msrs[i].data;
3389             break;
3390         case MSR_STAR:
3391             env->star = msrs[i].data;
3392             break;
3393 #ifdef TARGET_X86_64
3394         case MSR_CSTAR:
3395             env->cstar = msrs[i].data;
3396             break;
3397         case MSR_KERNELGSBASE:
3398             env->kernelgsbase = msrs[i].data;
3399             break;
3400         case MSR_FMASK:
3401             env->fmask = msrs[i].data;
3402             break;
3403         case MSR_LSTAR:
3404             env->lstar = msrs[i].data;
3405             break;
3406 #endif
3407         case MSR_IA32_TSC:
3408             env->tsc = msrs[i].data;
3409             break;
3410         case MSR_TSC_AUX:
3411             env->tsc_aux = msrs[i].data;
3412             break;
3413         case MSR_TSC_ADJUST:
3414             env->tsc_adjust = msrs[i].data;
3415             break;
3416         case MSR_IA32_TSCDEADLINE:
3417             env->tsc_deadline = msrs[i].data;
3418             break;
3419         case MSR_VM_HSAVE_PA:
3420             env->vm_hsave = msrs[i].data;
3421             break;
3422         case MSR_KVM_SYSTEM_TIME:
3423             env->system_time_msr = msrs[i].data;
3424             break;
3425         case MSR_KVM_WALL_CLOCK:
3426             env->wall_clock_msr = msrs[i].data;
3427             break;
3428         case MSR_MCG_STATUS:
3429             env->mcg_status = msrs[i].data;
3430             break;
3431         case MSR_MCG_CTL:
3432             env->mcg_ctl = msrs[i].data;
3433             break;
3434         case MSR_MCG_EXT_CTL:
3435             env->mcg_ext_ctl = msrs[i].data;
3436             break;
3437         case MSR_IA32_MISC_ENABLE:
3438             env->msr_ia32_misc_enable = msrs[i].data;
3439             break;
3440         case MSR_IA32_SMBASE:
3441             env->smbase = msrs[i].data;
3442             break;
3443         case MSR_SMI_COUNT:
3444             env->msr_smi_count = msrs[i].data;
3445             break;
3446         case MSR_IA32_FEATURE_CONTROL:
3447             env->msr_ia32_feature_control = msrs[i].data;
3448             break;
3449         case MSR_IA32_BNDCFGS:
3450             env->msr_bndcfgs = msrs[i].data;
3451             break;
3452         case MSR_IA32_XSS:
3453             env->xss = msrs[i].data;
3454             break;
3455         case MSR_IA32_UMWAIT_CONTROL:
3456             env->umwait = msrs[i].data;
3457             break;
3458         default:
3459             if (msrs[i].index >= MSR_MC0_CTL &&
3460                 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
3461                 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
3462             }
3463             break;
3464         case MSR_KVM_ASYNC_PF_EN:
3465             env->async_pf_en_msr = msrs[i].data;
3466             break;
3467         case MSR_KVM_ASYNC_PF_INT:
3468             env->async_pf_int_msr = msrs[i].data;
3469             break;
3470         case MSR_KVM_PV_EOI_EN:
3471             env->pv_eoi_en_msr = msrs[i].data;
3472             break;
3473         case MSR_KVM_STEAL_TIME:
3474             env->steal_time_msr = msrs[i].data;
3475             break;
3476         case MSR_KVM_POLL_CONTROL: {
3477             env->poll_control_msr = msrs[i].data;
3478             break;
3479         }
3480         case MSR_CORE_PERF_FIXED_CTR_CTRL:
3481             env->msr_fixed_ctr_ctrl = msrs[i].data;
3482             break;
3483         case MSR_CORE_PERF_GLOBAL_CTRL:
3484             env->msr_global_ctrl = msrs[i].data;
3485             break;
3486         case MSR_CORE_PERF_GLOBAL_STATUS:
3487             env->msr_global_status = msrs[i].data;
3488             break;
3489         case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
3490             env->msr_global_ovf_ctrl = msrs[i].data;
3491             break;
3492         case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
3493             env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
3494             break;
3495         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
3496             env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
3497             break;
3498         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
3499             env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
3500             break;
3501         case HV_X64_MSR_HYPERCALL:
3502             env->msr_hv_hypercall = msrs[i].data;
3503             break;
3504         case HV_X64_MSR_GUEST_OS_ID:
3505             env->msr_hv_guest_os_id = msrs[i].data;
3506             break;
3507         case HV_X64_MSR_APIC_ASSIST_PAGE:
3508             env->msr_hv_vapic = msrs[i].data;
3509             break;
3510         case HV_X64_MSR_REFERENCE_TSC:
3511             env->msr_hv_tsc = msrs[i].data;
3512             break;
3513         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3514             env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data;
3515             break;
3516         case HV_X64_MSR_VP_RUNTIME:
3517             env->msr_hv_runtime = msrs[i].data;
3518             break;
3519         case HV_X64_MSR_SCONTROL:
3520             env->msr_hv_synic_control = msrs[i].data;
3521             break;
3522         case HV_X64_MSR_SIEFP:
3523             env->msr_hv_synic_evt_page = msrs[i].data;
3524             break;
3525         case HV_X64_MSR_SIMP:
3526             env->msr_hv_synic_msg_page = msrs[i].data;
3527             break;
3528         case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
3529             env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data;
3530             break;
3531         case HV_X64_MSR_STIMER0_CONFIG:
3532         case HV_X64_MSR_STIMER1_CONFIG:
3533         case HV_X64_MSR_STIMER2_CONFIG:
3534         case HV_X64_MSR_STIMER3_CONFIG:
3535             env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] =
3536                                 msrs[i].data;
3537             break;
3538         case HV_X64_MSR_STIMER0_COUNT:
3539         case HV_X64_MSR_STIMER1_COUNT:
3540         case HV_X64_MSR_STIMER2_COUNT:
3541         case HV_X64_MSR_STIMER3_COUNT:
3542             env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] =
3543                                 msrs[i].data;
3544             break;
3545         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3546             env->msr_hv_reenlightenment_control = msrs[i].data;
3547             break;
3548         case HV_X64_MSR_TSC_EMULATION_CONTROL:
3549             env->msr_hv_tsc_emulation_control = msrs[i].data;
3550             break;
3551         case HV_X64_MSR_TSC_EMULATION_STATUS:
3552             env->msr_hv_tsc_emulation_status = msrs[i].data;
3553             break;
3554         case MSR_MTRRdefType:
3555             env->mtrr_deftype = msrs[i].data;
3556             break;
3557         case MSR_MTRRfix64K_00000:
3558             env->mtrr_fixed[0] = msrs[i].data;
3559             break;
3560         case MSR_MTRRfix16K_80000:
3561             env->mtrr_fixed[1] = msrs[i].data;
3562             break;
3563         case MSR_MTRRfix16K_A0000:
3564             env->mtrr_fixed[2] = msrs[i].data;
3565             break;
3566         case MSR_MTRRfix4K_C0000:
3567             env->mtrr_fixed[3] = msrs[i].data;
3568             break;
3569         case MSR_MTRRfix4K_C8000:
3570             env->mtrr_fixed[4] = msrs[i].data;
3571             break;
3572         case MSR_MTRRfix4K_D0000:
3573             env->mtrr_fixed[5] = msrs[i].data;
3574             break;
3575         case MSR_MTRRfix4K_D8000:
3576             env->mtrr_fixed[6] = msrs[i].data;
3577             break;
3578         case MSR_MTRRfix4K_E0000:
3579             env->mtrr_fixed[7] = msrs[i].data;
3580             break;
3581         case MSR_MTRRfix4K_E8000:
3582             env->mtrr_fixed[8] = msrs[i].data;
3583             break;
3584         case MSR_MTRRfix4K_F0000:
3585             env->mtrr_fixed[9] = msrs[i].data;
3586             break;
3587         case MSR_MTRRfix4K_F8000:
3588             env->mtrr_fixed[10] = msrs[i].data;
3589             break;
3590         case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
3591             if (index & 1) {
3592                 env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
3593                                                                mtrr_top_bits;
3594             } else {
3595                 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
3596             }
3597             break;
3598         case MSR_IA32_SPEC_CTRL:
3599             env->spec_ctrl = msrs[i].data;
3600             break;
3601         case MSR_IA32_TSX_CTRL:
3602             env->tsx_ctrl = msrs[i].data;
3603             break;
3604         case MSR_VIRT_SSBD:
3605             env->virt_ssbd = msrs[i].data;
3606             break;
3607         case MSR_IA32_RTIT_CTL:
3608             env->msr_rtit_ctrl = msrs[i].data;
3609             break;
3610         case MSR_IA32_RTIT_STATUS:
3611             env->msr_rtit_status = msrs[i].data;
3612             break;
3613         case MSR_IA32_RTIT_OUTPUT_BASE:
3614             env->msr_rtit_output_base = msrs[i].data;
3615             break;
3616         case MSR_IA32_RTIT_OUTPUT_MASK:
3617             env->msr_rtit_output_mask = msrs[i].data;
3618             break;
3619         case MSR_IA32_RTIT_CR3_MATCH:
3620             env->msr_rtit_cr3_match = msrs[i].data;
3621             break;
3622         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
3623             env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data;
3624             break;
3625         }
3626     }
3627 
3628     return 0;
3629 }
3630 
3631 static int kvm_put_mp_state(X86CPU *cpu)
3632 {
3633     struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
3634 
3635     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
3636 }
3637 
3638 static int kvm_get_mp_state(X86CPU *cpu)
3639 {
3640     CPUState *cs = CPU(cpu);
3641     CPUX86State *env = &cpu->env;
3642     struct kvm_mp_state mp_state;
3643     int ret;
3644 
3645     ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
3646     if (ret < 0) {
3647         return ret;
3648     }
3649     env->mp_state = mp_state.mp_state;
3650     if (kvm_irqchip_in_kernel()) {
3651         cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
3652     }
3653     return 0;
3654 }
3655 
3656 static int kvm_get_apic(X86CPU *cpu)
3657 {
3658     DeviceState *apic = cpu->apic_state;
3659     struct kvm_lapic_state kapic;
3660     int ret;
3661 
3662     if (apic && kvm_irqchip_in_kernel()) {
3663         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
3664         if (ret < 0) {
3665             return ret;
3666         }
3667 
3668         kvm_get_apic_state(apic, &kapic);
3669     }
3670     return 0;
3671 }
3672 
3673 static int kvm_put_vcpu_events(X86CPU *cpu, int level)
3674 {
3675     CPUState *cs = CPU(cpu);
3676     CPUX86State *env = &cpu->env;
3677     struct kvm_vcpu_events events = {};
3678 
3679     if (!kvm_has_vcpu_events()) {
3680         return 0;
3681     }
3682 
3683     events.flags = 0;
3684 
3685     if (has_exception_payload) {
3686         events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
3687         events.exception.pending = env->exception_pending;
3688         events.exception_has_payload = env->exception_has_payload;
3689         events.exception_payload = env->exception_payload;
3690     }
3691     events.exception.nr = env->exception_nr;
3692     events.exception.injected = env->exception_injected;
3693     events.exception.has_error_code = env->has_error_code;
3694     events.exception.error_code = env->error_code;
3695 
3696     events.interrupt.injected = (env->interrupt_injected >= 0);
3697     events.interrupt.nr = env->interrupt_injected;
3698     events.interrupt.soft = env->soft_interrupt;
3699 
3700     events.nmi.injected = env->nmi_injected;
3701     events.nmi.pending = env->nmi_pending;
3702     events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
3703 
3704     events.sipi_vector = env->sipi_vector;
3705 
3706     if (has_msr_smbase) {
3707         events.smi.smm = !!(env->hflags & HF_SMM_MASK);
3708         events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK);
3709         if (kvm_irqchip_in_kernel()) {
3710             /* As soon as these are moved to the kernel, remove them
3711              * from cs->interrupt_request.
3712              */
3713             events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
3714             events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
3715             cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
3716         } else {
3717             /* Keep these in cs->interrupt_request.  */
3718             events.smi.pending = 0;
3719             events.smi.latched_init = 0;
3720         }
3721         /* Stop SMI delivery on old machine types to avoid a reboot
3722          * on an inward migration of an old VM.
3723          */
3724         if (!cpu->kvm_no_smi_migration) {
3725             events.flags |= KVM_VCPUEVENT_VALID_SMM;
3726         }
3727     }
3728 
3729     if (level >= KVM_PUT_RESET_STATE) {
3730         events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
3731         if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
3732             events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR;
3733         }
3734     }
3735 
3736     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
3737 }
3738 
3739 static int kvm_get_vcpu_events(X86CPU *cpu)
3740 {
3741     CPUX86State *env = &cpu->env;
3742     struct kvm_vcpu_events events;
3743     int ret;
3744 
3745     if (!kvm_has_vcpu_events()) {
3746         return 0;
3747     }
3748 
3749     memset(&events, 0, sizeof(events));
3750     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
3751     if (ret < 0) {
3752        return ret;
3753     }
3754 
3755     if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
3756         env->exception_pending = events.exception.pending;
3757         env->exception_has_payload = events.exception_has_payload;
3758         env->exception_payload = events.exception_payload;
3759     } else {
3760         env->exception_pending = 0;
3761         env->exception_has_payload = false;
3762     }
3763     env->exception_injected = events.exception.injected;
3764     env->exception_nr =
3765         (env->exception_pending || env->exception_injected) ?
3766         events.exception.nr : -1;
3767     env->has_error_code = events.exception.has_error_code;
3768     env->error_code = events.exception.error_code;
3769 
3770     env->interrupt_injected =
3771         events.interrupt.injected ? events.interrupt.nr : -1;
3772     env->soft_interrupt = events.interrupt.soft;
3773 
3774     env->nmi_injected = events.nmi.injected;
3775     env->nmi_pending = events.nmi.pending;
3776     if (events.nmi.masked) {
3777         env->hflags2 |= HF2_NMI_MASK;
3778     } else {
3779         env->hflags2 &= ~HF2_NMI_MASK;
3780     }
3781 
3782     if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
3783         if (events.smi.smm) {
3784             env->hflags |= HF_SMM_MASK;
3785         } else {
3786             env->hflags &= ~HF_SMM_MASK;
3787         }
3788         if (events.smi.pending) {
3789             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
3790         } else {
3791             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
3792         }
3793         if (events.smi.smm_inside_nmi) {
3794             env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
3795         } else {
3796             env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
3797         }
3798         if (events.smi.latched_init) {
3799             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
3800         } else {
3801             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
3802         }
3803     }
3804 
3805     env->sipi_vector = events.sipi_vector;
3806 
3807     return 0;
3808 }
3809 
3810 static int kvm_guest_debug_workarounds(X86CPU *cpu)
3811 {
3812     CPUState *cs = CPU(cpu);
3813     CPUX86State *env = &cpu->env;
3814     int ret = 0;
3815     unsigned long reinject_trap = 0;
3816 
3817     if (!kvm_has_vcpu_events()) {
3818         if (env->exception_nr == EXCP01_DB) {
3819             reinject_trap = KVM_GUESTDBG_INJECT_DB;
3820         } else if (env->exception_injected == EXCP03_INT3) {
3821             reinject_trap = KVM_GUESTDBG_INJECT_BP;
3822         }
3823         kvm_reset_exception(env);
3824     }
3825 
3826     /*
3827      * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
3828      * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
3829      * by updating the debug state once again if single-stepping is on.
3830      * Another reason to call kvm_update_guest_debug here is a pending debug
3831      * trap raise by the guest. On kernels without SET_VCPU_EVENTS we have to
3832      * reinject them via SET_GUEST_DEBUG.
3833      */
3834     if (reinject_trap ||
3835         (!kvm_has_robust_singlestep() && cs->singlestep_enabled)) {
3836         ret = kvm_update_guest_debug(cs, reinject_trap);
3837     }
3838     return ret;
3839 }
3840 
3841 static int kvm_put_debugregs(X86CPU *cpu)
3842 {
3843     CPUX86State *env = &cpu->env;
3844     struct kvm_debugregs dbgregs;
3845     int i;
3846 
3847     if (!kvm_has_debugregs()) {
3848         return 0;
3849     }
3850 
3851     memset(&dbgregs, 0, sizeof(dbgregs));
3852     for (i = 0; i < 4; i++) {
3853         dbgregs.db[i] = env->dr[i];
3854     }
3855     dbgregs.dr6 = env->dr[6];
3856     dbgregs.dr7 = env->dr[7];
3857     dbgregs.flags = 0;
3858 
3859     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
3860 }
3861 
3862 static int kvm_get_debugregs(X86CPU *cpu)
3863 {
3864     CPUX86State *env = &cpu->env;
3865     struct kvm_debugregs dbgregs;
3866     int i, ret;
3867 
3868     if (!kvm_has_debugregs()) {
3869         return 0;
3870     }
3871 
3872     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
3873     if (ret < 0) {
3874         return ret;
3875     }
3876     for (i = 0; i < 4; i++) {
3877         env->dr[i] = dbgregs.db[i];
3878     }
3879     env->dr[4] = env->dr[6] = dbgregs.dr6;
3880     env->dr[5] = env->dr[7] = dbgregs.dr7;
3881 
3882     return 0;
3883 }
3884 
3885 static int kvm_put_nested_state(X86CPU *cpu)
3886 {
3887     CPUX86State *env = &cpu->env;
3888     int max_nested_state_len = kvm_max_nested_state_length();
3889 
3890     if (!env->nested_state) {
3891         return 0;
3892     }
3893 
3894     /*
3895      * Copy flags that are affected by reset from env->hflags and env->hflags2.
3896      */
3897     if (env->hflags & HF_GUEST_MASK) {
3898         env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
3899     } else {
3900         env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
3901     }
3902 
3903     /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */
3904     if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) {
3905         env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
3906     } else {
3907         env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
3908     }
3909 
3910     assert(env->nested_state->size <= max_nested_state_len);
3911     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
3912 }
3913 
3914 static int kvm_get_nested_state(X86CPU *cpu)
3915 {
3916     CPUX86State *env = &cpu->env;
3917     int max_nested_state_len = kvm_max_nested_state_length();
3918     int ret;
3919 
3920     if (!env->nested_state) {
3921         return 0;
3922     }
3923 
3924     /*
3925      * It is possible that migration restored a smaller size into
3926      * nested_state->hdr.size than what our kernel support.
3927      * We preserve migration origin nested_state->hdr.size for
3928      * call to KVM_SET_NESTED_STATE but wish that our next call
3929      * to KVM_GET_NESTED_STATE will use max size our kernel support.
3930      */
3931     env->nested_state->size = max_nested_state_len;
3932 
3933     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
3934     if (ret < 0) {
3935         return ret;
3936     }
3937 
3938     /*
3939      * Copy flags that are affected by reset to env->hflags and env->hflags2.
3940      */
3941     if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
3942         env->hflags |= HF_GUEST_MASK;
3943     } else {
3944         env->hflags &= ~HF_GUEST_MASK;
3945     }
3946 
3947     /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */
3948     if (cpu_has_svm(env)) {
3949         if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) {
3950             env->hflags2 |= HF2_GIF_MASK;
3951         } else {
3952             env->hflags2 &= ~HF2_GIF_MASK;
3953         }
3954     }
3955 
3956     return ret;
3957 }
3958 
3959 int kvm_arch_put_registers(CPUState *cpu, int level)
3960 {
3961     X86CPU *x86_cpu = X86_CPU(cpu);
3962     int ret;
3963 
3964     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
3965 
3966     /* must be before kvm_put_nested_state so that EFER.SVME is set */
3967     ret = kvm_put_sregs(x86_cpu);
3968     if (ret < 0) {
3969         return ret;
3970     }
3971 
3972     if (level >= KVM_PUT_RESET_STATE) {
3973         ret = kvm_put_nested_state(x86_cpu);
3974         if (ret < 0) {
3975             return ret;
3976         }
3977 
3978         ret = kvm_put_msr_feature_control(x86_cpu);
3979         if (ret < 0) {
3980             return ret;
3981         }
3982     }
3983 
3984     if (level == KVM_PUT_FULL_STATE) {
3985         /* We don't check for kvm_arch_set_tsc_khz() errors here,
3986          * because TSC frequency mismatch shouldn't abort migration,
3987          * unless the user explicitly asked for a more strict TSC
3988          * setting (e.g. using an explicit "tsc-freq" option).
3989          */
3990         kvm_arch_set_tsc_khz(cpu);
3991     }
3992 
3993     ret = kvm_getput_regs(x86_cpu, 1);
3994     if (ret < 0) {
3995         return ret;
3996     }
3997     ret = kvm_put_xsave(x86_cpu);
3998     if (ret < 0) {
3999         return ret;
4000     }
4001     ret = kvm_put_xcrs(x86_cpu);
4002     if (ret < 0) {
4003         return ret;
4004     }
4005     /* must be before kvm_put_msrs */
4006     ret = kvm_inject_mce_oldstyle(x86_cpu);
4007     if (ret < 0) {
4008         return ret;
4009     }
4010     ret = kvm_put_msrs(x86_cpu, level);
4011     if (ret < 0) {
4012         return ret;
4013     }
4014     ret = kvm_put_vcpu_events(x86_cpu, level);
4015     if (ret < 0) {
4016         return ret;
4017     }
4018     if (level >= KVM_PUT_RESET_STATE) {
4019         ret = kvm_put_mp_state(x86_cpu);
4020         if (ret < 0) {
4021             return ret;
4022         }
4023     }
4024 
4025     ret = kvm_put_tscdeadline_msr(x86_cpu);
4026     if (ret < 0) {
4027         return ret;
4028     }
4029     ret = kvm_put_debugregs(x86_cpu);
4030     if (ret < 0) {
4031         return ret;
4032     }
4033     /* must be last */
4034     ret = kvm_guest_debug_workarounds(x86_cpu);
4035     if (ret < 0) {
4036         return ret;
4037     }
4038     return 0;
4039 }
4040 
4041 int kvm_arch_get_registers(CPUState *cs)
4042 {
4043     X86CPU *cpu = X86_CPU(cs);
4044     int ret;
4045 
4046     assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));
4047 
4048     ret = kvm_get_vcpu_events(cpu);
4049     if (ret < 0) {
4050         goto out;
4051     }
4052     /*
4053      * KVM_GET_MPSTATE can modify CS and RIP, call it before
4054      * KVM_GET_REGS and KVM_GET_SREGS.
4055      */
4056     ret = kvm_get_mp_state(cpu);
4057     if (ret < 0) {
4058         goto out;
4059     }
4060     ret = kvm_getput_regs(cpu, 0);
4061     if (ret < 0) {
4062         goto out;
4063     }
4064     ret = kvm_get_xsave(cpu);
4065     if (ret < 0) {
4066         goto out;
4067     }
4068     ret = kvm_get_xcrs(cpu);
4069     if (ret < 0) {
4070         goto out;
4071     }
4072     ret = kvm_get_sregs(cpu);
4073     if (ret < 0) {
4074         goto out;
4075     }
4076     ret = kvm_get_msrs(cpu);
4077     if (ret < 0) {
4078         goto out;
4079     }
4080     ret = kvm_get_apic(cpu);
4081     if (ret < 0) {
4082         goto out;
4083     }
4084     ret = kvm_get_debugregs(cpu);
4085     if (ret < 0) {
4086         goto out;
4087     }
4088     ret = kvm_get_nested_state(cpu);
4089     if (ret < 0) {
4090         goto out;
4091     }
4092     ret = 0;
4093  out:
4094     cpu_sync_bndcs_hflags(&cpu->env);
4095     return ret;
4096 }
4097 
4098 void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
4099 {
4100     X86CPU *x86_cpu = X86_CPU(cpu);
4101     CPUX86State *env = &x86_cpu->env;
4102     int ret;
4103 
4104     /* Inject NMI */
4105     if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
4106         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
4107             qemu_mutex_lock_iothread();
4108             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
4109             qemu_mutex_unlock_iothread();
4110             DPRINTF("injected NMI\n");
4111             ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
4112             if (ret < 0) {
4113                 fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
4114                         strerror(-ret));
4115             }
4116         }
4117         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
4118             qemu_mutex_lock_iothread();
4119             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
4120             qemu_mutex_unlock_iothread();
4121             DPRINTF("injected SMI\n");
4122             ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
4123             if (ret < 0) {
4124                 fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
4125                         strerror(-ret));
4126             }
4127         }
4128     }
4129 
4130     if (!kvm_pic_in_kernel()) {
4131         qemu_mutex_lock_iothread();
4132     }
4133 
4134     /* Force the VCPU out of its inner loop to process any INIT requests
4135      * or (for userspace APIC, but it is cheap to combine the checks here)
4136      * pending TPR access reports.
4137      */
4138     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
4139         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
4140             !(env->hflags & HF_SMM_MASK)) {
4141             cpu->exit_request = 1;
4142         }
4143         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
4144             cpu->exit_request = 1;
4145         }
4146     }
4147 
4148     if (!kvm_pic_in_kernel()) {
4149         /* Try to inject an interrupt if the guest can accept it */
4150         if (run->ready_for_interrupt_injection &&
4151             (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
4152             (env->eflags & IF_MASK)) {
4153             int irq;
4154 
4155             cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
4156             irq = cpu_get_pic_interrupt(env);
4157             if (irq >= 0) {
4158                 struct kvm_interrupt intr;
4159 
4160                 intr.irq = irq;
4161                 DPRINTF("injected interrupt %d\n", irq);
4162                 ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
4163                 if (ret < 0) {
4164                     fprintf(stderr,
4165                             "KVM: injection failed, interrupt lost (%s)\n",
4166                             strerror(-ret));
4167                 }
4168             }
4169         }
4170 
4171         /* If we have an interrupt but the guest is not ready to receive an
4172          * interrupt, request an interrupt window exit.  This will
4173          * cause a return to userspace as soon as the guest is ready to
4174          * receive interrupts. */
4175         if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
4176             run->request_interrupt_window = 1;
4177         } else {
4178             run->request_interrupt_window = 0;
4179         }
4180 
4181         DPRINTF("setting tpr\n");
4182         run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
4183 
4184         qemu_mutex_unlock_iothread();
4185     }
4186 }
4187 
4188 MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
4189 {
4190     X86CPU *x86_cpu = X86_CPU(cpu);
4191     CPUX86State *env = &x86_cpu->env;
4192 
4193     if (run->flags & KVM_RUN_X86_SMM) {
4194         env->hflags |= HF_SMM_MASK;
4195     } else {
4196         env->hflags &= ~HF_SMM_MASK;
4197     }
4198     if (run->if_flag) {
4199         env->eflags |= IF_MASK;
4200     } else {
4201         env->eflags &= ~IF_MASK;
4202     }
4203 
4204     /* We need to protect the apic state against concurrent accesses from
4205      * different threads in case the userspace irqchip is used. */
4206     if (!kvm_irqchip_in_kernel()) {
4207         qemu_mutex_lock_iothread();
4208     }
4209     cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
4210     cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
4211     if (!kvm_irqchip_in_kernel()) {
4212         qemu_mutex_unlock_iothread();
4213     }
4214     return cpu_get_mem_attrs(env);
4215 }
4216 
4217 int kvm_arch_process_async_events(CPUState *cs)
4218 {
4219     X86CPU *cpu = X86_CPU(cs);
4220     CPUX86State *env = &cpu->env;
4221 
4222     if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
4223         /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
4224         assert(env->mcg_cap);
4225 
4226         cs->interrupt_request &= ~CPU_INTERRUPT_MCE;
4227 
4228         kvm_cpu_synchronize_state(cs);
4229 
4230         if (env->exception_nr == EXCP08_DBLE) {
4231             /* this means triple fault */
4232             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
4233             cs->exit_request = 1;
4234             return 0;
4235         }
4236         kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
4237         env->has_error_code = 0;
4238 
4239         cs->halted = 0;
4240         if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
4241             env->mp_state = KVM_MP_STATE_RUNNABLE;
4242         }
4243     }
4244 
4245     if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
4246         !(env->hflags & HF_SMM_MASK)) {
4247         kvm_cpu_synchronize_state(cs);
4248         do_cpu_init(cpu);
4249     }
4250 
4251     if (kvm_irqchip_in_kernel()) {
4252         return 0;
4253     }
4254 
4255     if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
4256         cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
4257         apic_poll_irq(cpu->apic_state);
4258     }
4259     if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
4260          (env->eflags & IF_MASK)) ||
4261         (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
4262         cs->halted = 0;
4263     }
4264     if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
4265         kvm_cpu_synchronize_state(cs);
4266         do_cpu_sipi(cpu);
4267     }
4268     if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
4269         cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
4270         kvm_cpu_synchronize_state(cs);
4271         apic_handle_tpr_access_report(cpu->apic_state, env->eip,
4272                                       env->tpr_access_type);
4273     }
4274 
4275     return cs->halted;
4276 }
4277 
4278 static int kvm_handle_halt(X86CPU *cpu)
4279 {
4280     CPUState *cs = CPU(cpu);
4281     CPUX86State *env = &cpu->env;
4282 
4283     if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
4284           (env->eflags & IF_MASK)) &&
4285         !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
4286         cs->halted = 1;
4287         return EXCP_HLT;
4288     }
4289 
4290     return 0;
4291 }
4292 
4293 static int kvm_handle_tpr_access(X86CPU *cpu)
4294 {
4295     CPUState *cs = CPU(cpu);
4296     struct kvm_run *run = cs->kvm_run;
4297 
4298     apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip,
4299                                   run->tpr_access.is_write ? TPR_ACCESS_WRITE
4300                                                            : TPR_ACCESS_READ);
4301     return 1;
4302 }
4303 
4304 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
4305 {
4306     static const uint8_t int3 = 0xcc;
4307 
4308     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
4309         cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) {
4310         return -EINVAL;
4311     }
4312     return 0;
4313 }
4314 
4315 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
4316 {
4317     uint8_t int3;
4318 
4319     if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0) || int3 != 0xcc ||
4320         cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
4321         return -EINVAL;
4322     }
4323     return 0;
4324 }
4325 
4326 static struct {
4327     target_ulong addr;
4328     int len;
4329     int type;
4330 } hw_breakpoint[4];
4331 
4332 static int nb_hw_breakpoint;
4333 
4334 static int find_hw_breakpoint(target_ulong addr, int len, int type)
4335 {
4336     int n;
4337 
4338     for (n = 0; n < nb_hw_breakpoint; n++) {
4339         if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
4340             (hw_breakpoint[n].len == len || len == -1)) {
4341             return n;
4342         }
4343     }
4344     return -1;
4345 }
4346 
4347 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
4348                                   target_ulong len, int type)
4349 {
4350     switch (type) {
4351     case GDB_BREAKPOINT_HW:
4352         len = 1;
4353         break;
4354     case GDB_WATCHPOINT_WRITE:
4355     case GDB_WATCHPOINT_ACCESS:
4356         switch (len) {
4357         case 1:
4358             break;
4359         case 2:
4360         case 4:
4361         case 8:
4362             if (addr & (len - 1)) {
4363                 return -EINVAL;
4364             }
4365             break;
4366         default:
4367             return -EINVAL;
4368         }
4369         break;
4370     default:
4371         return -ENOSYS;
4372     }
4373 
4374     if (nb_hw_breakpoint == 4) {
4375         return -ENOBUFS;
4376     }
4377     if (find_hw_breakpoint(addr, len, type) >= 0) {
4378         return -EEXIST;
4379     }
4380     hw_breakpoint[nb_hw_breakpoint].addr = addr;
4381     hw_breakpoint[nb_hw_breakpoint].len = len;
4382     hw_breakpoint[nb_hw_breakpoint].type = type;
4383     nb_hw_breakpoint++;
4384 
4385     return 0;
4386 }
4387 
4388 int kvm_arch_remove_hw_breakpoint(target_ulong addr,
4389                                   target_ulong len, int type)
4390 {
4391     int n;
4392 
4393     n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
4394     if (n < 0) {
4395         return -ENOENT;
4396     }
4397     nb_hw_breakpoint--;
4398     hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];
4399 
4400     return 0;
4401 }
4402 
4403 void kvm_arch_remove_all_hw_breakpoints(void)
4404 {
4405     nb_hw_breakpoint = 0;
4406 }
4407 
4408 static CPUWatchpoint hw_watchpoint;
4409 
4410 static int kvm_handle_debug(X86CPU *cpu,
4411                             struct kvm_debug_exit_arch *arch_info)
4412 {
4413     CPUState *cs = CPU(cpu);
4414     CPUX86State *env = &cpu->env;
4415     int ret = 0;
4416     int n;
4417 
4418     if (arch_info->exception == EXCP01_DB) {
4419         if (arch_info->dr6 & DR6_BS) {
4420             if (cs->singlestep_enabled) {
4421                 ret = EXCP_DEBUG;
4422             }
4423         } else {
4424             for (n = 0; n < 4; n++) {
4425                 if (arch_info->dr6 & (1 << n)) {
4426                     switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
4427                     case 0x0:
4428                         ret = EXCP_DEBUG;
4429                         break;
4430                     case 0x1:
4431                         ret = EXCP_DEBUG;
4432                         cs->watchpoint_hit = &hw_watchpoint;
4433                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
4434                         hw_watchpoint.flags = BP_MEM_WRITE;
4435                         break;
4436                     case 0x3:
4437                         ret = EXCP_DEBUG;
4438                         cs->watchpoint_hit = &hw_watchpoint;
4439                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
4440                         hw_watchpoint.flags = BP_MEM_ACCESS;
4441                         break;
4442                     }
4443                 }
4444             }
4445         }
4446     } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
4447         ret = EXCP_DEBUG;
4448     }
4449     if (ret == 0) {
4450         cpu_synchronize_state(cs);
4451         assert(env->exception_nr == -1);
4452 
4453         /* pass to guest */
4454         kvm_queue_exception(env, arch_info->exception,
4455                             arch_info->exception == EXCP01_DB,
4456                             arch_info->dr6);
4457         env->has_error_code = 0;
4458     }
4459 
4460     return ret;
4461 }
4462 
4463 void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
4464 {
4465     const uint8_t type_code[] = {
4466         [GDB_BREAKPOINT_HW] = 0x0,
4467         [GDB_WATCHPOINT_WRITE] = 0x1,
4468         [GDB_WATCHPOINT_ACCESS] = 0x3
4469     };
4470     const uint8_t len_code[] = {
4471         [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
4472     };
4473     int n;
4474 
4475     if (kvm_sw_breakpoints_active(cpu)) {
4476         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
4477     }
4478     if (nb_hw_breakpoint > 0) {
4479         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
4480         dbg->arch.debugreg[7] = 0x0600;
4481         for (n = 0; n < nb_hw_breakpoint; n++) {
4482             dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
4483             dbg->arch.debugreg[7] |= (2 << (n * 2)) |
4484                 (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
4485                 ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
4486         }
4487     }
4488 }
4489 
4490 static bool host_supports_vmx(void)
4491 {
4492     uint32_t ecx, unused;
4493 
4494     host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
4495     return ecx & CPUID_EXT_VMX;
4496 }
4497 
4498 #define VMX_INVALID_GUEST_STATE 0x80000021
4499 
4500 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
4501 {
4502     X86CPU *cpu = X86_CPU(cs);
4503     uint64_t code;
4504     int ret;
4505 
4506     switch (run->exit_reason) {
4507     case KVM_EXIT_HLT:
4508         DPRINTF("handle_hlt\n");
4509         qemu_mutex_lock_iothread();
4510         ret = kvm_handle_halt(cpu);
4511         qemu_mutex_unlock_iothread();
4512         break;
4513     case KVM_EXIT_SET_TPR:
4514         ret = 0;
4515         break;
4516     case KVM_EXIT_TPR_ACCESS:
4517         qemu_mutex_lock_iothread();
4518         ret = kvm_handle_tpr_access(cpu);
4519         qemu_mutex_unlock_iothread();
4520         break;
4521     case KVM_EXIT_FAIL_ENTRY:
4522         code = run->fail_entry.hardware_entry_failure_reason;
4523         fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
4524                 code);
4525         if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
4526             fprintf(stderr,
4527                     "\nIf you're running a guest on an Intel machine without "
4528                         "unrestricted mode\n"
4529                     "support, the failure can be most likely due to the guest "
4530                         "entering an invalid\n"
4531                     "state for Intel VT. For example, the guest maybe running "
4532                         "in big real mode\n"
4533                     "which is not supported on less recent Intel processors."
4534                         "\n\n");
4535         }
4536         ret = -1;
4537         break;
4538     case KVM_EXIT_EXCEPTION:
4539         fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
4540                 run->ex.exception, run->ex.error_code);
4541         ret = -1;
4542         break;
4543     case KVM_EXIT_DEBUG:
4544         DPRINTF("kvm_exit_debug\n");
4545         qemu_mutex_lock_iothread();
4546         ret = kvm_handle_debug(cpu, &run->debug.arch);
4547         qemu_mutex_unlock_iothread();
4548         break;
4549     case KVM_EXIT_HYPERV:
4550         ret = kvm_hv_handle_exit(cpu, &run->hyperv);
4551         break;
4552     case KVM_EXIT_IOAPIC_EOI:
4553         ioapic_eoi_broadcast(run->eoi.vector);
4554         ret = 0;
4555         break;
4556     default:
4557         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
4558         ret = -1;
4559         break;
4560     }
4561 
4562     return ret;
4563 }
4564 
4565 bool kvm_arch_stop_on_emulation_error(CPUState *cs)
4566 {
4567     X86CPU *cpu = X86_CPU(cs);
4568     CPUX86State *env = &cpu->env;
4569 
4570     kvm_cpu_synchronize_state(cs);
4571     return !(env->cr[0] & CR0_PE_MASK) ||
4572            ((env->segs[R_CS].selector  & 3) != 3);
4573 }
4574 
4575 void kvm_arch_init_irq_routing(KVMState *s)
4576 {
4577     /* We know at this point that we're using the in-kernel
4578      * irqchip, so we can use irqfds, and on x86 we know
4579      * we can use msi via irqfd and GSI routing.
4580      */
4581     kvm_msi_via_irqfd_allowed = true;
4582     kvm_gsi_routing_allowed = true;
4583 
4584     if (kvm_irqchip_is_split()) {
4585         int i;
4586 
4587         /* If the ioapic is in QEMU and the lapics are in KVM, reserve
4588            MSI routes for signaling interrupts to the local apics. */
4589         for (i = 0; i < IOAPIC_NUM_PINS; i++) {
4590             if (kvm_irqchip_add_msi_route(s, 0, NULL) < 0) {
4591                 error_report("Could not enable split IRQ mode.");
4592                 exit(1);
4593             }
4594         }
4595     }
4596 }
4597 
4598 int kvm_arch_irqchip_create(KVMState *s)
4599 {
4600     int ret;
4601     if (kvm_kernel_irqchip_split()) {
4602         ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24);
4603         if (ret) {
4604             error_report("Could not enable split irqchip mode: %s",
4605                          strerror(-ret));
4606             exit(1);
4607         } else {
4608             DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n");
4609             kvm_split_irqchip = true;
4610             return 1;
4611         }
4612     } else {
4613         return 0;
4614     }
4615 }
4616 
4617 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)
4618 {
4619     CPUX86State *env;
4620     uint64_t ext_id;
4621 
4622     if (!first_cpu) {
4623         return address;
4624     }
4625     env = &X86_CPU(first_cpu)->env;
4626     if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) {
4627         return address;
4628     }
4629 
4630     /*
4631      * If the remappable format bit is set, or the upper bits are
4632      * already set in address_hi, or the low extended bits aren't
4633      * there anyway, do nothing.
4634      */
4635     ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT);
4636     if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) {
4637         return address;
4638     }
4639 
4640     address &= ~ext_id;
4641     address |= ext_id << 35;
4642     return address;
4643 }
4644 
4645 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
4646                              uint64_t address, uint32_t data, PCIDevice *dev)
4647 {
4648     X86IOMMUState *iommu = x86_iommu_get_default();
4649 
4650     if (iommu) {
4651         X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu);
4652 
4653         if (class->int_remap) {
4654             int ret;
4655             MSIMessage src, dst;
4656 
4657             src.address = route->u.msi.address_hi;
4658             src.address <<= VTD_MSI_ADDR_HI_SHIFT;
4659             src.address |= route->u.msi.address_lo;
4660             src.data = route->u.msi.data;
4661 
4662             ret = class->int_remap(iommu, &src, &dst, dev ?     \
4663                                    pci_requester_id(dev) :      \
4664                                    X86_IOMMU_SID_INVALID);
4665             if (ret) {
4666                 trace_kvm_x86_fixup_msi_error(route->gsi);
4667                 return 1;
4668             }
4669 
4670             /*
4671              * Handled untranslated compatibilty format interrupt with
4672              * extended destination ID in the low bits 11-5. */
4673             dst.address = kvm_swizzle_msi_ext_dest_id(dst.address);
4674 
4675             route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
4676             route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
4677             route->u.msi.data = dst.data;
4678             return 0;
4679         }
4680     }
4681 
4682     address = kvm_swizzle_msi_ext_dest_id(address);
4683     route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT;
4684     route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK;
4685     return 0;
4686 }
4687 
4688 typedef struct MSIRouteEntry MSIRouteEntry;
4689 
4690 struct MSIRouteEntry {
4691     PCIDevice *dev;             /* Device pointer */
4692     int vector;                 /* MSI/MSIX vector index */
4693     int virq;                   /* Virtual IRQ index */
4694     QLIST_ENTRY(MSIRouteEntry) list;
4695 };
4696 
4697 /* List of used GSI routes */
4698 static QLIST_HEAD(, MSIRouteEntry) msi_route_list = \
4699     QLIST_HEAD_INITIALIZER(msi_route_list);
4700 
4701 static void kvm_update_msi_routes_all(void *private, bool global,
4702                                       uint32_t index, uint32_t mask)
4703 {
4704     int cnt = 0, vector;
4705     MSIRouteEntry *entry;
4706     MSIMessage msg;
4707     PCIDevice *dev;
4708 
4709     /* TODO: explicit route update */
4710     QLIST_FOREACH(entry, &msi_route_list, list) {
4711         cnt++;
4712         vector = entry->vector;
4713         dev = entry->dev;
4714         if (msix_enabled(dev) && !msix_is_masked(dev, vector)) {
4715             msg = msix_get_message(dev, vector);
4716         } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) {
4717             msg = msi_get_message(dev, vector);
4718         } else {
4719             /*
4720              * Either MSI/MSIX is disabled for the device, or the
4721              * specific message was masked out.  Skip this one.
4722              */
4723             continue;
4724         }
4725         kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
4726     }
4727     kvm_irqchip_commit_routes(kvm_state);
4728     trace_kvm_x86_update_msi_routes(cnt);
4729 }
4730 
4731 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
4732                                 int vector, PCIDevice *dev)
4733 {
4734     static bool notify_list_inited = false;
4735     MSIRouteEntry *entry;
4736 
4737     if (!dev) {
4738         /* These are (possibly) IOAPIC routes only used for split
4739          * kernel irqchip mode, while what we are housekeeping are
4740          * PCI devices only. */
4741         return 0;
4742     }
4743 
4744     entry = g_new0(MSIRouteEntry, 1);
4745     entry->dev = dev;
4746     entry->vector = vector;
4747     entry->virq = route->gsi;
4748     QLIST_INSERT_HEAD(&msi_route_list, entry, list);
4749 
4750     trace_kvm_x86_add_msi_route(route->gsi);
4751 
4752     if (!notify_list_inited) {
4753         /* For the first time we do add route, add ourselves into
4754          * IOMMU's IEC notify list if needed. */
4755         X86IOMMUState *iommu = x86_iommu_get_default();
4756         if (iommu) {
4757             x86_iommu_iec_register_notifier(iommu,
4758                                             kvm_update_msi_routes_all,
4759                                             NULL);
4760         }
4761         notify_list_inited = true;
4762     }
4763     return 0;
4764 }
4765 
4766 int kvm_arch_release_virq_post(int virq)
4767 {
4768     MSIRouteEntry *entry, *next;
4769     QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) {
4770         if (entry->virq == virq) {
4771             trace_kvm_x86_remove_msi_route(virq);
4772             QLIST_REMOVE(entry, list);
4773             g_free(entry);
4774             break;
4775         }
4776     }
4777     return 0;
4778 }
4779 
/*
 * This implementation unconditionally aborts: it never returns a value
 * and does not use @data.
 */
int kvm_arch_msi_data_to_gsi(uint32_t data)
{
    abort();
}
4784 
4785 bool kvm_has_waitpkg(void)
4786 {
4787     return has_msr_umwait;
4788 }
4789