xref: /openbmc/qemu/target/i386/whpx/whpx-all.c (revision ad66b5cb)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "exec/address-spaces.h"
14 #include "exec/ioport.h"
15 #include "gdbstub/helpers.h"
16 #include "qemu/accel.h"
17 #include "sysemu/whpx.h"
18 #include "sysemu/cpus.h"
19 #include "sysemu/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/intc/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include <winerror.h>
30 
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
33 
34 #include <WinHvPlatform.h>
35 #include <WinHvEmulation.h>
36 
/* APIC bus frequency constant: 200,000,000 (presumably Hz, i.e. 200 MHz). */
#define HYPERV_APIC_BUS_FREQUENCY      (200ULL * 1000 * 1000)
38 
39 static const WHV_REGISTER_NAME whpx_register_names[] = {
40 
41     /* X64 General purpose registers */
42     WHvX64RegisterRax,
43     WHvX64RegisterRcx,
44     WHvX64RegisterRdx,
45     WHvX64RegisterRbx,
46     WHvX64RegisterRsp,
47     WHvX64RegisterRbp,
48     WHvX64RegisterRsi,
49     WHvX64RegisterRdi,
50     WHvX64RegisterR8,
51     WHvX64RegisterR9,
52     WHvX64RegisterR10,
53     WHvX64RegisterR11,
54     WHvX64RegisterR12,
55     WHvX64RegisterR13,
56     WHvX64RegisterR14,
57     WHvX64RegisterR15,
58     WHvX64RegisterRip,
59     WHvX64RegisterRflags,
60 
61     /* X64 Segment registers */
62     WHvX64RegisterEs,
63     WHvX64RegisterCs,
64     WHvX64RegisterSs,
65     WHvX64RegisterDs,
66     WHvX64RegisterFs,
67     WHvX64RegisterGs,
68     WHvX64RegisterLdtr,
69     WHvX64RegisterTr,
70 
71     /* X64 Table registers */
72     WHvX64RegisterIdtr,
73     WHvX64RegisterGdtr,
74 
75     /* X64 Control Registers */
76     WHvX64RegisterCr0,
77     WHvX64RegisterCr2,
78     WHvX64RegisterCr3,
79     WHvX64RegisterCr4,
80     WHvX64RegisterCr8,
81 
82     /* X64 Debug Registers */
83     /*
84      * WHvX64RegisterDr0,
85      * WHvX64RegisterDr1,
86      * WHvX64RegisterDr2,
87      * WHvX64RegisterDr3,
88      * WHvX64RegisterDr6,
89      * WHvX64RegisterDr7,
90      */
91 
92     /* X64 Floating Point and Vector Registers */
93     WHvX64RegisterXmm0,
94     WHvX64RegisterXmm1,
95     WHvX64RegisterXmm2,
96     WHvX64RegisterXmm3,
97     WHvX64RegisterXmm4,
98     WHvX64RegisterXmm5,
99     WHvX64RegisterXmm6,
100     WHvX64RegisterXmm7,
101     WHvX64RegisterXmm8,
102     WHvX64RegisterXmm9,
103     WHvX64RegisterXmm10,
104     WHvX64RegisterXmm11,
105     WHvX64RegisterXmm12,
106     WHvX64RegisterXmm13,
107     WHvX64RegisterXmm14,
108     WHvX64RegisterXmm15,
109     WHvX64RegisterFpMmx0,
110     WHvX64RegisterFpMmx1,
111     WHvX64RegisterFpMmx2,
112     WHvX64RegisterFpMmx3,
113     WHvX64RegisterFpMmx4,
114     WHvX64RegisterFpMmx5,
115     WHvX64RegisterFpMmx6,
116     WHvX64RegisterFpMmx7,
117     WHvX64RegisterFpControlStatus,
118     WHvX64RegisterXmmControlStatus,
119 
120     /* X64 MSRs */
121     WHvX64RegisterEfer,
122 #ifdef TARGET_X86_64
123     WHvX64RegisterKernelGsBase,
124 #endif
125     WHvX64RegisterApicBase,
126     /* WHvX64RegisterPat, */
127     WHvX64RegisterSysenterCs,
128     WHvX64RegisterSysenterEip,
129     WHvX64RegisterSysenterEsp,
130     WHvX64RegisterStar,
131 #ifdef TARGET_X86_64
132     WHvX64RegisterLstar,
133     WHvX64RegisterCstar,
134     WHvX64RegisterSfmask,
135 #endif
136 
137     /* Interrupt / Event Registers */
138     /*
139      * WHvRegisterPendingInterruption,
140      * WHvRegisterInterruptState,
141      * WHvRegisterPendingEvent0,
142      * WHvRegisterPendingEvent1
143      * WHvX64RegisterDeliverabilityNotifications,
144      */
145 };
146 
147 struct whpx_register_set {
148     WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
149 };
150 
151 /*
152  * The current implementation of instruction stepping sets the TF flag
153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
155  *
156  * This approach has a few limitations:
157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158  *        along with the other flags, possibly restoring it later. It would
159  *        result in another INT1 when the flags are restored, triggering
160  *        a stop in gdb that could be cleared by doing another step.
161  *
 *        Stepping over a POPF/LAHF instruction will let it overwrite the
 *        TF flag, ending the stepping mode.
164  *
165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166  *        or anything that could result in a page fault) will save the flags
167  *        to the stack, clear the TF flag, and let the guest execute the
168  *        handler. Normally, the guest will restore the original flags,
169  *        that will continue single-stepping.
170  *
171  *     3. Debuggers running on the guest may wish to set TF to do instruction
172  *        stepping. INT1 events generated by it would be intercepted by us,
173  *        as long as the gdb is connected to QEMU.
174  *
175  * In practice this means that:
176  *     1. Stepping through flags-modifying instructions may cause gdb to
177  *        continue or stop in unexpected places. This will be fully recoverable
178  *        and will not crash the target.
179  *
180  *     2. Stepping over an instruction that triggers an exception will step
181  *        over the exception handler, not into it.
182  *
183  *     3. Debugging the guest via gdb, while running debugger on the guest
184  *        at the same time may lead to unexpected effects. Removing all
185  *        breakpoints set via QEMU will prevent any further interference
186  *        with the guest-level debuggers.
187  *
188  * The limitations can be addressed as shown below:
189  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
190  *        stepping through them. The exact semantics of the instructions is
191  *        defined in the "Combined Volume Set of Intel 64 and IA-32
192  *        Architectures Software Developer's Manuals", however it involves a
193  *        fair amount of corner cases due to compatibility with real mode,
194  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
195  *
196  *     2. We could step into the guest's exception handlers using the following
197  *        sequence:
198  *          a. Temporarily enable catching of all exception types via
199  *             whpx_set_exception_exit_bitmap().
200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
201  *             the original handler.
202  *          c. Patch the original handler, injecting an INT3 at the beginning.
203  *          d. Update the exception exit bitmap to only catch the
204  *             WHvX64ExceptionTypeBreakpointTrap exception.
205  *          e. Let the affected CPU run in the exclusive mode.
206  *          f. Restore the original handler and the exception exit bitmap.
207  *        Note that handling all corner cases related to IDT/GDT is harder
208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
209  *        rough idea.
210  *
211  *     3. In order to properly support guest-level debugging in parallel with
212  *        the QEMU-level debugging, we would need to be able to pass some INT1
213  *        events to the guest. This could be done via the following methods:
214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215  *             it seems to only work for interrupts and not software
216  *             exceptions.
217  *          b. Locating and patching the original handler by parsing IDT/GDT.
218  *             This involves relatively complex logic outlined in the previous
219  *             paragraph.
220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
221  *             RFLAGS, and pushing the old values to stack). This is even more
222  *             complicated than the previous option, since it involves checking
223  *             CPL, gate attributes, and doing various adjustments depending
224  *             on the current CPU mode, whether the CPL is changing, etc.
225  */
typedef enum WhpxStepMode {
    /* Zero value of the enumeration */
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE = 1,
} WhpxStepMode;
231 
/*
 * Per-VCPU state of the WHPX accelerator. get_whpx_vcpu() returns the
 * instance attached to a CPUState.
 */
struct whpx_vcpu {
    /* Emulator handle passed to WHvEmulatorTryMmioEmulation/TryIoEmulation */
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    /* TPR in CR8 encoding, as last exchanged with the hypervisor */
    uint64_t tpr;
    /* APIC base value, as last exchanged with the hypervisor */
    uint64_t apic_base;
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
244 
245 static bool whpx_allowed;
246 static bool whp_dispatch_initialized;
247 static HMODULE hWinHvPlatform, hWinHvEmulation;
248 static uint32_t max_vcpu_index;
249 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;
250 
251 struct whpx_state whpx_global;
252 struct WHPDispatch whp_dispatch;
253 
254 static bool whpx_has_xsave(void)
255 {
256     return whpx_xsave_cap.XsaveSupport;
257 }
258 
259 /*
260  * VP support
261  */
262 
263 static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu)
264 {
265     return (struct whpx_vcpu *)cpu->hax_vcpu;
266 }
267 
268 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
269                                              int r86)
270 {
271     WHV_X64_SEGMENT_REGISTER hs;
272     unsigned flags = qs->flags;
273 
274     hs.Base = qs->base;
275     hs.Limit = qs->limit;
276     hs.Selector = qs->selector;
277 
278     if (v86) {
279         hs.Attributes = 0;
280         hs.SegmentType = 3;
281         hs.Present = 1;
282         hs.DescriptorPrivilegeLevel = 3;
283         hs.NonSystemSegment = 1;
284 
285     } else {
286         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
287 
288         if (r86) {
289             /* hs.Base &= 0xfffff; */
290         }
291     }
292 
293     return hs;
294 }
295 
296 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
297 {
298     SegmentCache qs;
299 
300     qs.base = hs->Base;
301     qs.limit = hs->Limit;
302     qs.selector = hs->Selector;
303 
304     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
305 
306     return qs;
307 }
308 
309 /* X64 Extended Control Registers */
310 static void whpx_set_xcrs(CPUState *cpu)
311 {
312     CPUX86State *env = cpu->env_ptr;
313     HRESULT hr;
314     struct whpx_state *whpx = &whpx_global;
315     WHV_REGISTER_VALUE xcr0;
316     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
317 
318     if (!whpx_has_xsave()) {
319         return;
320     }
321 
322     /* Only xcr0 is supported by the hypervisor currently */
323     xcr0.Reg64 = env->xcr0;
324     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
325         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
326     if (FAILED(hr)) {
327         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
328     }
329 }
330 
331 static int whpx_set_tsc(CPUState *cpu)
332 {
333     CPUX86State *env = cpu->env_ptr;
334     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
335     WHV_REGISTER_VALUE tsc_val;
336     HRESULT hr;
337     struct whpx_state *whpx = &whpx_global;
338 
339     /*
340      * Suspend the partition prior to setting the TSC to reduce the variance
341      * in TSC across vCPUs. When the first vCPU runs post suspend, the
342      * partition is automatically resumed.
343      */
344     if (whp_dispatch.WHvSuspendPartitionTime) {
345 
346         /*
347          * Unable to suspend partition while setting TSC is not a fatal
348          * error. It just increases the likelihood of TSC variance between
349          * vCPUs and some guest OS are able to handle that just fine.
350          */
351         hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
352         if (FAILED(hr)) {
353             warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
354         }
355     }
356 
357     tsc_val.Reg64 = env->tsc;
358     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
359         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
360     if (FAILED(hr)) {
361         error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
362         return -1;
363     }
364 
365     return 0;
366 }
367 
368 /*
369  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
370  * however, they use a slightly different encoding. Specifically:
371  *
372  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
373  *
374  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
375  * and IA-32 Architectures Software Developer's Manual.
376  *
377  * The functions below translate the value of CR8 to TPR and vice versa.
378  */
379 
/* Translate an APIC TPR value to CR8 encoding by dropping the low 4 bits. */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    const unsigned int tpr_shift = 4;

    return tpr >> tpr_shift;
}
384 
/* Translate a CR8 value to APIC TPR encoding by shifting it left 4 bits. */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    const unsigned int tpr_shift = 4;

    return cr8 << tpr_shift;
}
389 
390 static void whpx_set_registers(CPUState *cpu, int level)
391 {
392     struct whpx_state *whpx = &whpx_global;
393     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
394     CPUX86State *env = cpu->env_ptr;
395     X86CPU *x86_cpu = X86_CPU(cpu);
396     struct whpx_register_set vcxt;
397     HRESULT hr;
398     int idx;
399     int idx_next;
400     int i;
401     int v86, r86;
402 
403     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
404 
405     /*
406      * Following MSRs have side effects on the guest or are too heavy for
407      * runtime. Limit them to full state update.
408      */
409     if (level >= WHPX_SET_RESET_STATE) {
410         whpx_set_tsc(cpu);
411     }
412 
413     memset(&vcxt, 0, sizeof(struct whpx_register_set));
414 
415     v86 = (env->eflags & VM_MASK);
416     r86 = !(env->cr[0] & CR0_PE_MASK);
417 
418     vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
419     vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);
420 
421     idx = 0;
422 
423     /* Indexes for first 16 registers match between HV and QEMU definitions */
424     idx_next = 16;
425     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
426         vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
427     }
428     idx = idx_next;
429 
430     /* Same goes for RIP and RFLAGS */
431     assert(whpx_register_names[idx] == WHvX64RegisterRip);
432     vcxt.values[idx++].Reg64 = env->eip;
433 
434     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
435     vcxt.values[idx++].Reg64 = env->eflags;
436 
437     /* Translate 6+4 segment registers. HV and QEMU order matches  */
438     assert(idx == WHvX64RegisterEs);
439     for (i = 0; i < 6; i += 1, idx += 1) {
440         vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
441     }
442 
443     assert(idx == WHvX64RegisterLdtr);
444     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
445 
446     assert(idx == WHvX64RegisterTr);
447     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
448 
449     assert(idx == WHvX64RegisterIdtr);
450     vcxt.values[idx].Table.Base = env->idt.base;
451     vcxt.values[idx].Table.Limit = env->idt.limit;
452     idx += 1;
453 
454     assert(idx == WHvX64RegisterGdtr);
455     vcxt.values[idx].Table.Base = env->gdt.base;
456     vcxt.values[idx].Table.Limit = env->gdt.limit;
457     idx += 1;
458 
459     /* CR0, 2, 3, 4, 8 */
460     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
461     vcxt.values[idx++].Reg64 = env->cr[0];
462     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
463     vcxt.values[idx++].Reg64 = env->cr[2];
464     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
465     vcxt.values[idx++].Reg64 = env->cr[3];
466     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
467     vcxt.values[idx++].Reg64 = env->cr[4];
468     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
469     vcxt.values[idx++].Reg64 = vcpu->tpr;
470 
471     /* 8 Debug Registers - Skipped */
472 
473     /*
474      * Extended control registers needs to be handled separately depending
475      * on whether xsave is supported/enabled or not.
476      */
477     whpx_set_xcrs(cpu);
478 
479     /* 16 XMM registers */
480     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
481     idx_next = idx + 16;
482     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
483         vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
484         vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
485     }
486     idx = idx_next;
487 
488     /* 8 FP registers */
489     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
490     for (i = 0; i < 8; i += 1, idx += 1) {
491         vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
492         /* vcxt.values[idx].Fp.AsUINT128.High64 =
493                env->fpregs[i].mmx.MMX_Q(1);
494         */
495     }
496 
497     /* FP control status register */
498     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
499     vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
500     vcxt.values[idx].FpControlStatus.FpStatus =
501         (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
502     vcxt.values[idx].FpControlStatus.FpTag = 0;
503     for (i = 0; i < 8; ++i) {
504         vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
505     }
506     vcxt.values[idx].FpControlStatus.Reserved = 0;
507     vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
508     vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
509     idx += 1;
510 
511     /* XMM control status register */
512     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
513     vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
514     vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
515     vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
516     idx += 1;
517 
518     /* MSRs */
519     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
520     vcxt.values[idx++].Reg64 = env->efer;
521 #ifdef TARGET_X86_64
522     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
523     vcxt.values[idx++].Reg64 = env->kernelgsbase;
524 #endif
525 
526     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
527     vcxt.values[idx++].Reg64 = vcpu->apic_base;
528 
529     /* WHvX64RegisterPat - Skipped */
530 
531     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
532     vcxt.values[idx++].Reg64 = env->sysenter_cs;
533     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
534     vcxt.values[idx++].Reg64 = env->sysenter_eip;
535     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
536     vcxt.values[idx++].Reg64 = env->sysenter_esp;
537     assert(whpx_register_names[idx] == WHvX64RegisterStar);
538     vcxt.values[idx++].Reg64 = env->star;
539 #ifdef TARGET_X86_64
540     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
541     vcxt.values[idx++].Reg64 = env->lstar;
542     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
543     vcxt.values[idx++].Reg64 = env->cstar;
544     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
545     vcxt.values[idx++].Reg64 = env->fmask;
546 #endif
547 
548     /* Interrupt / Event Registers - Skipped */
549 
550     assert(idx == RTL_NUMBER_OF(whpx_register_names));
551 
552     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
553         whpx->partition, cpu->cpu_index,
554         whpx_register_names,
555         RTL_NUMBER_OF(whpx_register_names),
556         &vcxt.values[0]);
557 
558     if (FAILED(hr)) {
559         error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
560                      hr);
561     }
562 
563     return;
564 }
565 
566 static int whpx_get_tsc(CPUState *cpu)
567 {
568     CPUX86State *env = cpu->env_ptr;
569     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
570     WHV_REGISTER_VALUE tsc_val;
571     HRESULT hr;
572     struct whpx_state *whpx = &whpx_global;
573 
574     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
575         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
576     if (FAILED(hr)) {
577         error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
578         return -1;
579     }
580 
581     env->tsc = tsc_val.Reg64;
582     return 0;
583 }
584 
585 /* X64 Extended Control Registers */
586 static void whpx_get_xcrs(CPUState *cpu)
587 {
588     CPUX86State *env = cpu->env_ptr;
589     HRESULT hr;
590     struct whpx_state *whpx = &whpx_global;
591     WHV_REGISTER_VALUE xcr0;
592     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
593 
594     if (!whpx_has_xsave()) {
595         return;
596     }
597 
598     /* Only xcr0 is supported by the hypervisor currently */
599     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
600         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
601     if (FAILED(hr)) {
602         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
603         return;
604     }
605 
606     env->xcr0 = xcr0.Reg64;
607 }
608 
609 static void whpx_get_registers(CPUState *cpu)
610 {
611     struct whpx_state *whpx = &whpx_global;
612     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
613     CPUX86State *env = cpu->env_ptr;
614     X86CPU *x86_cpu = X86_CPU(cpu);
615     struct whpx_register_set vcxt;
616     uint64_t tpr, apic_base;
617     HRESULT hr;
618     int idx;
619     int idx_next;
620     int i;
621 
622     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
623 
624     if (!env->tsc_valid) {
625         whpx_get_tsc(cpu);
626         env->tsc_valid = !runstate_is_running();
627     }
628 
629     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
630         whpx->partition, cpu->cpu_index,
631         whpx_register_names,
632         RTL_NUMBER_OF(whpx_register_names),
633         &vcxt.values[0]);
634     if (FAILED(hr)) {
635         error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
636                      hr);
637     }
638 
639     if (whpx_apic_in_platform()) {
640         /*
641          * Fetch the TPR value from the emulated APIC. It may get overwritten
642          * below with the value from CR8 returned by
643          * WHvGetVirtualProcessorRegisters().
644          */
645         whpx_apic_get(x86_cpu->apic_state);
646         vcpu->tpr = whpx_apic_tpr_to_cr8(
647             cpu_get_apic_tpr(x86_cpu->apic_state));
648     }
649 
650     idx = 0;
651 
652     /* Indexes for first 16 registers match between HV and QEMU definitions */
653     idx_next = 16;
654     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
655         env->regs[idx] = vcxt.values[idx].Reg64;
656     }
657     idx = idx_next;
658 
659     /* Same goes for RIP and RFLAGS */
660     assert(whpx_register_names[idx] == WHvX64RegisterRip);
661     env->eip = vcxt.values[idx++].Reg64;
662     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
663     env->eflags = vcxt.values[idx++].Reg64;
664 
665     /* Translate 6+4 segment registers. HV and QEMU order matches  */
666     assert(idx == WHvX64RegisterEs);
667     for (i = 0; i < 6; i += 1, idx += 1) {
668         env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
669     }
670 
671     assert(idx == WHvX64RegisterLdtr);
672     env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
673     assert(idx == WHvX64RegisterTr);
674     env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
675     assert(idx == WHvX64RegisterIdtr);
676     env->idt.base = vcxt.values[idx].Table.Base;
677     env->idt.limit = vcxt.values[idx].Table.Limit;
678     idx += 1;
679     assert(idx == WHvX64RegisterGdtr);
680     env->gdt.base = vcxt.values[idx].Table.Base;
681     env->gdt.limit = vcxt.values[idx].Table.Limit;
682     idx += 1;
683 
684     /* CR0, 2, 3, 4, 8 */
685     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
686     env->cr[0] = vcxt.values[idx++].Reg64;
687     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
688     env->cr[2] = vcxt.values[idx++].Reg64;
689     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
690     env->cr[3] = vcxt.values[idx++].Reg64;
691     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
692     env->cr[4] = vcxt.values[idx++].Reg64;
693     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
694     tpr = vcxt.values[idx++].Reg64;
695     if (tpr != vcpu->tpr) {
696         vcpu->tpr = tpr;
697         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
698     }
699 
700     /* 8 Debug Registers - Skipped */
701 
702     /*
703      * Extended control registers needs to be handled separately depending
704      * on whether xsave is supported/enabled or not.
705      */
706     whpx_get_xcrs(cpu);
707 
708     /* 16 XMM registers */
709     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
710     idx_next = idx + 16;
711     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
712         env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
713         env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
714     }
715     idx = idx_next;
716 
717     /* 8 FP registers */
718     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
719     for (i = 0; i < 8; i += 1, idx += 1) {
720         env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
721         /* env->fpregs[i].mmx.MMX_Q(1) =
722                vcxt.values[idx].Fp.AsUINT128.High64;
723         */
724     }
725 
726     /* FP control status register */
727     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
728     env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
729     env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
730     env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
731     for (i = 0; i < 8; ++i) {
732         env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
733     }
734     env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
735     env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
736     idx += 1;
737 
738     /* XMM control status register */
739     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
740     env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
741     idx += 1;
742 
743     /* MSRs */
744     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
745     env->efer = vcxt.values[idx++].Reg64;
746 #ifdef TARGET_X86_64
747     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
748     env->kernelgsbase = vcxt.values[idx++].Reg64;
749 #endif
750 
751     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
752     apic_base = vcxt.values[idx++].Reg64;
753     if (apic_base != vcpu->apic_base) {
754         vcpu->apic_base = apic_base;
755         cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
756     }
757 
758     /* WHvX64RegisterPat - Skipped */
759 
760     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
761     env->sysenter_cs = vcxt.values[idx++].Reg64;
762     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
763     env->sysenter_eip = vcxt.values[idx++].Reg64;
764     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
765     env->sysenter_esp = vcxt.values[idx++].Reg64;
766     assert(whpx_register_names[idx] == WHvX64RegisterStar);
767     env->star = vcxt.values[idx++].Reg64;
768 #ifdef TARGET_X86_64
769     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
770     env->lstar = vcxt.values[idx++].Reg64;
771     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
772     env->cstar = vcxt.values[idx++].Reg64;
773     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
774     env->fmask = vcxt.values[idx++].Reg64;
775 #endif
776 
777     /* Interrupt / Event Registers - Skipped */
778 
779     assert(idx == RTL_NUMBER_OF(whpx_register_names));
780 
781     if (whpx_apic_in_platform()) {
782         whpx_apic_get(x86_cpu->apic_state);
783     }
784 
785     x86_update_hflags(env);
786 
787     return;
788 }
789 
790 static HRESULT CALLBACK whpx_emu_ioport_callback(
791     void *ctx,
792     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
793 {
794     MemTxAttrs attrs = { 0 };
795     address_space_rw(&address_space_io, IoAccess->Port, attrs,
796                      &IoAccess->Data, IoAccess->AccessSize,
797                      IoAccess->Direction);
798     return S_OK;
799 }
800 
801 static HRESULT CALLBACK whpx_emu_mmio_callback(
802     void *ctx,
803     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
804 {
805     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
806                            ma->Direction);
807     return S_OK;
808 }
809 
810 static HRESULT CALLBACK whpx_emu_getreg_callback(
811     void *ctx,
812     const WHV_REGISTER_NAME *RegisterNames,
813     UINT32 RegisterCount,
814     WHV_REGISTER_VALUE *RegisterValues)
815 {
816     HRESULT hr;
817     struct whpx_state *whpx = &whpx_global;
818     CPUState *cpu = (CPUState *)ctx;
819 
820     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
821         whpx->partition, cpu->cpu_index,
822         RegisterNames, RegisterCount,
823         RegisterValues);
824     if (FAILED(hr)) {
825         error_report("WHPX: Failed to get virtual processor registers,"
826                      " hr=%08lx", hr);
827     }
828 
829     return hr;
830 }
831 
832 static HRESULT CALLBACK whpx_emu_setreg_callback(
833     void *ctx,
834     const WHV_REGISTER_NAME *RegisterNames,
835     UINT32 RegisterCount,
836     const WHV_REGISTER_VALUE *RegisterValues)
837 {
838     HRESULT hr;
839     struct whpx_state *whpx = &whpx_global;
840     CPUState *cpu = (CPUState *)ctx;
841 
842     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
843         whpx->partition, cpu->cpu_index,
844         RegisterNames, RegisterCount,
845         RegisterValues);
846     if (FAILED(hr)) {
847         error_report("WHPX: Failed to set virtual processor registers,"
848                      " hr=%08lx", hr);
849     }
850 
851     /*
852      * The emulator just successfully wrote the register state. We clear the
853      * dirty state so we avoid the double write on resume of the VP.
854      */
855     cpu->vcpu_dirty = false;
856 
857     return hr;
858 }
859 
860 static HRESULT CALLBACK whpx_emu_translate_callback(
861     void *ctx,
862     WHV_GUEST_VIRTUAL_ADDRESS Gva,
863     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
864     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
865     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
866 {
867     HRESULT hr;
868     struct whpx_state *whpx = &whpx_global;
869     CPUState *cpu = (CPUState *)ctx;
870     WHV_TRANSLATE_GVA_RESULT res;
871 
872     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
873                                       Gva, TranslateFlags, &res, Gpa);
874     if (FAILED(hr)) {
875         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
876     } else {
877         *TranslationResult = res.ResultCode;
878     }
879 
880     return hr;
881 }
882 
883 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
884     .Size = sizeof(WHV_EMULATOR_CALLBACKS),
885     .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
886     .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
887     .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
888     .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
889     .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
890 };
891 
892 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
893 {
894     HRESULT hr;
895     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
896     WHV_EMULATOR_STATUS emu_status;
897 
898     hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
899         vcpu->emulator, cpu,
900         &vcpu->exit_ctx.VpContext, ctx,
901         &emu_status);
902     if (FAILED(hr)) {
903         error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
904         return -1;
905     }
906 
907     if (!emu_status.EmulationSuccessful) {
908         error_report("WHPX: Failed to emulate MMIO access with"
909                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
910         return -1;
911     }
912 
913     return 0;
914 }
915 
916 static int whpx_handle_portio(CPUState *cpu,
917                               WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
918 {
919     HRESULT hr;
920     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
921     WHV_EMULATOR_STATUS emu_status;
922 
923     hr = whp_dispatch.WHvEmulatorTryIoEmulation(
924         vcpu->emulator, cpu,
925         &vcpu->exit_ctx.VpContext, ctx,
926         &emu_status);
927     if (FAILED(hr)) {
928         error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
929         return -1;
930     }
931 
932     if (!emu_status.EmulationSuccessful) {
933         error_report("WHPX: Failed to emulate PortIO access with"
934                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
935         return -1;
936     }
937 
938     return 0;
939 }
940 
941 /*
942  * Controls whether we should intercept various exceptions on the guest,
943  * namely breakpoint/single-step events.
944  *
945  * The 'exceptions' argument accepts a bitmask, e.g:
946  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
947  */
948 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
949 {
950     struct whpx_state *whpx = &whpx_global;
951     WHV_PARTITION_PROPERTY prop = { 0, };
952     HRESULT hr;
953 
954     if (exceptions == whpx->exception_exit_bitmap) {
955         return S_OK;
956     }
957 
958     prop.ExceptionExitBitmap = exceptions;
959 
960     hr = whp_dispatch.WHvSetPartitionProperty(
961         whpx->partition,
962         WHvPartitionPropertyCodeExceptionExitBitmap,
963         &prop,
964         sizeof(WHV_PARTITION_PROPERTY));
965 
966     if (SUCCEEDED(hr)) {
967         whpx->exception_exit_bitmap = exceptions;
968     }
969 
970     return hr;
971 }
972 
973 
974 /*
975  * This function is called before/after stepping over a single instruction.
976  * It will update the CPU registers to arm/disarm the instruction stepping
977  * accordingly.
978  */
979 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
980     bool set,
981     uint64_t *exit_context_rflags)
982 {
983     WHV_REGISTER_NAME reg_name;
984     WHV_REGISTER_VALUE reg_value;
985     HRESULT hr;
986     struct whpx_state *whpx = &whpx_global;
987 
988     /*
989      * If we are trying to step over a single instruction, we need to set the
990      * TF bit in rflags. Otherwise, clear it.
991      */
992     reg_name = WHvX64RegisterRflags;
993     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
994         whpx->partition,
995         cpu->cpu_index,
996         &reg_name,
997         1,
998         &reg_value);
999 
1000     if (FAILED(hr)) {
1001         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
1002         return hr;
1003     }
1004 
1005     if (exit_context_rflags) {
1006         assert(*exit_context_rflags == reg_value.Reg64);
1007     }
1008 
1009     if (set) {
1010         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
1011         reg_value.Reg64 |= TF_MASK;
1012     } else {
1013         reg_value.Reg64 &= ~TF_MASK;
1014     }
1015 
1016     if (exit_context_rflags) {
1017         *exit_context_rflags = reg_value.Reg64;
1018     }
1019 
1020     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1021         whpx->partition,
1022         cpu->cpu_index,
1023         &reg_name,
1024         1,
1025         &reg_value);
1026 
1027     if (FAILED(hr)) {
1028         error_report("WHPX: Failed to set rflags,"
1029             " hr=%08lx",
1030             hr);
1031         return hr;
1032     }
1033 
1034     reg_name = WHvRegisterInterruptState;
1035     reg_value.Reg64 = 0;
1036 
1037     /* Suspend delivery of hardware interrupts during single-stepping. */
1038     reg_value.InterruptState.InterruptShadow = set != 0;
1039 
1040     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1041     whpx->partition,
1042         cpu->cpu_index,
1043         &reg_name,
1044         1,
1045         &reg_value);
1046 
1047     if (FAILED(hr)) {
1048         error_report("WHPX: Failed to set InterruptState,"
1049             " hr=%08lx",
1050             hr);
1051         return hr;
1052     }
1053 
1054     if (!set) {
1055         /*
1056          * We have just finished stepping over a single instruction,
1057          * and intercepted the INT1 generated by it.
1058          * We need to now hide the INT1 from the guest,
1059          * as it would not be expecting it.
1060          */
1061 
1062         reg_name = WHvX64RegisterPendingDebugException;
1063         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1064         whpx->partition,
1065             cpu->cpu_index,
1066             &reg_name,
1067             1,
1068             &reg_value);
1069 
1070         if (FAILED(hr)) {
1071             error_report("WHPX: Failed to get pending debug exceptions,"
1072                          "hr=%08lx", hr);
1073             return hr;
1074         }
1075 
1076         if (reg_value.PendingDebugException.SingleStep) {
1077             reg_value.PendingDebugException.SingleStep = 0;
1078 
1079             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1080                 whpx->partition,
1081                 cpu->cpu_index,
1082                 &reg_name,
1083                 1,
1084                 &reg_value);
1085 
1086             if (FAILED(hr)) {
1087                 error_report("WHPX: Failed to clear pending debug exceptions,"
1088                              "hr=%08lx", hr);
1089              return hr;
1090             }
1091         }
1092 
1093     }
1094 
1095     return S_OK;
1096 }
1097 
1098 /* Tries to find a breakpoint at the specified address. */
1099 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1100 {
1101     struct whpx_state *whpx = &whpx_global;
1102     int i;
1103 
1104     if (whpx->breakpoints.breakpoints) {
1105         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1106             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1107                 return &whpx->breakpoints.breakpoints->data[i];
1108             }
1109         }
1110     }
1111 
1112     return NULL;
1113 }
1114 
/*
 * Opcode written into guest memory to implement a breakpoint: 0xF1 (INT1).
 *
 * INT3 (0xCC) is deliberately not used. Linux executes it during startup
 * (see int3_selftest()) and relies on it for debugging user-mode
 * applications, and the WHPX API offers no easy way to pass an intercepted
 * exception back to the guest. Using INT1 lets the guest always handle INT3
 * itself.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;
1122 
1123 /*
1124  * The WHPX QEMU backend implements breakpoints by writing the INT1
1125  * instruction into memory (ignoring the DRx registers). This raises a few
1126  * issues that need to be carefully handled:
1127  *
1128  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1129  *    at the same location, and later remove them in arbitrary order.
1130  *    This should not cause memory corruption, and should only remove the
1131  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1132  *
1133  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1134  *    physical location. Hence, physically adding/removing a breakpoint can
1135  *    theoretically fail at any time. We need to keep track of it.
1136  *
1137  * The function below rebuilds a list of low-level breakpoints (one per
1138  * address, tracking the original instruction and any errors) from the list of
1139  * high-level breakpoints (set via cpu_breakpoint_insert()).
1140  *
1141  * In order to optimize performance, this function stores the list of
1142  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1143  * low-level ones, so that it won't be re-invoked until these breakpoints
1144  * change.
1145  *
1146  * Note that this function decides which breakpoints should be inserted into,
1147  * memory, but doesn't actually do it. The memory accessing is done in
1148  * whpx_apply_breakpoints().
1149  */
1150 static void whpx_translate_cpu_breakpoints(
1151     struct whpx_breakpoints *breakpoints,
1152     CPUState *cpu,
1153     int cpu_breakpoint_count)
1154 {
1155     CPUBreakpoint *bp;
1156     int cpu_bp_index = 0;
1157 
1158     breakpoints->original_addresses =
1159         g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
1160 
1161     breakpoints->original_address_count = cpu_breakpoint_count;
1162 
1163     int max_breakpoints = cpu_breakpoint_count +
1164         (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
1165 
1166     struct whpx_breakpoint_collection *new_breakpoints =
1167         g_malloc0(sizeof(struct whpx_breakpoint_collection)
1168                   + max_breakpoints * sizeof(struct whpx_breakpoint));
1169 
1170     new_breakpoints->allocated = max_breakpoints;
1171     new_breakpoints->used = 0;
1172 
1173     /*
1174      * 1. Preserve all old breakpoints that could not be automatically
1175      * cleared when the CPU got stopped.
1176      */
1177     if (breakpoints->breakpoints) {
1178         int i;
1179         for (i = 0; i < breakpoints->breakpoints->used; i++) {
1180             if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
1181                 new_breakpoints->data[new_breakpoints->used++] =
1182                     breakpoints->breakpoints->data[i];
1183             }
1184         }
1185     }
1186 
1187     /* 2. Map all CPU breakpoints to WHPX breakpoints */
1188     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1189         int i;
1190         bool found = false;
1191 
1192         /* This will be used to detect changed CPU breakpoints later. */
1193         breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
1194 
1195         for (i = 0; i < new_breakpoints->used; i++) {
1196             /*
1197              * WARNING: This loop has O(N^2) complexity, where N is the
1198              * number of breakpoints. It should not be a bottleneck in
1199              * real-world scenarios, since it only needs to run once after
1200              * the breakpoints have been modified.
1201              * If this ever becomes a concern, it can be optimized by storing
1202              * high-level breakpoint objects in a tree or hash map.
1203              */
1204 
1205             if (new_breakpoints->data[i].address == bp->pc) {
1206                 /* There was already a breakpoint at this address. */
1207                 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
1208                     new_breakpoints->data[i].state = WHPX_BP_SET;
1209                 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
1210                     new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
1211                 }
1212 
1213                 found = true;
1214                 break;
1215             }
1216         }
1217 
1218         if (!found && new_breakpoints->used < new_breakpoints->allocated) {
1219             /* No WHPX breakpoint at this address. Create one. */
1220             new_breakpoints->data[new_breakpoints->used].address = bp->pc;
1221             new_breakpoints->data[new_breakpoints->used].state =
1222                 WHPX_BP_SET_PENDING;
1223             new_breakpoints->used++;
1224         }
1225     }
1226 
1227     /*
1228      * Free the previous breakpoint list. This can be optimized by keeping
1229      * it as shadow buffer for the next computation instead of freeing
1230      * it immediately.
1231      */
1232     g_free(breakpoints->breakpoints);
1233 
1234     breakpoints->breakpoints = new_breakpoints;
1235 }
1236 
1237 /*
1238  * Physically inserts/removes the breakpoints by reading and writing the
1239  * physical memory, keeping a track of the failed attempts.
1240  *
1241  * Passing resuming=true  will try to set all previously unset breakpoints.
1242  * Passing resuming=false will remove all inserted ones.
1243  */
1244 static void whpx_apply_breakpoints(
1245     struct whpx_breakpoint_collection *breakpoints,
1246     CPUState *cpu,
1247     bool resuming)
1248 {
1249     int i, rc;
1250     if (!breakpoints) {
1251         return;
1252     }
1253 
1254     for (i = 0; i < breakpoints->used; i++) {
1255         /* Decide what to do right now based on the last known state. */
1256         WhpxBreakpointState state = breakpoints->data[i].state;
1257         switch (state) {
1258         case WHPX_BP_CLEARED:
1259             if (resuming) {
1260                 state = WHPX_BP_SET_PENDING;
1261             }
1262             break;
1263         case WHPX_BP_SET_PENDING:
1264             if (!resuming) {
1265                 state = WHPX_BP_CLEARED;
1266             }
1267             break;
1268         case WHPX_BP_SET:
1269             if (!resuming) {
1270                 state = WHPX_BP_CLEAR_PENDING;
1271             }
1272             break;
1273         case WHPX_BP_CLEAR_PENDING:
1274             if (resuming) {
1275                 state = WHPX_BP_SET;
1276             }
1277             break;
1278         }
1279 
1280         if (state == WHPX_BP_SET_PENDING) {
1281             /* Remember the original instruction. */
1282             rc = cpu_memory_rw_debug(cpu,
1283                 breakpoints->data[i].address,
1284                 &breakpoints->data[i].original_instruction,
1285                 1,
1286                 false);
1287 
1288             if (!rc) {
1289                 /* Write the breakpoint instruction. */
1290                 rc = cpu_memory_rw_debug(cpu,
1291                     breakpoints->data[i].address,
1292                     (void *)&whpx_breakpoint_instruction,
1293                     1,
1294                     true);
1295             }
1296 
1297             if (!rc) {
1298                 state = WHPX_BP_SET;
1299             }
1300 
1301         }
1302 
1303         if (state == WHPX_BP_CLEAR_PENDING) {
1304             /* Restore the original instruction. */
1305             rc = cpu_memory_rw_debug(cpu,
1306                 breakpoints->data[i].address,
1307                 &breakpoints->data[i].original_instruction,
1308                 1,
1309                 true);
1310 
1311             if (!rc) {
1312                 state = WHPX_BP_CLEARED;
1313             }
1314         }
1315 
1316         breakpoints->data[i].state = state;
1317     }
1318 }
1319 
1320 /*
1321  * This function is called when the a VCPU is about to start and no other
1322  * VCPUs have been started so far. Since the VCPU start order could be
1323  * arbitrary, it doesn't have to be VCPU#0.
1324  *
1325  * It is used to commit the breakpoints into memory, and configure WHPX
1326  * to intercept debug exceptions.
1327  *
1328  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1329  * more VCPUs are already running, so this is the best place to do it.
1330  */
1331 static int whpx_first_vcpu_starting(CPUState *cpu)
1332 {
1333     struct whpx_state *whpx = &whpx_global;
1334     HRESULT hr;
1335 
1336     g_assert(qemu_mutex_iothread_locked());
1337 
1338     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1339             (whpx->breakpoints.breakpoints &&
1340              whpx->breakpoints.breakpoints->used)) {
1341         CPUBreakpoint *bp;
1342         int i = 0;
1343         bool update_pending = false;
1344 
1345         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1346             if (i >= whpx->breakpoints.original_address_count ||
1347                 bp->pc != whpx->breakpoints.original_addresses[i]) {
1348                 update_pending = true;
1349             }
1350 
1351             i++;
1352         }
1353 
1354         if (i != whpx->breakpoints.original_address_count) {
1355             update_pending = true;
1356         }
1357 
1358         if (update_pending) {
1359             /*
1360              * The CPU breakpoints have changed since the last call to
1361              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1362              * now be recomputed.
1363              */
1364             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1365         }
1366 
1367         /* Actually insert the breakpoints into the memory. */
1368         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1369     }
1370 
1371     uint64_t exception_mask;
1372     if (whpx->step_pending ||
1373         (whpx->breakpoints.breakpoints &&
1374          whpx->breakpoints.breakpoints->used)) {
1375         /*
1376          * We are either attempting to single-step one or more CPUs, or
1377          * have one or more breakpoints enabled. Both require intercepting
1378          * the WHvX64ExceptionTypeBreakpointTrap exception.
1379          */
1380 
1381         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1382     } else {
1383         /* Let the guest handle all exceptions. */
1384         exception_mask = 0;
1385     }
1386 
1387     hr = whpx_set_exception_exit_bitmap(exception_mask);
1388     if (!SUCCEEDED(hr)) {
1389         error_report("WHPX: Failed to update exception exit mask,"
1390                      "hr=%08lx.", hr);
1391         return 1;
1392     }
1393 
1394     return 0;
1395 }
1396 
1397 /*
1398  * This function is called when the last VCPU has finished running.
1399  * It is used to remove any previously set breakpoints from memory.
1400  */
1401 static int whpx_last_vcpu_stopping(CPUState *cpu)
1402 {
1403     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1404     return 0;
1405 }
1406 
1407 /* Returns the address of the next instruction that is about to be executed. */
1408 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1409 {
1410     if (cpu->vcpu_dirty) {
1411         /* The CPU registers have been modified by other parts of QEMU. */
1412         CPUArchState *env = (CPUArchState *)(cpu->env_ptr);
1413         return env->eip;
1414     } else if (exit_context_valid) {
1415         /*
1416          * The CPU registers have not been modified by neither other parts
1417          * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1418          * This is the most common case.
1419          */
1420         struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1421         return vcpu->exit_ctx.VpContext.Rip;
1422     } else {
1423         /*
1424          * The CPU registers have been modified by a call to
1425          * WHvSetVirtualProcessorRegisters() and must be re-queried from
1426          * the target.
1427          */
1428         WHV_REGISTER_VALUE reg_value;
1429         WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1430         HRESULT hr;
1431         struct whpx_state *whpx = &whpx_global;
1432 
1433         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1434             whpx->partition,
1435             cpu->cpu_index,
1436             &reg_name,
1437             1,
1438             &reg_value);
1439 
1440         if (FAILED(hr)) {
1441             error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1442             return 0;
1443         }
1444 
1445         return reg_value.Reg64;
1446     }
1447 }
1448 
1449 static int whpx_handle_halt(CPUState *cpu)
1450 {
1451     CPUX86State *env = cpu->env_ptr;
1452     int ret = 0;
1453 
1454     qemu_mutex_lock_iothread();
1455     if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1456           (env->eflags & IF_MASK)) &&
1457         !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1458         cpu->exception_index = EXCP_HLT;
1459         cpu->halted = true;
1460         ret = 1;
1461     }
1462     qemu_mutex_unlock_iothread();
1463 
1464     return ret;
1465 }
1466 
1467 static void whpx_vcpu_pre_run(CPUState *cpu)
1468 {
1469     HRESULT hr;
1470     struct whpx_state *whpx = &whpx_global;
1471     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1472     CPUX86State *env = cpu->env_ptr;
1473     X86CPU *x86_cpu = X86_CPU(cpu);
1474     int irq;
1475     uint8_t tpr;
1476     WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
1477     UINT32 reg_count = 0;
1478     WHV_REGISTER_VALUE reg_values[3];
1479     WHV_REGISTER_NAME reg_names[3];
1480 
1481     memset(&new_int, 0, sizeof(new_int));
1482     memset(reg_values, 0, sizeof(reg_values));
1483 
1484     qemu_mutex_lock_iothread();
1485 
1486     /* Inject NMI */
1487     if (!vcpu->interruption_pending &&
1488         cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
1489         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
1490             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
1491             vcpu->interruptable = false;
1492             new_int.InterruptionType = WHvX64PendingNmi;
1493             new_int.InterruptionPending = 1;
1494             new_int.InterruptionVector = 2;
1495         }
1496         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
1497             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
1498         }
1499     }
1500 
1501     /*
1502      * Force the VCPU out of its inner loop to process any INIT requests or
1503      * commit pending TPR access.
1504      */
1505     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
1506         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1507             !(env->hflags & HF_SMM_MASK)) {
1508             cpu->exit_request = 1;
1509         }
1510         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1511             cpu->exit_request = 1;
1512         }
1513     }
1514 
1515     /* Get pending hard interruption or replay one that was overwritten */
1516     if (!whpx_apic_in_platform()) {
1517         if (!vcpu->interruption_pending &&
1518             vcpu->interruptable && (env->eflags & IF_MASK)) {
1519             assert(!new_int.InterruptionPending);
1520             if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1521                 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1522                 irq = cpu_get_pic_interrupt(env);
1523                 if (irq >= 0) {
1524                     new_int.InterruptionType = WHvX64PendingInterrupt;
1525                     new_int.InterruptionPending = 1;
1526                     new_int.InterruptionVector = irq;
1527                 }
1528             }
1529         }
1530 
1531         /* Setup interrupt state if new one was prepared */
1532         if (new_int.InterruptionPending) {
1533             reg_values[reg_count].PendingInterruption = new_int;
1534             reg_names[reg_count] = WHvRegisterPendingInterruption;
1535             reg_count += 1;
1536         }
1537     } else if (vcpu->ready_for_pic_interrupt &&
1538                (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
1539         cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1540         irq = cpu_get_pic_interrupt(env);
1541         if (irq >= 0) {
1542             reg_names[reg_count] = WHvRegisterPendingEvent;
1543             reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
1544             {
1545                 .EventPending = 1,
1546                 .EventType = WHvX64PendingEventExtInt,
1547                 .Vector = irq,
1548             };
1549             reg_count += 1;
1550         }
1551      }
1552 
1553     /* Sync the TPR to the CR8 if was modified during the intercept */
1554     tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
1555     if (tpr != vcpu->tpr) {
1556         vcpu->tpr = tpr;
1557         reg_values[reg_count].Reg64 = tpr;
1558         cpu->exit_request = 1;
1559         reg_names[reg_count] = WHvX64RegisterCr8;
1560         reg_count += 1;
1561     }
1562 
1563     /* Update the state of the interrupt delivery notification */
1564     if (!vcpu->window_registered &&
1565         cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1566         reg_values[reg_count].DeliverabilityNotifications =
1567             (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
1568                 .InterruptNotification = 1
1569             };
1570         vcpu->window_registered = 1;
1571         reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
1572         reg_count += 1;
1573     }
1574 
1575     qemu_mutex_unlock_iothread();
1576     vcpu->ready_for_pic_interrupt = false;
1577 
1578     if (reg_count) {
1579         hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1580             whpx->partition, cpu->cpu_index,
1581             reg_names, reg_count, reg_values);
1582         if (FAILED(hr)) {
1583             error_report("WHPX: Failed to set interrupt state registers,"
1584                          " hr=%08lx", hr);
1585         }
1586     }
1587 
1588     return;
1589 }
1590 
1591 static void whpx_vcpu_post_run(CPUState *cpu)
1592 {
1593     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1594     CPUX86State *env = cpu->env_ptr;
1595     X86CPU *x86_cpu = X86_CPU(cpu);
1596 
1597     env->eflags = vcpu->exit_ctx.VpContext.Rflags;
1598 
1599     uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
1600     if (vcpu->tpr != tpr) {
1601         vcpu->tpr = tpr;
1602         qemu_mutex_lock_iothread();
1603         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
1604         qemu_mutex_unlock_iothread();
1605     }
1606 
1607     vcpu->interruption_pending =
1608         vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
1609 
1610     vcpu->interruptable =
1611         !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
1612 
1613     return;
1614 }
1615 
1616 static void whpx_vcpu_process_async_events(CPUState *cpu)
1617 {
1618     CPUX86State *env = cpu->env_ptr;
1619     X86CPU *x86_cpu = X86_CPU(cpu);
1620     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1621 
1622     if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1623         !(env->hflags & HF_SMM_MASK)) {
1624         whpx_cpu_synchronize_state(cpu);
1625         do_cpu_init(x86_cpu);
1626         vcpu->interruptable = true;
1627     }
1628 
1629     if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
1630         cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
1631         apic_poll_irq(x86_cpu->apic_state);
1632     }
1633 
1634     if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1635          (env->eflags & IF_MASK)) ||
1636         (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1637         cpu->halted = false;
1638     }
1639 
1640     if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
1641         whpx_cpu_synchronize_state(cpu);
1642         do_cpu_sipi(x86_cpu);
1643     }
1644 
1645     if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1646         cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
1647         whpx_cpu_synchronize_state(cpu);
1648         apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
1649                                       env->tpr_access_type);
1650     }
1651 
1652     return;
1653 }
1654 
1655 static int whpx_vcpu_run(CPUState *cpu)
1656 {
1657     HRESULT hr;
1658     struct whpx_state *whpx = &whpx_global;
1659     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1660     struct whpx_breakpoint *stepped_over_bp = NULL;
1661     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1662     int ret;
1663 
1664     g_assert(qemu_mutex_iothread_locked());
1665 
1666     if (whpx->running_cpus++ == 0) {
1667         /* Insert breakpoints into memory, update exception exit bitmap. */
1668         ret = whpx_first_vcpu_starting(cpu);
1669         if (ret != 0) {
1670             return ret;
1671         }
1672     }
1673 
1674     if (whpx->breakpoints.breakpoints &&
1675         whpx->breakpoints.breakpoints->used > 0)
1676     {
1677         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1678         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1679         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1680             stepped_over_bp = NULL;
1681         }
1682 
1683         if (stepped_over_bp) {
1684             /*
1685              * We are trying to run the instruction overwritten by an active
1686              * breakpoint. We will temporarily disable the breakpoint, suspend
1687              * other CPUs, and step over the instruction.
1688              */
1689             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1690         }
1691     }
1692 
1693     if (exclusive_step_mode == WHPX_STEP_NONE) {
1694         whpx_vcpu_process_async_events(cpu);
1695         if (cpu->halted && !whpx_apic_in_platform()) {
1696             cpu->exception_index = EXCP_HLT;
1697             qatomic_set(&cpu->exit_request, false);
1698             return 0;
1699         }
1700     }
1701 
1702     qemu_mutex_unlock_iothread();
1703 
1704     if (exclusive_step_mode != WHPX_STEP_NONE) {
1705         start_exclusive();
1706         g_assert(cpu == current_cpu);
1707         g_assert(!cpu->running);
1708         cpu->running = true;
1709 
1710         hr = whpx_set_exception_exit_bitmap(
1711             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1712         if (!SUCCEEDED(hr)) {
1713             error_report("WHPX: Failed to update exception exit mask, "
1714                          "hr=%08lx.", hr);
1715             return 1;
1716         }
1717 
1718         if (stepped_over_bp) {
1719             /* Temporarily disable the triggered breakpoint. */
1720             cpu_memory_rw_debug(cpu,
1721                 stepped_over_bp->address,
1722                 &stepped_over_bp->original_instruction,
1723                 1,
1724                 true);
1725         }
1726     } else {
1727         cpu_exec_start(cpu);
1728     }
1729 
1730     do {
1731         if (cpu->vcpu_dirty) {
1732             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1733             cpu->vcpu_dirty = false;
1734         }
1735 
1736         if (exclusive_step_mode == WHPX_STEP_NONE) {
1737             whpx_vcpu_pre_run(cpu);
1738 
1739             if (qatomic_read(&cpu->exit_request)) {
1740                 whpx_vcpu_kick(cpu);
1741             }
1742         }
1743 
1744         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1745             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1746         }
1747 
1748         hr = whp_dispatch.WHvRunVirtualProcessor(
1749             whpx->partition, cpu->cpu_index,
1750             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1751 
1752         if (FAILED(hr)) {
1753             error_report("WHPX: Failed to exec a virtual processor,"
1754                          " hr=%08lx", hr);
1755             ret = -1;
1756             break;
1757         }
1758 
1759         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1760             whpx_vcpu_configure_single_stepping(cpu,
1761                 false,
1762                 &vcpu->exit_ctx.VpContext.Rflags);
1763         }
1764 
1765         whpx_vcpu_post_run(cpu);
1766 
1767         switch (vcpu->exit_ctx.ExitReason) {
1768         case WHvRunVpExitReasonMemoryAccess:
1769             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1770             break;
1771 
1772         case WHvRunVpExitReasonX64IoPortAccess:
1773             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1774             break;
1775 
1776         case WHvRunVpExitReasonX64InterruptWindow:
1777             vcpu->ready_for_pic_interrupt = 1;
1778             vcpu->window_registered = 0;
1779             ret = 0;
1780             break;
1781 
1782         case WHvRunVpExitReasonX64ApicEoi:
1783             assert(whpx_apic_in_platform());
1784             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1785             break;
1786 
1787         case WHvRunVpExitReasonX64Halt:
1788             /*
1789              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1790              * longer used.
1791              */
1792             ret = whpx_handle_halt(cpu);
1793             break;
1794 
1795         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1796             WHV_INTERRUPT_CONTROL ipi = {0};
1797             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1798             uint32_t delivery_mode =
1799                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1800             int dest_shorthand =
1801                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1802             bool broadcast = false;
1803             bool include_self = false;
1804             uint32_t i;
1805 
1806             /* We only registered for INIT and SIPI exits. */
1807             if ((delivery_mode != APIC_DM_INIT) &&
1808                 (delivery_mode != APIC_DM_SIPI)) {
1809                 error_report(
1810                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1811                 break;
1812             }
1813 
1814             if (delivery_mode == APIC_DM_INIT) {
1815                 ipi.Type = WHvX64InterruptTypeInit;
1816             } else {
1817                 ipi.Type = WHvX64InterruptTypeSipi;
1818             }
1819 
1820             ipi.DestinationMode =
1821                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1822                     WHvX64InterruptDestinationModeLogical :
1823                     WHvX64InterruptDestinationModePhysical;
1824 
1825             ipi.TriggerMode =
1826                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1827                     WHvX64InterruptTriggerModeLevel :
1828                     WHvX64InterruptTriggerModeEdge;
1829 
1830             ipi.Vector = icr & APIC_VECTOR_MASK;
1831             switch (dest_shorthand) {
1832             /* no shorthand. Bits 56-63 contain the destination. */
1833             case 0:
1834                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1835                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1836                         &ipi, sizeof(ipi));
1837                 if (FAILED(hr)) {
1838                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1839                         hr);
1840                 }
1841 
1842                 break;
1843 
1844             /* self */
1845             case 1:
1846                 include_self = true;
1847                 break;
1848 
1849             /* broadcast, including self */
1850             case 2:
1851                 broadcast = true;
1852                 include_self = true;
1853                 break;
1854 
1855             /* broadcast, excluding self */
1856             case 3:
1857                 broadcast = true;
1858                 break;
1859             }
1860 
1861             if (!broadcast && !include_self) {
1862                 break;
1863             }
1864 
1865             for (i = 0; i <= max_vcpu_index; i++) {
1866                 if (i == cpu->cpu_index && !include_self) {
1867                     continue;
1868                 }
1869 
1870                 /*
1871                  * Assuming that APIC Ids are identity mapped since
1872                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1873                  * are not handled yet and the hypervisor doesn't allow the
1874                  * guest to modify the APIC ID.
1875                  */
1876                 ipi.Destination = i;
1877                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1878                         &ipi, sizeof(ipi));
1879                 if (FAILED(hr)) {
1880                     error_report(
1881                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1882                         i, hr);
1883                 }
1884             }
1885 
1886             break;
1887         }
1888 
1889         case WHvRunVpExitReasonCanceled:
1890             if (exclusive_step_mode != WHPX_STEP_NONE) {
1891                 /*
1892                  * We are trying to step over a single instruction, and
1893                  * likely got a request to stop from another thread.
1894                  * Delay it until we are done stepping
1895                  * over.
1896                  */
1897                 ret = 0;
1898             } else {
1899                 cpu->exception_index = EXCP_INTERRUPT;
1900                 ret = 1;
1901             }
1902             break;
1903         case WHvRunVpExitReasonX64MsrAccess: {
1904             WHV_REGISTER_VALUE reg_values[3] = {0};
1905             WHV_REGISTER_NAME reg_names[3];
1906             UINT32 reg_count;
1907 
1908             reg_names[0] = WHvX64RegisterRip;
1909             reg_names[1] = WHvX64RegisterRax;
1910             reg_names[2] = WHvX64RegisterRdx;
1911 
1912             reg_values[0].Reg64 =
1913                 vcpu->exit_ctx.VpContext.Rip +
1914                 vcpu->exit_ctx.VpContext.InstructionLength;
1915 
1916             /*
1917              * For all unsupported MSR access we:
1918              *     ignore writes
1919              *     return 0 on read.
1920              */
1921             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1922                         1 : 3;
1923 
1924             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1925                 whpx->partition,
1926                 cpu->cpu_index,
1927                 reg_names, reg_count,
1928                 reg_values);
1929 
1930             if (FAILED(hr)) {
1931                 error_report("WHPX: Failed to set MsrAccess state "
1932                              " registers, hr=%08lx", hr);
1933             }
1934             ret = 0;
1935             break;
1936         }
1937         case WHvRunVpExitReasonX64Cpuid: {
1938             WHV_REGISTER_VALUE reg_values[5];
1939             WHV_REGISTER_NAME reg_names[5];
1940             UINT32 reg_count = 5;
1941             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1942             X86CPU *x86_cpu = X86_CPU(cpu);
1943             CPUX86State *env = &x86_cpu->env;
1944 
1945             memset(reg_values, 0, sizeof(reg_values));
1946 
1947             rip = vcpu->exit_ctx.VpContext.Rip +
1948                   vcpu->exit_ctx.VpContext.InstructionLength;
1949             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1950 
1951             /*
1952              * Ideally, these should be supplied to the hypervisor during VCPU
1953              * initialization and it should be able to satisfy this request.
1954              * But, currently, WHPX doesn't support setting CPUID values in the
1955              * hypervisor once the partition has been setup, which is too late
1956              * since VCPUs are realized later. For now, use the values from
1957              * QEMU to satisfy these requests, until WHPX adds support for
1958              * being able to set these values in the hypervisor at runtime.
1959              */
1960             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1961                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1962             switch (cpuid_fn) {
1963             case 0x40000000:
1964                 /* Expose the vmware cpu frequency cpuid leaf */
1965                 rax = 0x40000010;
1966                 rbx = rcx = rdx = 0;
1967                 break;
1968 
1969             case 0x40000010:
1970                 rax = env->tsc_khz;
1971                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1972                 rcx = rdx = 0;
1973                 break;
1974 
1975             case 0x80000001:
1976                 /* Remove any support of OSVW */
1977                 rcx &= ~CPUID_EXT3_OSVW;
1978                 break;
1979             }
1980 
1981             reg_names[0] = WHvX64RegisterRip;
1982             reg_names[1] = WHvX64RegisterRax;
1983             reg_names[2] = WHvX64RegisterRcx;
1984             reg_names[3] = WHvX64RegisterRdx;
1985             reg_names[4] = WHvX64RegisterRbx;
1986 
1987             reg_values[0].Reg64 = rip;
1988             reg_values[1].Reg64 = rax;
1989             reg_values[2].Reg64 = rcx;
1990             reg_values[3].Reg64 = rdx;
1991             reg_values[4].Reg64 = rbx;
1992 
1993             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1994                 whpx->partition, cpu->cpu_index,
1995                 reg_names,
1996                 reg_count,
1997                 reg_values);
1998 
1999             if (FAILED(hr)) {
2000                 error_report("WHPX: Failed to set CpuidAccess state registers,"
2001                              " hr=%08lx", hr);
2002             }
2003             ret = 0;
2004             break;
2005         }
2006         case WHvRunVpExitReasonException:
2007             whpx_get_registers(cpu);
2008 
2009             if ((vcpu->exit_ctx.VpException.ExceptionType ==
2010                  WHvX64ExceptionTypeDebugTrapOrFault) &&
2011                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
2012                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
2013                  whpx_breakpoint_instruction)) {
2014                 /* Stopped at a software breakpoint. */
2015                 cpu->exception_index = EXCP_DEBUG;
2016             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
2017                         WHvX64ExceptionTypeDebugTrapOrFault) &&
2018                        !cpu->singlestep_enabled) {
2019                 /*
2020                  * Just finished stepping over a breakpoint, but the
2021                  * gdb does not expect us to do single-stepping.
2022                  * Don't do anything special.
2023                  */
2024                 cpu->exception_index = EXCP_INTERRUPT;
2025             } else {
2026                 /* Another exception or debug event. Report it to GDB. */
2027                 cpu->exception_index = EXCP_DEBUG;
2028             }
2029 
2030             ret = 1;
2031             break;
2032         case WHvRunVpExitReasonNone:
2033         case WHvRunVpExitReasonUnrecoverableException:
2034         case WHvRunVpExitReasonInvalidVpRegisterValue:
2035         case WHvRunVpExitReasonUnsupportedFeature:
2036         default:
2037             error_report("WHPX: Unexpected VP exit code %d",
2038                          vcpu->exit_ctx.ExitReason);
2039             whpx_get_registers(cpu);
2040             qemu_mutex_lock_iothread();
2041             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2042             qemu_mutex_unlock_iothread();
2043             break;
2044         }
2045 
2046     } while (!ret);
2047 
2048     if (stepped_over_bp) {
2049         /* Restore the breakpoint we stepped over */
2050         cpu_memory_rw_debug(cpu,
2051             stepped_over_bp->address,
2052             (void *)&whpx_breakpoint_instruction,
2053             1,
2054             true);
2055     }
2056 
2057     if (exclusive_step_mode != WHPX_STEP_NONE) {
2058         g_assert(cpu_in_exclusive_context(cpu));
2059         cpu->running = false;
2060         end_exclusive();
2061 
2062         exclusive_step_mode = WHPX_STEP_NONE;
2063     } else {
2064         cpu_exec_end(cpu);
2065     }
2066 
2067     qemu_mutex_lock_iothread();
2068     current_cpu = cpu;
2069 
2070     if (--whpx->running_cpus == 0) {
2071         whpx_last_vcpu_stopping(cpu);
2072     }
2073 
2074     qatomic_set(&cpu->exit_request, false);
2075 
2076     return ret < 0;
2077 }
2078 
2079 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2080 {
2081     if (!cpu->vcpu_dirty) {
2082         whpx_get_registers(cpu);
2083         cpu->vcpu_dirty = true;
2084     }
2085 }
2086 
2087 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
2088                                                run_on_cpu_data arg)
2089 {
2090     whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
2091     cpu->vcpu_dirty = false;
2092 }
2093 
2094 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
2095                                               run_on_cpu_data arg)
2096 {
2097     whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
2098     cpu->vcpu_dirty = false;
2099 }
2100 
2101 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
2102                                                run_on_cpu_data arg)
2103 {
2104     cpu->vcpu_dirty = true;
2105 }
2106 
2107 /*
2108  * CPU support.
2109  */
2110 
2111 void whpx_cpu_synchronize_state(CPUState *cpu)
2112 {
2113     if (!cpu->vcpu_dirty) {
2114         run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
2115     }
2116 }
2117 
2118 void whpx_cpu_synchronize_post_reset(CPUState *cpu)
2119 {
2120     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2121 }
2122 
2123 void whpx_cpu_synchronize_post_init(CPUState *cpu)
2124 {
2125     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2126 }
2127 
2128 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
2129 {
2130     run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2131 }
2132 
2133 void whpx_cpu_synchronize_pre_resume(bool step_pending)
2134 {
2135     whpx_global.step_pending = step_pending;
2136 }
2137 
2138 /*
2139  * Vcpu support.
2140  */
2141 
/*
 * Migration blocker shared by all vCPUs. It is created and registered by
 * whpx_init_vcpu() when it is still NULL.
 */
static Error *whpx_migration_blocker;
2143 
2144 static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
2145 {
2146     CPUX86State *env = opaque;
2147 
2148     if (running) {
2149         env->tsc_valid = false;
2150     }
2151 }
2152 
2153 int whpx_init_vcpu(CPUState *cpu)
2154 {
2155     HRESULT hr;
2156     struct whpx_state *whpx = &whpx_global;
2157     struct whpx_vcpu *vcpu = NULL;
2158     Error *local_error = NULL;
2159     CPUX86State *env = cpu->env_ptr;
2160     X86CPU *x86_cpu = X86_CPU(cpu);
2161     UINT64 freq = 0;
2162     int ret;
2163 
2164     /* Add migration blockers for all unsupported features of the
2165      * Windows Hypervisor Platform
2166      */
2167     if (whpx_migration_blocker == NULL) {
2168         error_setg(&whpx_migration_blocker,
2169                "State blocked due to non-migratable CPUID feature support,"
2170                "dirty memory tracking support, and XSAVE/XRSTOR support");
2171 
2172         if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) {
2173             error_report_err(local_error);
2174             error_free(whpx_migration_blocker);
2175             ret = -EINVAL;
2176             goto error;
2177         }
2178     }
2179 
2180     vcpu = g_new0(struct whpx_vcpu, 1);
2181 
2182     if (!vcpu) {
2183         error_report("WHPX: Failed to allocte VCPU context.");
2184         ret = -ENOMEM;
2185         goto error;
2186     }
2187 
2188     hr = whp_dispatch.WHvEmulatorCreateEmulator(
2189         &whpx_emu_callbacks,
2190         &vcpu->emulator);
2191     if (FAILED(hr)) {
2192         error_report("WHPX: Failed to setup instruction completion support,"
2193                      " hr=%08lx", hr);
2194         ret = -EINVAL;
2195         goto error;
2196     }
2197 
2198     hr = whp_dispatch.WHvCreateVirtualProcessor(
2199         whpx->partition, cpu->cpu_index, 0);
2200     if (FAILED(hr)) {
2201         error_report("WHPX: Failed to create a virtual processor,"
2202                      " hr=%08lx", hr);
2203         whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2204         ret = -EINVAL;
2205         goto error;
2206     }
2207 
2208     /*
2209      * vcpu's TSC frequency is either specified by user, or use the value
2210      * provided by Hyper-V if the former is not present. In the latter case, we
2211      * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
2212      * frequency can be migrated later via this field.
2213      */
2214     if (!env->tsc_khz) {
2215         hr = whp_dispatch.WHvGetCapability(
2216             WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
2217                 NULL);
2218         if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2219             if (FAILED(hr)) {
2220                 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
2221             } else {
2222                 env->tsc_khz = freq / 1000; /* Hz to KHz */
2223             }
2224         }
2225     }
2226 
2227     env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
2228     hr = whp_dispatch.WHvGetCapability(
2229         WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
2230     if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2231         if (FAILED(hr)) {
2232             printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
2233         } else {
2234             env->apic_bus_freq = freq;
2235         }
2236     }
2237 
2238     /*
2239      * If the vmware cpuid frequency leaf option is set, and we have a valid
2240      * tsc value, trap the corresponding cpuid's.
2241      */
2242     if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
2243         UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
2244 
2245         hr = whp_dispatch.WHvSetPartitionProperty(
2246                 whpx->partition,
2247                 WHvPartitionPropertyCodeCpuidExitList,
2248                 cpuidExitList,
2249                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2250 
2251         if (FAILED(hr)) {
2252             error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2253                         hr);
2254             ret = -EINVAL;
2255             goto error;
2256         }
2257     }
2258 
2259     vcpu->interruptable = true;
2260     cpu->vcpu_dirty = true;
2261     cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu;
2262     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
2263     qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr);
2264 
2265     return 0;
2266 
2267 error:
2268     g_free(vcpu);
2269 
2270     return ret;
2271 }
2272 
2273 int whpx_vcpu_exec(CPUState *cpu)
2274 {
2275     int ret;
2276     int fatal;
2277 
2278     for (;;) {
2279         if (cpu->exception_index >= EXCP_INTERRUPT) {
2280             ret = cpu->exception_index;
2281             cpu->exception_index = -1;
2282             break;
2283         }
2284 
2285         fatal = whpx_vcpu_run(cpu);
2286 
2287         if (fatal) {
2288             error_report("WHPX: Failed to exec a virtual processor");
2289             abort();
2290         }
2291     }
2292 
2293     return ret;
2294 }
2295 
2296 void whpx_destroy_vcpu(CPUState *cpu)
2297 {
2298     struct whpx_state *whpx = &whpx_global;
2299     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
2300 
2301     whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
2302     whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2303     g_free(cpu->hax_vcpu);
2304     return;
2305 }
2306 
2307 void whpx_vcpu_kick(CPUState *cpu)
2308 {
2309     struct whpx_state *whpx = &whpx_global;
2310     whp_dispatch.WHvCancelRunVirtualProcessor(
2311         whpx->partition, cpu->cpu_index, 0);
2312 }
2313 
2314 /*
2315  * Memory support.
2316  */
2317 
2318 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
2319                                 void *host_va, int add, int rom,
2320                                 const char *name)
2321 {
2322     struct whpx_state *whpx = &whpx_global;
2323     HRESULT hr;
2324 
2325     /*
2326     if (add) {
2327         printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2328                (void*)start_pa, (void*)size, host_va,
2329                (rom ? "ROM" : "RAM"), name);
2330     } else {
2331         printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
2332                (void*)start_pa, (void*)size, host_va, name);
2333     }
2334     */
2335 
2336     if (add) {
2337         hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
2338                                          host_va,
2339                                          start_pa,
2340                                          size,
2341                                          (WHvMapGpaRangeFlagRead |
2342                                           WHvMapGpaRangeFlagExecute |
2343                                           (rom ? 0 : WHvMapGpaRangeFlagWrite)));
2344     } else {
2345         hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
2346                                            start_pa,
2347                                            size);
2348     }
2349 
2350     if (FAILED(hr)) {
2351         error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2352                      " Host:%p, hr=%08lx",
2353                      (add ? "MAP" : "UNMAP"), name,
2354                      (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
2355     }
2356 }
2357 
/*
 * Map (@add != 0) or unmap (@add == 0) the RAM behind @section in the
 * WHPX partition. Non-RAM regions are ignored. The range is trimmed before
 * being handed to whpx_update_mapping(); if nothing remains, the section
 * is skipped.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int delta;
    uint64_t host_va;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    /*
     * delta is the number of bytes to skip at the start of the section.
     * NOTE(review): presumably this rounds start_pa up to the next host
     * page boundary (0 when already aligned) -- confirm against the
     * qemu_real_host_page_size()/qemu_real_host_page_mask() definitions.
     */
    delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
    delta &= ~qemu_real_host_page_mask();
    if (delta > size) {
        return;
    }
    start_pa += delta;
    size -= delta;
    /* Presumably drops a trailing partial host page from the size. */
    size &= qemu_real_host_page_mask();
    if (!size || (start_pa & ~qemu_real_host_page_mask())) {
        return;
    }

    /* Apply the same delta to the host address so both sides stay in step. */
    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + delta;

    whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}
2388 
2389 static void whpx_region_add(MemoryListener *listener,
2390                            MemoryRegionSection *section)
2391 {
2392     memory_region_ref(section->mr);
2393     whpx_process_section(section, 1);
2394 }
2395 
2396 static void whpx_region_del(MemoryListener *listener,
2397                            MemoryRegionSection *section)
2398 {
2399     whpx_process_section(section, 0);
2400     memory_region_unref(section->mr);
2401 }
2402 
2403 static void whpx_transaction_begin(MemoryListener *listener)
2404 {
2405 }
2406 
2407 static void whpx_transaction_commit(MemoryListener *listener)
2408 {
2409 }
2410 
2411 static void whpx_log_sync(MemoryListener *listener,
2412                          MemoryRegionSection *section)
2413 {
2414     MemoryRegion *mr = section->mr;
2415 
2416     if (!memory_region_is_ram(mr)) {
2417         return;
2418     }
2419 
2420     memory_region_set_dirty(mr, 0, int128_get64(section->size));
2421 }
2422 
2423 static MemoryListener whpx_memory_listener = {
2424     .name = "whpx",
2425     .begin = whpx_transaction_begin,
2426     .commit = whpx_transaction_commit,
2427     .region_add = whpx_region_add,
2428     .region_del = whpx_region_del,
2429     .log_sync = whpx_log_sync,
2430     .priority = 10,
2431 };
2432 
2433 static void whpx_memory_init(void)
2434 {
2435     memory_listener_register(&whpx_memory_listener, &address_space_memory);
2436 }
2437 
2438 /*
2439  * Load the functions from the given library, using the given handle. If a
2440  * handle is provided, it is used, otherwise the library is opened. The
2441  * handle will be updated on return with the opened one.
2442  */
/*
 * @handle:        in/out library handle; a NULL handle causes the library
 *                 for @function_list to be loaded with LoadLibrary()
 * @function_list: which set of whp_dispatch entries to resolve
 *
 * Returns true on success, with *handle set to the library handle in use.
 * Returns false on failure; *handle is then left unchanged.
 *
 * NOTE(review): the error path calls FreeLibrary() on the handle even when
 * it was supplied by the caller -- confirm callers do not free it again.
 */
static bool load_whp_dispatch_fns(HMODULE *handle,
    WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

    #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
    #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
    /* Resolve a function; a missing symbol leaves the entry NULL. */
    #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \

    /* Resolve a function; a missing symbol is reported and is fatal. */
    #define WHP_LOAD_FIELD(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \
        if (!whp_dispatch.function_name) { \
            error_report("Could not load function %s", #function_name); \
            goto error; \
        } \

    /* Load the library only if no handle is available yet. */
    #define WHP_LOAD_LIB(lib_name, handle_lib) \
    if (!handle_lib) { \
        handle_lib = LoadLibrary(lib_name); \
        if (!handle_lib) { \
            error_report("Could not load library %s.", lib_name); \
            goto error; \
        } \
    } \

    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        /* Supplemental functions are optional: missing ones stay NULL. */
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    if (hLib) {
        FreeLibrary(hLib);
    }

    return false;
}
2498 
2499 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2500                                    const char *name, void *opaque,
2501                                    Error **errp)
2502 {
2503     struct whpx_state *whpx = &whpx_global;
2504     OnOffSplit mode;
2505 
2506     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2507         return;
2508     }
2509 
2510     switch (mode) {
2511     case ON_OFF_SPLIT_ON:
2512         whpx->kernel_irqchip_allowed = true;
2513         whpx->kernel_irqchip_required = true;
2514         break;
2515 
2516     case ON_OFF_SPLIT_OFF:
2517         whpx->kernel_irqchip_allowed = false;
2518         whpx->kernel_irqchip_required = false;
2519         break;
2520 
2521     case ON_OFF_SPLIT_SPLIT:
2522         error_setg(errp, "WHPX: split irqchip currently not supported");
2523         error_append_hint(errp,
2524             "Try without kernel-irqchip or with kernel-irqchip=on|off");
2525         break;
2526 
2527     default:
2528         /*
2529          * The value was checked in visit_type_OnOffSplit() above. If
2530          * we get here, then something is wrong in QEMU.
2531          */
2532         abort();
2533     }
2534 }
2535 
2536 /*
2537  * Partition support
2538  */
2539 
2540 static int whpx_accel_init(MachineState *ms)
2541 {
2542     struct whpx_state *whpx;
2543     int ret;
2544     HRESULT hr;
2545     WHV_CAPABILITY whpx_cap;
2546     UINT32 whpx_cap_size;
2547     WHV_PARTITION_PROPERTY prop;
2548     UINT32 cpuidExitList[] = {1, 0x80000001};
2549     WHV_CAPABILITY_FEATURES features = {0};
2550 
2551     whpx = &whpx_global;
2552 
2553     if (!init_whp_dispatch()) {
2554         ret = -ENOSYS;
2555         goto error;
2556     }
2557 
2558     whpx->mem_quota = ms->ram_size;
2559 
2560     hr = whp_dispatch.WHvGetCapability(
2561         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2562         sizeof(whpx_cap), &whpx_cap_size);
2563     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2564         error_report("WHPX: No accelerator found, hr=%08lx", hr);
2565         ret = -ENOSPC;
2566         goto error;
2567     }
2568 
2569     hr = whp_dispatch.WHvGetCapability(
2570         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2571     if (FAILED(hr)) {
2572         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2573         ret = -EINVAL;
2574         goto error;
2575     }
2576 
2577     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2578     if (FAILED(hr)) {
2579         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2580         ret = -EINVAL;
2581         goto error;
2582     }
2583 
2584     /*
2585      * Query the XSAVE capability of the partition. Any error here is not
2586      * considered fatal.
2587      */
2588     hr = whp_dispatch.WHvGetPartitionProperty(
2589         whpx->partition,
2590         WHvPartitionPropertyCodeProcessorXsaveFeatures,
2591         &whpx_xsave_cap,
2592         sizeof(whpx_xsave_cap),
2593         &whpx_cap_size);
2594 
2595     /*
2596      * Windows version which don't support this property will return with the
2597      * specific error code.
2598      */
2599     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2600         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2601     }
2602 
2603     if (!whpx_has_xsave()) {
2604         printf("WHPX: Partition is not XSAVE capable\n");
2605     }
2606 
2607     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2608     prop.ProcessorCount = ms->smp.cpus;
2609     hr = whp_dispatch.WHvSetPartitionProperty(
2610         whpx->partition,
2611         WHvPartitionPropertyCodeProcessorCount,
2612         &prop,
2613         sizeof(WHV_PARTITION_PROPERTY));
2614 
2615     if (FAILED(hr)) {
2616         error_report("WHPX: Failed to set partition core count to %d,"
2617                      " hr=%08lx", ms->smp.cores, hr);
2618         ret = -EINVAL;
2619         goto error;
2620     }
2621 
2622     /*
2623      * Error out if WHP doesn't support apic emulation and user is requiring
2624      * it.
2625      */
2626     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2627             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2628         error_report("WHPX: kernel irqchip requested, but unavailable. "
2629             "Try without kernel-irqchip or with kernel-irqchip=off");
2630         ret = -EINVAL;
2631         goto error;
2632     }
2633 
2634     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2635         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2636         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2637             WHvX64LocalApicEmulationModeXApic;
2638         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2639         hr = whp_dispatch.WHvSetPartitionProperty(
2640             whpx->partition,
2641             WHvPartitionPropertyCodeLocalApicEmulationMode,
2642             &mode,
2643             sizeof(mode));
2644         if (FAILED(hr)) {
2645             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2646             if (whpx->kernel_irqchip_required) {
2647                 error_report("WHPX: kernel irqchip requested, but unavailable");
2648                 ret = -EINVAL;
2649                 goto error;
2650             }
2651         } else {
2652             whpx->apic_in_platform = true;
2653         }
2654     }
2655 
2656     /* Register for MSR and CPUID exits */
2657     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2658     prop.ExtendedVmExits.X64MsrExit = 1;
2659     prop.ExtendedVmExits.X64CpuidExit = 1;
2660     prop.ExtendedVmExits.ExceptionExit = 1;
2661     if (whpx_apic_in_platform()) {
2662         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2663     }
2664 
2665     hr = whp_dispatch.WHvSetPartitionProperty(
2666             whpx->partition,
2667             WHvPartitionPropertyCodeExtendedVmExits,
2668             &prop,
2669             sizeof(WHV_PARTITION_PROPERTY));
2670     if (FAILED(hr)) {
2671         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2672         ret = -EINVAL;
2673         goto error;
2674     }
2675 
2676     hr = whp_dispatch.WHvSetPartitionProperty(
2677         whpx->partition,
2678         WHvPartitionPropertyCodeCpuidExitList,
2679         cpuidExitList,
2680         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2681 
2682     if (FAILED(hr)) {
2683         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2684                      hr);
2685         ret = -EINVAL;
2686         goto error;
2687     }
2688 
2689     /*
2690      * We do not want to intercept any exceptions from the guest,
2691      * until we actually start debugging with gdb.
2692      */
2693     whpx->exception_exit_bitmap = -1;
2694     hr = whpx_set_exception_exit_bitmap(0);
2695 
2696     if (FAILED(hr)) {
2697         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2698         ret = -EINVAL;
2699         goto error;
2700     }
2701 
2702     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2703     if (FAILED(hr)) {
2704         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2705         ret = -EINVAL;
2706         goto error;
2707     }
2708 
2709     whpx_memory_init();
2710 
2711     printf("Windows Hypervisor Platform accelerator is operational\n");
2712     return 0;
2713 
2714 error:
2715 
2716     if (NULL != whpx->partition) {
2717         whp_dispatch.WHvDeletePartition(whpx->partition);
2718         whpx->partition = NULL;
2719     }
2720 
2721     return ret;
2722 }
2723 
2724 int whpx_enabled(void)
2725 {
2726     return whpx_allowed;
2727 }
2728 
2729 bool whpx_apic_in_platform(void) {
2730     return whpx_global.apic_in_platform;
2731 }
2732 
2733 static void whpx_accel_class_init(ObjectClass *oc, void *data)
2734 {
2735     AccelClass *ac = ACCEL_CLASS(oc);
2736     ac->name = "WHPX";
2737     ac->init_machine = whpx_accel_init;
2738     ac->allowed = &whpx_allowed;
2739 
2740     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
2741         NULL, whpx_set_kernel_irqchip,
2742         NULL, NULL);
2743     object_class_property_set_description(oc, "kernel-irqchip",
2744         "Configure WHPX in-kernel irqchip");
2745 }
2746 
2747 static void whpx_accel_instance_init(Object *obj)
2748 {
2749     struct whpx_state *whpx = &whpx_global;
2750 
2751     memset(whpx, 0, sizeof(struct whpx_state));
2752     /* Turn on kernel-irqchip, by default */
2753     whpx->kernel_irqchip_allowed = true;
2754 }
2755 
2756 static const TypeInfo whpx_accel_type = {
2757     .name = ACCEL_CLASS_NAME("whpx"),
2758     .parent = TYPE_ACCEL,
2759     .instance_init = whpx_accel_instance_init,
2760     .class_init = whpx_accel_class_init,
2761 };
2762 
2763 static void whpx_type_init(void)
2764 {
2765     type_register_static(&whpx_accel_type);
2766 }
2767 
2768 bool init_whp_dispatch(void)
2769 {
2770     if (whp_dispatch_initialized) {
2771         return true;
2772     }
2773 
2774     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2775         goto error;
2776     }
2777 
2778     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2779         goto error;
2780     }
2781 
2782     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2783         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2784     whp_dispatch_initialized = true;
2785 
2786     return true;
2787 error:
2788     if (hWinHvPlatform) {
2789         FreeLibrary(hWinHvPlatform);
2790     }
2791 
2792     if (hWinHvEmulation) {
2793         FreeLibrary(hWinHvEmulation);
2794     }
2795 
2796     return false;
2797 }
2798 
2799 type_init(whpx_type_init);
2800