xref: /openbmc/qemu/target/i386/whpx/whpx-all.c (revision bac4711b)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "exec/address-spaces.h"
14 #include "exec/ioport.h"
15 #include "gdbstub/helpers.h"
16 #include "qemu/accel.h"
17 #include "sysemu/whpx.h"
18 #include "sysemu/cpus.h"
19 #include "sysemu/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/intc/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include <winerror.h>
30 
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
33 
34 #include <winhvplatform.h>
35 #include <winhvemulation.h>
36 
37 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
38 
/*
 * Registers that are transferred in bulk between QEMU's CPUX86State and the
 * hypervisor by whpx_set_registers() and whpx_get_registers().
 *
 * The order of the entries is significant: both functions walk this table
 * with a running index and assert that each slot names the register they
 * expect, so an entry must not be added, removed or moved without updating
 * both functions. Names that are commented out below are not transferred
 * through this table.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers (not transferred) */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs; the 64-bit-only ones exist only in TARGET_X86_64 builds */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers (not transferred) */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};
146 
/*
 * Value buffer matching whpx_register_names[]: values[i] holds the value of
 * the register named by whpx_register_names[i].
 */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
150 
151 /*
152  * The current implementation of instruction stepping sets the TF flag
153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
155  *
156  * This approach has a few limitations:
157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158  *        along with the other flags, possibly restoring it later. It would
159  *        result in another INT1 when the flags are restored, triggering
160  *        a stop in gdb that could be cleared by doing another step.
161  *
162  *        Stepping over a POPF/LAHF instruction will let it overwrite the
163  *        TF flags, ending the stepping mode.
164  *
165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166  *        or anything that could result in a page fault) will save the flags
167  *        to the stack, clear the TF flag, and let the guest execute the
168  *        handler. Normally, the guest will restore the original flags,
169  *        that will continue single-stepping.
170  *
171  *     3. Debuggers running on the guest may wish to set TF to do instruction
172  *        stepping. INT1 events generated by it would be intercepted by us,
173  *        as long as the gdb is connected to QEMU.
174  *
175  * In practice this means that:
176  *     1. Stepping through flags-modifying instructions may cause gdb to
177  *        continue or stop in unexpected places. This will be fully recoverable
178  *        and will not crash the target.
179  *
180  *     2. Stepping over an instruction that triggers an exception will step
181  *        over the exception handler, not into it.
182  *
183  *     3. Debugging the guest via gdb, while running debugger on the guest
184  *        at the same time may lead to unexpected effects. Removing all
185  *        breakpoints set via QEMU will prevent any further interference
186  *        with the guest-level debuggers.
187  *
188  * The limitations can be addressed as shown below:
 *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
 *        stepping through them. The exact semantics of these instructions are
 *        defined in the "Combined Volume Set of Intel 64 and IA-32
 *        Architectures Software Developer's Manuals"; however, emulating them
 *        involves a fair number of corner cases due to compatibility with real
 *        mode, virtual 8086 mode, and differences between 64-bit and 32-bit
 *        modes.
195  *
196  *     2. We could step into the guest's exception handlers using the following
197  *        sequence:
198  *          a. Temporarily enable catching of all exception types via
199  *             whpx_set_exception_exit_bitmap().
200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
201  *             the original handler.
202  *          c. Patch the original handler, injecting an INT3 at the beginning.
203  *          d. Update the exception exit bitmap to only catch the
204  *             WHvX64ExceptionTypeBreakpointTrap exception.
205  *          e. Let the affected CPU run in the exclusive mode.
206  *          f. Restore the original handler and the exception exit bitmap.
207  *        Note that handling all corner cases related to IDT/GDT is harder
208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
209  *        rough idea.
210  *
211  *     3. In order to properly support guest-level debugging in parallel with
212  *        the QEMU-level debugging, we would need to be able to pass some INT1
213  *        events to the guest. This could be done via the following methods:
214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215  *             it seems to only work for interrupts and not software
216  *             exceptions.
217  *          b. Locating and patching the original handler by parsing IDT/GDT.
218  *             This involves relatively complex logic outlined in the previous
219  *             paragraph.
220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
221  *             RFLAGS, and pushing the old values to stack). This is even more
222  *             complicated than the previous option, since it involves checking
223  *             CPL, gate attributes, and doing various adjustments depending
224  *             on the current CPU mode, whether the CPL is changing, etc.
225  */
/* Instruction-stepping mode requested for a vCPU. */
typedef enum WhpxStepMode {
    /* No stepping requested */
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
231 
/* Per-vCPU WHPX state, reached through cpu->accel. */
struct AccelCPUState {
    /* Instruction emulator handle used for MMIO and port I/O emulation */
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    /* TPR in CR8 encoding, as last exchanged with the hypervisor */
    uint64_t tpr;
    /* APIC base value, as last exchanged with the hypervisor */
    uint64_t apic_base;
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
244 
static bool whpx_allowed;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
/* XSAVE feature data consulted by whpx_has_xsave() */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

/* Global accelerator state; provides the partition handle for all calls */
struct whpx_state whpx_global;
/*
 * Function-pointer table through which the WinHv* APIs are invoked. Not
 * every entry is guaranteed to be set (whpx_set_tsc() checks
 * WHvSuspendPartitionTime before calling it).
 */
struct WHPDispatch whp_dispatch;
253 
254 static bool whpx_has_xsave(void)
255 {
256     return whpx_xsave_cap.XsaveSupport;
257 }
258 
259 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
260                                              int r86)
261 {
262     WHV_X64_SEGMENT_REGISTER hs;
263     unsigned flags = qs->flags;
264 
265     hs.Base = qs->base;
266     hs.Limit = qs->limit;
267     hs.Selector = qs->selector;
268 
269     if (v86) {
270         hs.Attributes = 0;
271         hs.SegmentType = 3;
272         hs.Present = 1;
273         hs.DescriptorPrivilegeLevel = 3;
274         hs.NonSystemSegment = 1;
275 
276     } else {
277         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
278 
279         if (r86) {
280             /* hs.Base &= 0xfffff; */
281         }
282     }
283 
284     return hs;
285 }
286 
287 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
288 {
289     SegmentCache qs;
290 
291     qs.base = hs->Base;
292     qs.limit = hs->Limit;
293     qs.selector = hs->Selector;
294 
295     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
296 
297     return qs;
298 }
299 
300 /* X64 Extended Control Registers */
301 static void whpx_set_xcrs(CPUState *cpu)
302 {
303     CPUX86State *env = cpu->env_ptr;
304     HRESULT hr;
305     struct whpx_state *whpx = &whpx_global;
306     WHV_REGISTER_VALUE xcr0;
307     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
308 
309     if (!whpx_has_xsave()) {
310         return;
311     }
312 
313     /* Only xcr0 is supported by the hypervisor currently */
314     xcr0.Reg64 = env->xcr0;
315     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
316         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
317     if (FAILED(hr)) {
318         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
319     }
320 }
321 
322 static int whpx_set_tsc(CPUState *cpu)
323 {
324     CPUX86State *env = cpu->env_ptr;
325     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
326     WHV_REGISTER_VALUE tsc_val;
327     HRESULT hr;
328     struct whpx_state *whpx = &whpx_global;
329 
330     /*
331      * Suspend the partition prior to setting the TSC to reduce the variance
332      * in TSC across vCPUs. When the first vCPU runs post suspend, the
333      * partition is automatically resumed.
334      */
335     if (whp_dispatch.WHvSuspendPartitionTime) {
336 
337         /*
338          * Unable to suspend partition while setting TSC is not a fatal
339          * error. It just increases the likelihood of TSC variance between
340          * vCPUs and some guest OS are able to handle that just fine.
341          */
342         hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
343         if (FAILED(hr)) {
344             warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
345         }
346     }
347 
348     tsc_val.Reg64 = env->tsc;
349     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
350         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
351     if (FAILED(hr)) {
352         error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
353         return -1;
354     }
355 
356     return 0;
357 }
358 
359 /*
360  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
361  * however, they use a slightly different encoding. Specifically:
362  *
363  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
364  *
365  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
366  * and IA-32 Architectures Software Developer's Manual.
367  *
368  * The functions below translate the value of CR8 to TPR and vice versa.
369  */
370 
/* Translate an APIC TPR value to CR8 encoding: CR8[3:0] = TPR[7:4]. */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    const unsigned tpr_class_shift = 4;

    return tpr >> tpr_class_shift;
}
375 
/* Translate a CR8 value to APIC TPR encoding: TPR[7:4] = CR8[3:0]. */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    const unsigned tpr_class_shift = 4;

    return cr8 << tpr_class_shift;
}
380 
381 static void whpx_set_registers(CPUState *cpu, int level)
382 {
383     struct whpx_state *whpx = &whpx_global;
384     AccelCPUState *vcpu = cpu->accel;
385     CPUX86State *env = cpu->env_ptr;
386     X86CPU *x86_cpu = X86_CPU(cpu);
387     struct whpx_register_set vcxt;
388     HRESULT hr;
389     int idx;
390     int idx_next;
391     int i;
392     int v86, r86;
393 
394     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
395 
396     /*
397      * Following MSRs have side effects on the guest or are too heavy for
398      * runtime. Limit them to full state update.
399      */
400     if (level >= WHPX_SET_RESET_STATE) {
401         whpx_set_tsc(cpu);
402     }
403 
404     memset(&vcxt, 0, sizeof(struct whpx_register_set));
405 
406     v86 = (env->eflags & VM_MASK);
407     r86 = !(env->cr[0] & CR0_PE_MASK);
408 
409     vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
410     vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);
411 
412     idx = 0;
413 
414     /* Indexes for first 16 registers match between HV and QEMU definitions */
415     idx_next = 16;
416     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
417         vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
418     }
419     idx = idx_next;
420 
421     /* Same goes for RIP and RFLAGS */
422     assert(whpx_register_names[idx] == WHvX64RegisterRip);
423     vcxt.values[idx++].Reg64 = env->eip;
424 
425     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
426     vcxt.values[idx++].Reg64 = env->eflags;
427 
428     /* Translate 6+4 segment registers. HV and QEMU order matches  */
429     assert(idx == WHvX64RegisterEs);
430     for (i = 0; i < 6; i += 1, idx += 1) {
431         vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
432     }
433 
434     assert(idx == WHvX64RegisterLdtr);
435     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
436 
437     assert(idx == WHvX64RegisterTr);
438     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
439 
440     assert(idx == WHvX64RegisterIdtr);
441     vcxt.values[idx].Table.Base = env->idt.base;
442     vcxt.values[idx].Table.Limit = env->idt.limit;
443     idx += 1;
444 
445     assert(idx == WHvX64RegisterGdtr);
446     vcxt.values[idx].Table.Base = env->gdt.base;
447     vcxt.values[idx].Table.Limit = env->gdt.limit;
448     idx += 1;
449 
450     /* CR0, 2, 3, 4, 8 */
451     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
452     vcxt.values[idx++].Reg64 = env->cr[0];
453     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
454     vcxt.values[idx++].Reg64 = env->cr[2];
455     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
456     vcxt.values[idx++].Reg64 = env->cr[3];
457     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
458     vcxt.values[idx++].Reg64 = env->cr[4];
459     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
460     vcxt.values[idx++].Reg64 = vcpu->tpr;
461 
462     /* 8 Debug Registers - Skipped */
463 
464     /*
465      * Extended control registers needs to be handled separately depending
466      * on whether xsave is supported/enabled or not.
467      */
468     whpx_set_xcrs(cpu);
469 
470     /* 16 XMM registers */
471     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
472     idx_next = idx + 16;
473     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
474         vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
475         vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
476     }
477     idx = idx_next;
478 
479     /* 8 FP registers */
480     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
481     for (i = 0; i < 8; i += 1, idx += 1) {
482         vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
483         /* vcxt.values[idx].Fp.AsUINT128.High64 =
484                env->fpregs[i].mmx.MMX_Q(1);
485         */
486     }
487 
488     /* FP control status register */
489     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
490     vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
491     vcxt.values[idx].FpControlStatus.FpStatus =
492         (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
493     vcxt.values[idx].FpControlStatus.FpTag = 0;
494     for (i = 0; i < 8; ++i) {
495         vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
496     }
497     vcxt.values[idx].FpControlStatus.Reserved = 0;
498     vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
499     vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
500     idx += 1;
501 
502     /* XMM control status register */
503     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
504     vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
505     vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
506     vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
507     idx += 1;
508 
509     /* MSRs */
510     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
511     vcxt.values[idx++].Reg64 = env->efer;
512 #ifdef TARGET_X86_64
513     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
514     vcxt.values[idx++].Reg64 = env->kernelgsbase;
515 #endif
516 
517     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
518     vcxt.values[idx++].Reg64 = vcpu->apic_base;
519 
520     /* WHvX64RegisterPat - Skipped */
521 
522     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
523     vcxt.values[idx++].Reg64 = env->sysenter_cs;
524     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
525     vcxt.values[idx++].Reg64 = env->sysenter_eip;
526     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
527     vcxt.values[idx++].Reg64 = env->sysenter_esp;
528     assert(whpx_register_names[idx] == WHvX64RegisterStar);
529     vcxt.values[idx++].Reg64 = env->star;
530 #ifdef TARGET_X86_64
531     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
532     vcxt.values[idx++].Reg64 = env->lstar;
533     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
534     vcxt.values[idx++].Reg64 = env->cstar;
535     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
536     vcxt.values[idx++].Reg64 = env->fmask;
537 #endif
538 
539     /* Interrupt / Event Registers - Skipped */
540 
541     assert(idx == RTL_NUMBER_OF(whpx_register_names));
542 
543     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
544         whpx->partition, cpu->cpu_index,
545         whpx_register_names,
546         RTL_NUMBER_OF(whpx_register_names),
547         &vcxt.values[0]);
548 
549     if (FAILED(hr)) {
550         error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
551                      hr);
552     }
553 
554     return;
555 }
556 
557 static int whpx_get_tsc(CPUState *cpu)
558 {
559     CPUX86State *env = cpu->env_ptr;
560     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
561     WHV_REGISTER_VALUE tsc_val;
562     HRESULT hr;
563     struct whpx_state *whpx = &whpx_global;
564 
565     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
566         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
567     if (FAILED(hr)) {
568         error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
569         return -1;
570     }
571 
572     env->tsc = tsc_val.Reg64;
573     return 0;
574 }
575 
576 /* X64 Extended Control Registers */
577 static void whpx_get_xcrs(CPUState *cpu)
578 {
579     CPUX86State *env = cpu->env_ptr;
580     HRESULT hr;
581     struct whpx_state *whpx = &whpx_global;
582     WHV_REGISTER_VALUE xcr0;
583     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
584 
585     if (!whpx_has_xsave()) {
586         return;
587     }
588 
589     /* Only xcr0 is supported by the hypervisor currently */
590     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
591         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
592     if (FAILED(hr)) {
593         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
594         return;
595     }
596 
597     env->xcr0 = xcr0.Reg64;
598 }
599 
600 static void whpx_get_registers(CPUState *cpu)
601 {
602     struct whpx_state *whpx = &whpx_global;
603     AccelCPUState *vcpu = cpu->accel;
604     CPUX86State *env = cpu->env_ptr;
605     X86CPU *x86_cpu = X86_CPU(cpu);
606     struct whpx_register_set vcxt;
607     uint64_t tpr, apic_base;
608     HRESULT hr;
609     int idx;
610     int idx_next;
611     int i;
612 
613     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
614 
615     if (!env->tsc_valid) {
616         whpx_get_tsc(cpu);
617         env->tsc_valid = !runstate_is_running();
618     }
619 
620     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
621         whpx->partition, cpu->cpu_index,
622         whpx_register_names,
623         RTL_NUMBER_OF(whpx_register_names),
624         &vcxt.values[0]);
625     if (FAILED(hr)) {
626         error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
627                      hr);
628     }
629 
630     if (whpx_apic_in_platform()) {
631         /*
632          * Fetch the TPR value from the emulated APIC. It may get overwritten
633          * below with the value from CR8 returned by
634          * WHvGetVirtualProcessorRegisters().
635          */
636         whpx_apic_get(x86_cpu->apic_state);
637         vcpu->tpr = whpx_apic_tpr_to_cr8(
638             cpu_get_apic_tpr(x86_cpu->apic_state));
639     }
640 
641     idx = 0;
642 
643     /* Indexes for first 16 registers match between HV and QEMU definitions */
644     idx_next = 16;
645     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
646         env->regs[idx] = vcxt.values[idx].Reg64;
647     }
648     idx = idx_next;
649 
650     /* Same goes for RIP and RFLAGS */
651     assert(whpx_register_names[idx] == WHvX64RegisterRip);
652     env->eip = vcxt.values[idx++].Reg64;
653     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
654     env->eflags = vcxt.values[idx++].Reg64;
655 
656     /* Translate 6+4 segment registers. HV and QEMU order matches  */
657     assert(idx == WHvX64RegisterEs);
658     for (i = 0; i < 6; i += 1, idx += 1) {
659         env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
660     }
661 
662     assert(idx == WHvX64RegisterLdtr);
663     env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
664     assert(idx == WHvX64RegisterTr);
665     env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
666     assert(idx == WHvX64RegisterIdtr);
667     env->idt.base = vcxt.values[idx].Table.Base;
668     env->idt.limit = vcxt.values[idx].Table.Limit;
669     idx += 1;
670     assert(idx == WHvX64RegisterGdtr);
671     env->gdt.base = vcxt.values[idx].Table.Base;
672     env->gdt.limit = vcxt.values[idx].Table.Limit;
673     idx += 1;
674 
675     /* CR0, 2, 3, 4, 8 */
676     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
677     env->cr[0] = vcxt.values[idx++].Reg64;
678     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
679     env->cr[2] = vcxt.values[idx++].Reg64;
680     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
681     env->cr[3] = vcxt.values[idx++].Reg64;
682     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
683     env->cr[4] = vcxt.values[idx++].Reg64;
684     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
685     tpr = vcxt.values[idx++].Reg64;
686     if (tpr != vcpu->tpr) {
687         vcpu->tpr = tpr;
688         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
689     }
690 
691     /* 8 Debug Registers - Skipped */
692 
693     /*
694      * Extended control registers needs to be handled separately depending
695      * on whether xsave is supported/enabled or not.
696      */
697     whpx_get_xcrs(cpu);
698 
699     /* 16 XMM registers */
700     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
701     idx_next = idx + 16;
702     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
703         env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
704         env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
705     }
706     idx = idx_next;
707 
708     /* 8 FP registers */
709     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
710     for (i = 0; i < 8; i += 1, idx += 1) {
711         env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
712         /* env->fpregs[i].mmx.MMX_Q(1) =
713                vcxt.values[idx].Fp.AsUINT128.High64;
714         */
715     }
716 
717     /* FP control status register */
718     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
719     env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
720     env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
721     env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
722     for (i = 0; i < 8; ++i) {
723         env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
724     }
725     env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
726     env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
727     idx += 1;
728 
729     /* XMM control status register */
730     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
731     env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
732     idx += 1;
733 
734     /* MSRs */
735     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
736     env->efer = vcxt.values[idx++].Reg64;
737 #ifdef TARGET_X86_64
738     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
739     env->kernelgsbase = vcxt.values[idx++].Reg64;
740 #endif
741 
742     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
743     apic_base = vcxt.values[idx++].Reg64;
744     if (apic_base != vcpu->apic_base) {
745         vcpu->apic_base = apic_base;
746         cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
747     }
748 
749     /* WHvX64RegisterPat - Skipped */
750 
751     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
752     env->sysenter_cs = vcxt.values[idx++].Reg64;
753     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
754     env->sysenter_eip = vcxt.values[idx++].Reg64;
755     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
756     env->sysenter_esp = vcxt.values[idx++].Reg64;
757     assert(whpx_register_names[idx] == WHvX64RegisterStar);
758     env->star = vcxt.values[idx++].Reg64;
759 #ifdef TARGET_X86_64
760     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
761     env->lstar = vcxt.values[idx++].Reg64;
762     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
763     env->cstar = vcxt.values[idx++].Reg64;
764     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
765     env->fmask = vcxt.values[idx++].Reg64;
766 #endif
767 
768     /* Interrupt / Event Registers - Skipped */
769 
770     assert(idx == RTL_NUMBER_OF(whpx_register_names));
771 
772     if (whpx_apic_in_platform()) {
773         whpx_apic_get(x86_cpu->apic_state);
774     }
775 
776     x86_update_hflags(env);
777 
778     return;
779 }
780 
781 static HRESULT CALLBACK whpx_emu_ioport_callback(
782     void *ctx,
783     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
784 {
785     MemTxAttrs attrs = { 0 };
786     address_space_rw(&address_space_io, IoAccess->Port, attrs,
787                      &IoAccess->Data, IoAccess->AccessSize,
788                      IoAccess->Direction);
789     return S_OK;
790 }
791 
792 static HRESULT CALLBACK whpx_emu_mmio_callback(
793     void *ctx,
794     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
795 {
796     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
797                            ma->Direction);
798     return S_OK;
799 }
800 
801 static HRESULT CALLBACK whpx_emu_getreg_callback(
802     void *ctx,
803     const WHV_REGISTER_NAME *RegisterNames,
804     UINT32 RegisterCount,
805     WHV_REGISTER_VALUE *RegisterValues)
806 {
807     HRESULT hr;
808     struct whpx_state *whpx = &whpx_global;
809     CPUState *cpu = (CPUState *)ctx;
810 
811     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
812         whpx->partition, cpu->cpu_index,
813         RegisterNames, RegisterCount,
814         RegisterValues);
815     if (FAILED(hr)) {
816         error_report("WHPX: Failed to get virtual processor registers,"
817                      " hr=%08lx", hr);
818     }
819 
820     return hr;
821 }
822 
823 static HRESULT CALLBACK whpx_emu_setreg_callback(
824     void *ctx,
825     const WHV_REGISTER_NAME *RegisterNames,
826     UINT32 RegisterCount,
827     const WHV_REGISTER_VALUE *RegisterValues)
828 {
829     HRESULT hr;
830     struct whpx_state *whpx = &whpx_global;
831     CPUState *cpu = (CPUState *)ctx;
832 
833     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
834         whpx->partition, cpu->cpu_index,
835         RegisterNames, RegisterCount,
836         RegisterValues);
837     if (FAILED(hr)) {
838         error_report("WHPX: Failed to set virtual processor registers,"
839                      " hr=%08lx", hr);
840     }
841 
842     /*
843      * The emulator just successfully wrote the register state. We clear the
844      * dirty state so we avoid the double write on resume of the VP.
845      */
846     cpu->vcpu_dirty = false;
847 
848     return hr;
849 }
850 
851 static HRESULT CALLBACK whpx_emu_translate_callback(
852     void *ctx,
853     WHV_GUEST_VIRTUAL_ADDRESS Gva,
854     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
855     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
856     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
857 {
858     HRESULT hr;
859     struct whpx_state *whpx = &whpx_global;
860     CPUState *cpu = (CPUState *)ctx;
861     WHV_TRANSLATE_GVA_RESULT res;
862 
863     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
864                                       Gva, TranslateFlags, &res, Gpa);
865     if (FAILED(hr)) {
866         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
867     } else {
868         *TranslationResult = res.ResultCode;
869     }
870 
871     return hr;
872 }
873 
/*
 * Callback table tying the instruction emulator's I/O, memory, register
 * and address-translation hooks to the whpx_emu_*_callback() functions
 * in this file.
 */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
882 
883 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
884 {
885     HRESULT hr;
886     AccelCPUState *vcpu = cpu->accel;
887     WHV_EMULATOR_STATUS emu_status;
888 
889     hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
890         vcpu->emulator, cpu,
891         &vcpu->exit_ctx.VpContext, ctx,
892         &emu_status);
893     if (FAILED(hr)) {
894         error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
895         return -1;
896     }
897 
898     if (!emu_status.EmulationSuccessful) {
899         error_report("WHPX: Failed to emulate MMIO access with"
900                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
901         return -1;
902     }
903 
904     return 0;
905 }
906 
907 static int whpx_handle_portio(CPUState *cpu,
908                               WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
909 {
910     HRESULT hr;
911     AccelCPUState *vcpu = cpu->accel;
912     WHV_EMULATOR_STATUS emu_status;
913 
914     hr = whp_dispatch.WHvEmulatorTryIoEmulation(
915         vcpu->emulator, cpu,
916         &vcpu->exit_ctx.VpContext, ctx,
917         &emu_status);
918     if (FAILED(hr)) {
919         error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
920         return -1;
921     }
922 
923     if (!emu_status.EmulationSuccessful) {
924         error_report("WHPX: Failed to emulate PortIO access with"
925                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
926         return -1;
927     }
928 
929     return 0;
930 }
931 
932 /*
933  * Controls whether we should intercept various exceptions on the guest,
934  * namely breakpoint/single-step events.
935  *
936  * The 'exceptions' argument accepts a bitmask, e.g:
937  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
938  */
939 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
940 {
941     struct whpx_state *whpx = &whpx_global;
942     WHV_PARTITION_PROPERTY prop = { 0, };
943     HRESULT hr;
944 
945     if (exceptions == whpx->exception_exit_bitmap) {
946         return S_OK;
947     }
948 
949     prop.ExceptionExitBitmap = exceptions;
950 
951     hr = whp_dispatch.WHvSetPartitionProperty(
952         whpx->partition,
953         WHvPartitionPropertyCodeExceptionExitBitmap,
954         &prop,
955         sizeof(WHV_PARTITION_PROPERTY));
956 
957     if (SUCCEEDED(hr)) {
958         whpx->exception_exit_bitmap = exceptions;
959     }
960 
961     return hr;
962 }
963 
964 
965 /*
966  * This function is called before/after stepping over a single instruction.
967  * It will update the CPU registers to arm/disarm the instruction stepping
968  * accordingly.
969  */
970 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
971     bool set,
972     uint64_t *exit_context_rflags)
973 {
974     WHV_REGISTER_NAME reg_name;
975     WHV_REGISTER_VALUE reg_value;
976     HRESULT hr;
977     struct whpx_state *whpx = &whpx_global;
978 
979     /*
980      * If we are trying to step over a single instruction, we need to set the
981      * TF bit in rflags. Otherwise, clear it.
982      */
983     reg_name = WHvX64RegisterRflags;
984     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
985         whpx->partition,
986         cpu->cpu_index,
987         &reg_name,
988         1,
989         &reg_value);
990 
991     if (FAILED(hr)) {
992         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
993         return hr;
994     }
995 
996     if (exit_context_rflags) {
997         assert(*exit_context_rflags == reg_value.Reg64);
998     }
999 
1000     if (set) {
1001         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
1002         reg_value.Reg64 |= TF_MASK;
1003     } else {
1004         reg_value.Reg64 &= ~TF_MASK;
1005     }
1006 
1007     if (exit_context_rflags) {
1008         *exit_context_rflags = reg_value.Reg64;
1009     }
1010 
1011     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1012         whpx->partition,
1013         cpu->cpu_index,
1014         &reg_name,
1015         1,
1016         &reg_value);
1017 
1018     if (FAILED(hr)) {
1019         error_report("WHPX: Failed to set rflags,"
1020             " hr=%08lx",
1021             hr);
1022         return hr;
1023     }
1024 
1025     reg_name = WHvRegisterInterruptState;
1026     reg_value.Reg64 = 0;
1027 
1028     /* Suspend delivery of hardware interrupts during single-stepping. */
1029     reg_value.InterruptState.InterruptShadow = set != 0;
1030 
1031     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1032     whpx->partition,
1033         cpu->cpu_index,
1034         &reg_name,
1035         1,
1036         &reg_value);
1037 
1038     if (FAILED(hr)) {
1039         error_report("WHPX: Failed to set InterruptState,"
1040             " hr=%08lx",
1041             hr);
1042         return hr;
1043     }
1044 
1045     if (!set) {
1046         /*
1047          * We have just finished stepping over a single instruction,
1048          * and intercepted the INT1 generated by it.
1049          * We need to now hide the INT1 from the guest,
1050          * as it would not be expecting it.
1051          */
1052 
1053         reg_name = WHvX64RegisterPendingDebugException;
1054         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1055         whpx->partition,
1056             cpu->cpu_index,
1057             &reg_name,
1058             1,
1059             &reg_value);
1060 
1061         if (FAILED(hr)) {
1062             error_report("WHPX: Failed to get pending debug exceptions,"
1063                          "hr=%08lx", hr);
1064             return hr;
1065         }
1066 
1067         if (reg_value.PendingDebugException.SingleStep) {
1068             reg_value.PendingDebugException.SingleStep = 0;
1069 
1070             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1071                 whpx->partition,
1072                 cpu->cpu_index,
1073                 &reg_name,
1074                 1,
1075                 &reg_value);
1076 
1077             if (FAILED(hr)) {
1078                 error_report("WHPX: Failed to clear pending debug exceptions,"
1079                              "hr=%08lx", hr);
1080              return hr;
1081             }
1082         }
1083 
1084     }
1085 
1086     return S_OK;
1087 }
1088 
1089 /* Tries to find a breakpoint at the specified address. */
1090 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1091 {
1092     struct whpx_state *whpx = &whpx_global;
1093     int i;
1094 
1095     if (whpx->breakpoints.breakpoints) {
1096         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1097             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1098                 return &whpx->breakpoints.breakpoints->data[i];
1099             }
1100         }
1101     }
1102 
1103     return NULL;
1104 }
1105 
/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 *
 * This single byte is what whpx_apply_breakpoints() writes over the
 * original instruction when a breakpoint is inserted into guest memory.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;
1113 
1114 /*
1115  * The WHPX QEMU backend implements breakpoints by writing the INT1
1116  * instruction into memory (ignoring the DRx registers). This raises a few
1117  * issues that need to be carefully handled:
1118  *
1119  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1120  *    at the same location, and later remove them in arbitrary order.
1121  *    This should not cause memory corruption, and should only remove the
1122  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1123  *
1124  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1125  *    physical location. Hence, physically adding/removing a breakpoint can
1126  *    theoretically fail at any time. We need to keep track of it.
1127  *
1128  * The function below rebuilds a list of low-level breakpoints (one per
1129  * address, tracking the original instruction and any errors) from the list of
1130  * high-level breakpoints (set via cpu_breakpoint_insert()).
1131  *
1132  * In order to optimize performance, this function stores the list of
1133  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1134  * low-level ones, so that it won't be re-invoked until these breakpoints
1135  * change.
1136  *
1137  * Note that this function decides which breakpoints should be inserted into,
1138  * memory, but doesn't actually do it. The memory accessing is done in
1139  * whpx_apply_breakpoints().
1140  */
1141 static void whpx_translate_cpu_breakpoints(
1142     struct whpx_breakpoints *breakpoints,
1143     CPUState *cpu,
1144     int cpu_breakpoint_count)
1145 {
1146     CPUBreakpoint *bp;
1147     int cpu_bp_index = 0;
1148 
1149     breakpoints->original_addresses =
1150         g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
1151 
1152     breakpoints->original_address_count = cpu_breakpoint_count;
1153 
1154     int max_breakpoints = cpu_breakpoint_count +
1155         (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
1156 
1157     struct whpx_breakpoint_collection *new_breakpoints =
1158         g_malloc0(sizeof(struct whpx_breakpoint_collection)
1159                   + max_breakpoints * sizeof(struct whpx_breakpoint));
1160 
1161     new_breakpoints->allocated = max_breakpoints;
1162     new_breakpoints->used = 0;
1163 
1164     /*
1165      * 1. Preserve all old breakpoints that could not be automatically
1166      * cleared when the CPU got stopped.
1167      */
1168     if (breakpoints->breakpoints) {
1169         int i;
1170         for (i = 0; i < breakpoints->breakpoints->used; i++) {
1171             if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
1172                 new_breakpoints->data[new_breakpoints->used++] =
1173                     breakpoints->breakpoints->data[i];
1174             }
1175         }
1176     }
1177 
1178     /* 2. Map all CPU breakpoints to WHPX breakpoints */
1179     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1180         int i;
1181         bool found = false;
1182 
1183         /* This will be used to detect changed CPU breakpoints later. */
1184         breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
1185 
1186         for (i = 0; i < new_breakpoints->used; i++) {
1187             /*
1188              * WARNING: This loop has O(N^2) complexity, where N is the
1189              * number of breakpoints. It should not be a bottleneck in
1190              * real-world scenarios, since it only needs to run once after
1191              * the breakpoints have been modified.
1192              * If this ever becomes a concern, it can be optimized by storing
1193              * high-level breakpoint objects in a tree or hash map.
1194              */
1195 
1196             if (new_breakpoints->data[i].address == bp->pc) {
1197                 /* There was already a breakpoint at this address. */
1198                 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
1199                     new_breakpoints->data[i].state = WHPX_BP_SET;
1200                 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
1201                     new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
1202                 }
1203 
1204                 found = true;
1205                 break;
1206             }
1207         }
1208 
1209         if (!found && new_breakpoints->used < new_breakpoints->allocated) {
1210             /* No WHPX breakpoint at this address. Create one. */
1211             new_breakpoints->data[new_breakpoints->used].address = bp->pc;
1212             new_breakpoints->data[new_breakpoints->used].state =
1213                 WHPX_BP_SET_PENDING;
1214             new_breakpoints->used++;
1215         }
1216     }
1217 
1218     /*
1219      * Free the previous breakpoint list. This can be optimized by keeping
1220      * it as shadow buffer for the next computation instead of freeing
1221      * it immediately.
1222      */
1223     g_free(breakpoints->breakpoints);
1224 
1225     breakpoints->breakpoints = new_breakpoints;
1226 }
1227 
1228 /*
1229  * Physically inserts/removes the breakpoints by reading and writing the
1230  * physical memory, keeping a track of the failed attempts.
1231  *
1232  * Passing resuming=true  will try to set all previously unset breakpoints.
1233  * Passing resuming=false will remove all inserted ones.
1234  */
1235 static void whpx_apply_breakpoints(
1236     struct whpx_breakpoint_collection *breakpoints,
1237     CPUState *cpu,
1238     bool resuming)
1239 {
1240     int i, rc;
1241     if (!breakpoints) {
1242         return;
1243     }
1244 
1245     for (i = 0; i < breakpoints->used; i++) {
1246         /* Decide what to do right now based on the last known state. */
1247         WhpxBreakpointState state = breakpoints->data[i].state;
1248         switch (state) {
1249         case WHPX_BP_CLEARED:
1250             if (resuming) {
1251                 state = WHPX_BP_SET_PENDING;
1252             }
1253             break;
1254         case WHPX_BP_SET_PENDING:
1255             if (!resuming) {
1256                 state = WHPX_BP_CLEARED;
1257             }
1258             break;
1259         case WHPX_BP_SET:
1260             if (!resuming) {
1261                 state = WHPX_BP_CLEAR_PENDING;
1262             }
1263             break;
1264         case WHPX_BP_CLEAR_PENDING:
1265             if (resuming) {
1266                 state = WHPX_BP_SET;
1267             }
1268             break;
1269         }
1270 
1271         if (state == WHPX_BP_SET_PENDING) {
1272             /* Remember the original instruction. */
1273             rc = cpu_memory_rw_debug(cpu,
1274                 breakpoints->data[i].address,
1275                 &breakpoints->data[i].original_instruction,
1276                 1,
1277                 false);
1278 
1279             if (!rc) {
1280                 /* Write the breakpoint instruction. */
1281                 rc = cpu_memory_rw_debug(cpu,
1282                     breakpoints->data[i].address,
1283                     (void *)&whpx_breakpoint_instruction,
1284                     1,
1285                     true);
1286             }
1287 
1288             if (!rc) {
1289                 state = WHPX_BP_SET;
1290             }
1291 
1292         }
1293 
1294         if (state == WHPX_BP_CLEAR_PENDING) {
1295             /* Restore the original instruction. */
1296             rc = cpu_memory_rw_debug(cpu,
1297                 breakpoints->data[i].address,
1298                 &breakpoints->data[i].original_instruction,
1299                 1,
1300                 true);
1301 
1302             if (!rc) {
1303                 state = WHPX_BP_CLEARED;
1304             }
1305         }
1306 
1307         breakpoints->data[i].state = state;
1308     }
1309 }
1310 
1311 /*
1312  * This function is called when the a VCPU is about to start and no other
1313  * VCPUs have been started so far. Since the VCPU start order could be
1314  * arbitrary, it doesn't have to be VCPU#0.
1315  *
1316  * It is used to commit the breakpoints into memory, and configure WHPX
1317  * to intercept debug exceptions.
1318  *
1319  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1320  * more VCPUs are already running, so this is the best place to do it.
1321  */
1322 static int whpx_first_vcpu_starting(CPUState *cpu)
1323 {
1324     struct whpx_state *whpx = &whpx_global;
1325     HRESULT hr;
1326 
1327     g_assert(qemu_mutex_iothread_locked());
1328 
1329     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1330             (whpx->breakpoints.breakpoints &&
1331              whpx->breakpoints.breakpoints->used)) {
1332         CPUBreakpoint *bp;
1333         int i = 0;
1334         bool update_pending = false;
1335 
1336         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1337             if (i >= whpx->breakpoints.original_address_count ||
1338                 bp->pc != whpx->breakpoints.original_addresses[i]) {
1339                 update_pending = true;
1340             }
1341 
1342             i++;
1343         }
1344 
1345         if (i != whpx->breakpoints.original_address_count) {
1346             update_pending = true;
1347         }
1348 
1349         if (update_pending) {
1350             /*
1351              * The CPU breakpoints have changed since the last call to
1352              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1353              * now be recomputed.
1354              */
1355             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1356         }
1357 
1358         /* Actually insert the breakpoints into the memory. */
1359         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1360     }
1361 
1362     uint64_t exception_mask;
1363     if (whpx->step_pending ||
1364         (whpx->breakpoints.breakpoints &&
1365          whpx->breakpoints.breakpoints->used)) {
1366         /*
1367          * We are either attempting to single-step one or more CPUs, or
1368          * have one or more breakpoints enabled. Both require intercepting
1369          * the WHvX64ExceptionTypeBreakpointTrap exception.
1370          */
1371 
1372         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1373     } else {
1374         /* Let the guest handle all exceptions. */
1375         exception_mask = 0;
1376     }
1377 
1378     hr = whpx_set_exception_exit_bitmap(exception_mask);
1379     if (!SUCCEEDED(hr)) {
1380         error_report("WHPX: Failed to update exception exit mask,"
1381                      "hr=%08lx.", hr);
1382         return 1;
1383     }
1384 
1385     return 0;
1386 }
1387 
1388 /*
1389  * This function is called when the last VCPU has finished running.
1390  * It is used to remove any previously set breakpoints from memory.
1391  */
1392 static int whpx_last_vcpu_stopping(CPUState *cpu)
1393 {
1394     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1395     return 0;
1396 }
1397 
1398 /* Returns the address of the next instruction that is about to be executed. */
1399 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1400 {
1401     if (cpu->vcpu_dirty) {
1402         /* The CPU registers have been modified by other parts of QEMU. */
1403         CPUArchState *env = (CPUArchState *)(cpu->env_ptr);
1404         return env->eip;
1405     } else if (exit_context_valid) {
1406         /*
1407          * The CPU registers have not been modified by neither other parts
1408          * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1409          * This is the most common case.
1410          */
1411         AccelCPUState *vcpu = cpu->accel;
1412         return vcpu->exit_ctx.VpContext.Rip;
1413     } else {
1414         /*
1415          * The CPU registers have been modified by a call to
1416          * WHvSetVirtualProcessorRegisters() and must be re-queried from
1417          * the target.
1418          */
1419         WHV_REGISTER_VALUE reg_value;
1420         WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1421         HRESULT hr;
1422         struct whpx_state *whpx = &whpx_global;
1423 
1424         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1425             whpx->partition,
1426             cpu->cpu_index,
1427             &reg_name,
1428             1,
1429             &reg_value);
1430 
1431         if (FAILED(hr)) {
1432             error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1433             return 0;
1434         }
1435 
1436         return reg_value.Reg64;
1437     }
1438 }
1439 
1440 static int whpx_handle_halt(CPUState *cpu)
1441 {
1442     CPUX86State *env = cpu->env_ptr;
1443     int ret = 0;
1444 
1445     qemu_mutex_lock_iothread();
1446     if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1447           (env->eflags & IF_MASK)) &&
1448         !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1449         cpu->exception_index = EXCP_HLT;
1450         cpu->halted = true;
1451         ret = 1;
1452     }
1453     qemu_mutex_unlock_iothread();
1454 
1455     return ret;
1456 }
1457 
1458 static void whpx_vcpu_pre_run(CPUState *cpu)
1459 {
1460     HRESULT hr;
1461     struct whpx_state *whpx = &whpx_global;
1462     AccelCPUState *vcpu = cpu->accel;
1463     CPUX86State *env = cpu->env_ptr;
1464     X86CPU *x86_cpu = X86_CPU(cpu);
1465     int irq;
1466     uint8_t tpr;
1467     WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
1468     UINT32 reg_count = 0;
1469     WHV_REGISTER_VALUE reg_values[3];
1470     WHV_REGISTER_NAME reg_names[3];
1471 
1472     memset(&new_int, 0, sizeof(new_int));
1473     memset(reg_values, 0, sizeof(reg_values));
1474 
1475     qemu_mutex_lock_iothread();
1476 
1477     /* Inject NMI */
1478     if (!vcpu->interruption_pending &&
1479         cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
1480         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
1481             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
1482             vcpu->interruptable = false;
1483             new_int.InterruptionType = WHvX64PendingNmi;
1484             new_int.InterruptionPending = 1;
1485             new_int.InterruptionVector = 2;
1486         }
1487         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
1488             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
1489         }
1490     }
1491 
1492     /*
1493      * Force the VCPU out of its inner loop to process any INIT requests or
1494      * commit pending TPR access.
1495      */
1496     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
1497         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1498             !(env->hflags & HF_SMM_MASK)) {
1499             cpu->exit_request = 1;
1500         }
1501         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1502             cpu->exit_request = 1;
1503         }
1504     }
1505 
1506     /* Get pending hard interruption or replay one that was overwritten */
1507     if (!whpx_apic_in_platform()) {
1508         if (!vcpu->interruption_pending &&
1509             vcpu->interruptable && (env->eflags & IF_MASK)) {
1510             assert(!new_int.InterruptionPending);
1511             if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1512                 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1513                 irq = cpu_get_pic_interrupt(env);
1514                 if (irq >= 0) {
1515                     new_int.InterruptionType = WHvX64PendingInterrupt;
1516                     new_int.InterruptionPending = 1;
1517                     new_int.InterruptionVector = irq;
1518                 }
1519             }
1520         }
1521 
1522         /* Setup interrupt state if new one was prepared */
1523         if (new_int.InterruptionPending) {
1524             reg_values[reg_count].PendingInterruption = new_int;
1525             reg_names[reg_count] = WHvRegisterPendingInterruption;
1526             reg_count += 1;
1527         }
1528     } else if (vcpu->ready_for_pic_interrupt &&
1529                (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
1530         cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1531         irq = cpu_get_pic_interrupt(env);
1532         if (irq >= 0) {
1533             reg_names[reg_count] = WHvRegisterPendingEvent;
1534             reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
1535             {
1536                 .EventPending = 1,
1537                 .EventType = WHvX64PendingEventExtInt,
1538                 .Vector = irq,
1539             };
1540             reg_count += 1;
1541         }
1542      }
1543 
1544     /* Sync the TPR to the CR8 if was modified during the intercept */
1545     tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
1546     if (tpr != vcpu->tpr) {
1547         vcpu->tpr = tpr;
1548         reg_values[reg_count].Reg64 = tpr;
1549         cpu->exit_request = 1;
1550         reg_names[reg_count] = WHvX64RegisterCr8;
1551         reg_count += 1;
1552     }
1553 
1554     /* Update the state of the interrupt delivery notification */
1555     if (!vcpu->window_registered &&
1556         cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1557         reg_values[reg_count].DeliverabilityNotifications =
1558             (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
1559                 .InterruptNotification = 1
1560             };
1561         vcpu->window_registered = 1;
1562         reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
1563         reg_count += 1;
1564     }
1565 
1566     qemu_mutex_unlock_iothread();
1567     vcpu->ready_for_pic_interrupt = false;
1568 
1569     if (reg_count) {
1570         hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1571             whpx->partition, cpu->cpu_index,
1572             reg_names, reg_count, reg_values);
1573         if (FAILED(hr)) {
1574             error_report("WHPX: Failed to set interrupt state registers,"
1575                          " hr=%08lx", hr);
1576         }
1577     }
1578 
1579     return;
1580 }
1581 
1582 static void whpx_vcpu_post_run(CPUState *cpu)
1583 {
1584     AccelCPUState *vcpu = cpu->accel;
1585     CPUX86State *env = cpu->env_ptr;
1586     X86CPU *x86_cpu = X86_CPU(cpu);
1587 
1588     env->eflags = vcpu->exit_ctx.VpContext.Rflags;
1589 
1590     uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
1591     if (vcpu->tpr != tpr) {
1592         vcpu->tpr = tpr;
1593         qemu_mutex_lock_iothread();
1594         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
1595         qemu_mutex_unlock_iothread();
1596     }
1597 
1598     vcpu->interruption_pending =
1599         vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
1600 
1601     vcpu->interruptable =
1602         !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
1603 
1604     return;
1605 }
1606 
1607 static void whpx_vcpu_process_async_events(CPUState *cpu)
1608 {
1609     CPUX86State *env = cpu->env_ptr;
1610     X86CPU *x86_cpu = X86_CPU(cpu);
1611     AccelCPUState *vcpu = cpu->accel;
1612 
1613     if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1614         !(env->hflags & HF_SMM_MASK)) {
1615         whpx_cpu_synchronize_state(cpu);
1616         do_cpu_init(x86_cpu);
1617         vcpu->interruptable = true;
1618     }
1619 
1620     if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
1621         cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
1622         apic_poll_irq(x86_cpu->apic_state);
1623     }
1624 
1625     if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1626          (env->eflags & IF_MASK)) ||
1627         (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1628         cpu->halted = false;
1629     }
1630 
1631     if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
1632         whpx_cpu_synchronize_state(cpu);
1633         do_cpu_sipi(x86_cpu);
1634     }
1635 
1636     if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1637         cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
1638         whpx_cpu_synchronize_state(cpu);
1639         apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
1640                                       env->tpr_access_type);
1641     }
1642 
1643     return;
1644 }
1645 
1646 static int whpx_vcpu_run(CPUState *cpu)
1647 {
1648     HRESULT hr;
1649     struct whpx_state *whpx = &whpx_global;
1650     AccelCPUState *vcpu = cpu->accel;
1651     struct whpx_breakpoint *stepped_over_bp = NULL;
1652     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1653     int ret;
1654 
1655     g_assert(qemu_mutex_iothread_locked());
1656 
1657     if (whpx->running_cpus++ == 0) {
1658         /* Insert breakpoints into memory, update exception exit bitmap. */
1659         ret = whpx_first_vcpu_starting(cpu);
1660         if (ret != 0) {
1661             return ret;
1662         }
1663     }
1664 
1665     if (whpx->breakpoints.breakpoints &&
1666         whpx->breakpoints.breakpoints->used > 0)
1667     {
1668         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1669         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1670         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1671             stepped_over_bp = NULL;
1672         }
1673 
1674         if (stepped_over_bp) {
1675             /*
1676              * We are trying to run the instruction overwritten by an active
1677              * breakpoint. We will temporarily disable the breakpoint, suspend
1678              * other CPUs, and step over the instruction.
1679              */
1680             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1681         }
1682     }
1683 
1684     if (exclusive_step_mode == WHPX_STEP_NONE) {
1685         whpx_vcpu_process_async_events(cpu);
1686         if (cpu->halted && !whpx_apic_in_platform()) {
1687             cpu->exception_index = EXCP_HLT;
1688             qatomic_set(&cpu->exit_request, false);
1689             return 0;
1690         }
1691     }
1692 
1693     qemu_mutex_unlock_iothread();
1694 
1695     if (exclusive_step_mode != WHPX_STEP_NONE) {
1696         start_exclusive();
1697         g_assert(cpu == current_cpu);
1698         g_assert(!cpu->running);
1699         cpu->running = true;
1700 
1701         hr = whpx_set_exception_exit_bitmap(
1702             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1703         if (!SUCCEEDED(hr)) {
1704             error_report("WHPX: Failed to update exception exit mask, "
1705                          "hr=%08lx.", hr);
1706             return 1;
1707         }
1708 
1709         if (stepped_over_bp) {
1710             /* Temporarily disable the triggered breakpoint. */
1711             cpu_memory_rw_debug(cpu,
1712                 stepped_over_bp->address,
1713                 &stepped_over_bp->original_instruction,
1714                 1,
1715                 true);
1716         }
1717     } else {
1718         cpu_exec_start(cpu);
1719     }
1720 
1721     do {
1722         if (cpu->vcpu_dirty) {
1723             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1724             cpu->vcpu_dirty = false;
1725         }
1726 
1727         if (exclusive_step_mode == WHPX_STEP_NONE) {
1728             whpx_vcpu_pre_run(cpu);
1729 
1730             if (qatomic_read(&cpu->exit_request)) {
1731                 whpx_vcpu_kick(cpu);
1732             }
1733         }
1734 
1735         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1736             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1737         }
1738 
1739         hr = whp_dispatch.WHvRunVirtualProcessor(
1740             whpx->partition, cpu->cpu_index,
1741             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1742 
1743         if (FAILED(hr)) {
1744             error_report("WHPX: Failed to exec a virtual processor,"
1745                          " hr=%08lx", hr);
1746             ret = -1;
1747             break;
1748         }
1749 
1750         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1751             whpx_vcpu_configure_single_stepping(cpu,
1752                 false,
1753                 &vcpu->exit_ctx.VpContext.Rflags);
1754         }
1755 
1756         whpx_vcpu_post_run(cpu);
1757 
1758         switch (vcpu->exit_ctx.ExitReason) {
1759         case WHvRunVpExitReasonMemoryAccess:
1760             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1761             break;
1762 
1763         case WHvRunVpExitReasonX64IoPortAccess:
1764             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1765             break;
1766 
1767         case WHvRunVpExitReasonX64InterruptWindow:
1768             vcpu->ready_for_pic_interrupt = 1;
1769             vcpu->window_registered = 0;
1770             ret = 0;
1771             break;
1772 
1773         case WHvRunVpExitReasonX64ApicEoi:
1774             assert(whpx_apic_in_platform());
1775             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1776             break;
1777 
1778         case WHvRunVpExitReasonX64Halt:
1779             /*
1780              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1781              * longer used.
1782              */
1783             ret = whpx_handle_halt(cpu);
1784             break;
1785 
1786         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1787             WHV_INTERRUPT_CONTROL ipi = {0};
1788             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1789             uint32_t delivery_mode =
1790                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1791             int dest_shorthand =
1792                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1793             bool broadcast = false;
1794             bool include_self = false;
1795             uint32_t i;
1796 
1797             /* We only registered for INIT and SIPI exits. */
1798             if ((delivery_mode != APIC_DM_INIT) &&
1799                 (delivery_mode != APIC_DM_SIPI)) {
1800                 error_report(
1801                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1802                 break;
1803             }
1804 
1805             if (delivery_mode == APIC_DM_INIT) {
1806                 ipi.Type = WHvX64InterruptTypeInit;
1807             } else {
1808                 ipi.Type = WHvX64InterruptTypeSipi;
1809             }
1810 
1811             ipi.DestinationMode =
1812                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1813                     WHvX64InterruptDestinationModeLogical :
1814                     WHvX64InterruptDestinationModePhysical;
1815 
1816             ipi.TriggerMode =
1817                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1818                     WHvX64InterruptTriggerModeLevel :
1819                     WHvX64InterruptTriggerModeEdge;
1820 
1821             ipi.Vector = icr & APIC_VECTOR_MASK;
1822             switch (dest_shorthand) {
1823             /* no shorthand. Bits 56-63 contain the destination. */
1824             case 0:
1825                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1826                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1827                         &ipi, sizeof(ipi));
1828                 if (FAILED(hr)) {
1829                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1830                         hr);
1831                 }
1832 
1833                 break;
1834 
1835             /* self */
1836             case 1:
1837                 include_self = true;
1838                 break;
1839 
1840             /* broadcast, including self */
1841             case 2:
1842                 broadcast = true;
1843                 include_self = true;
1844                 break;
1845 
1846             /* broadcast, excluding self */
1847             case 3:
1848                 broadcast = true;
1849                 break;
1850             }
1851 
1852             if (!broadcast && !include_self) {
1853                 break;
1854             }
1855 
1856             for (i = 0; i <= max_vcpu_index; i++) {
1857                 if (i == cpu->cpu_index && !include_self) {
1858                     continue;
1859                 }
1860 
1861                 /*
1862                  * Assuming that APIC Ids are identity mapped since
1863                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1864                  * are not handled yet and the hypervisor doesn't allow the
1865                  * guest to modify the APIC ID.
1866                  */
1867                 ipi.Destination = i;
1868                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1869                         &ipi, sizeof(ipi));
1870                 if (FAILED(hr)) {
1871                     error_report(
1872                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1873                         i, hr);
1874                 }
1875             }
1876 
1877             break;
1878         }
1879 
1880         case WHvRunVpExitReasonCanceled:
1881             if (exclusive_step_mode != WHPX_STEP_NONE) {
1882                 /*
1883                  * We are trying to step over a single instruction, and
1884                  * likely got a request to stop from another thread.
1885                  * Delay it until we are done stepping
1886                  * over.
1887                  */
1888                 ret = 0;
1889             } else {
1890                 cpu->exception_index = EXCP_INTERRUPT;
1891                 ret = 1;
1892             }
1893             break;
1894         case WHvRunVpExitReasonX64MsrAccess: {
1895             WHV_REGISTER_VALUE reg_values[3] = {0};
1896             WHV_REGISTER_NAME reg_names[3];
1897             UINT32 reg_count;
1898 
1899             reg_names[0] = WHvX64RegisterRip;
1900             reg_names[1] = WHvX64RegisterRax;
1901             reg_names[2] = WHvX64RegisterRdx;
1902 
1903             reg_values[0].Reg64 =
1904                 vcpu->exit_ctx.VpContext.Rip +
1905                 vcpu->exit_ctx.VpContext.InstructionLength;
1906 
1907             /*
1908              * For all unsupported MSR access we:
1909              *     ignore writes
1910              *     return 0 on read.
1911              */
1912             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1913                         1 : 3;
1914 
1915             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1916                 whpx->partition,
1917                 cpu->cpu_index,
1918                 reg_names, reg_count,
1919                 reg_values);
1920 
1921             if (FAILED(hr)) {
1922                 error_report("WHPX: Failed to set MsrAccess state "
1923                              " registers, hr=%08lx", hr);
1924             }
1925             ret = 0;
1926             break;
1927         }
1928         case WHvRunVpExitReasonX64Cpuid: {
1929             WHV_REGISTER_VALUE reg_values[5];
1930             WHV_REGISTER_NAME reg_names[5];
1931             UINT32 reg_count = 5;
1932             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1933             X86CPU *x86_cpu = X86_CPU(cpu);
1934             CPUX86State *env = &x86_cpu->env;
1935 
1936             memset(reg_values, 0, sizeof(reg_values));
1937 
1938             rip = vcpu->exit_ctx.VpContext.Rip +
1939                   vcpu->exit_ctx.VpContext.InstructionLength;
1940             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1941 
1942             /*
1943              * Ideally, these should be supplied to the hypervisor during VCPU
1944              * initialization and it should be able to satisfy this request.
1945              * But, currently, WHPX doesn't support setting CPUID values in the
1946              * hypervisor once the partition has been setup, which is too late
1947              * since VCPUs are realized later. For now, use the values from
1948              * QEMU to satisfy these requests, until WHPX adds support for
1949              * being able to set these values in the hypervisor at runtime.
1950              */
1951             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1952                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1953             switch (cpuid_fn) {
1954             case 0x40000000:
1955                 /* Expose the vmware cpu frequency cpuid leaf */
1956                 rax = 0x40000010;
1957                 rbx = rcx = rdx = 0;
1958                 break;
1959 
1960             case 0x40000010:
1961                 rax = env->tsc_khz;
1962                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1963                 rcx = rdx = 0;
1964                 break;
1965 
1966             case 0x80000001:
1967                 /* Remove any support of OSVW */
1968                 rcx &= ~CPUID_EXT3_OSVW;
1969                 break;
1970             }
1971 
1972             reg_names[0] = WHvX64RegisterRip;
1973             reg_names[1] = WHvX64RegisterRax;
1974             reg_names[2] = WHvX64RegisterRcx;
1975             reg_names[3] = WHvX64RegisterRdx;
1976             reg_names[4] = WHvX64RegisterRbx;
1977 
1978             reg_values[0].Reg64 = rip;
1979             reg_values[1].Reg64 = rax;
1980             reg_values[2].Reg64 = rcx;
1981             reg_values[3].Reg64 = rdx;
1982             reg_values[4].Reg64 = rbx;
1983 
1984             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1985                 whpx->partition, cpu->cpu_index,
1986                 reg_names,
1987                 reg_count,
1988                 reg_values);
1989 
1990             if (FAILED(hr)) {
1991                 error_report("WHPX: Failed to set CpuidAccess state registers,"
1992                              " hr=%08lx", hr);
1993             }
1994             ret = 0;
1995             break;
1996         }
1997         case WHvRunVpExitReasonException:
1998             whpx_get_registers(cpu);
1999 
2000             if ((vcpu->exit_ctx.VpException.ExceptionType ==
2001                  WHvX64ExceptionTypeDebugTrapOrFault) &&
2002                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
2003                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
2004                  whpx_breakpoint_instruction)) {
2005                 /* Stopped at a software breakpoint. */
2006                 cpu->exception_index = EXCP_DEBUG;
2007             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
2008                         WHvX64ExceptionTypeDebugTrapOrFault) &&
2009                        !cpu->singlestep_enabled) {
2010                 /*
2011                  * Just finished stepping over a breakpoint, but the
2012                  * gdb does not expect us to do single-stepping.
2013                  * Don't do anything special.
2014                  */
2015                 cpu->exception_index = EXCP_INTERRUPT;
2016             } else {
2017                 /* Another exception or debug event. Report it to GDB. */
2018                 cpu->exception_index = EXCP_DEBUG;
2019             }
2020 
2021             ret = 1;
2022             break;
2023         case WHvRunVpExitReasonNone:
2024         case WHvRunVpExitReasonUnrecoverableException:
2025         case WHvRunVpExitReasonInvalidVpRegisterValue:
2026         case WHvRunVpExitReasonUnsupportedFeature:
2027         default:
2028             error_report("WHPX: Unexpected VP exit code %d",
2029                          vcpu->exit_ctx.ExitReason);
2030             whpx_get_registers(cpu);
2031             qemu_mutex_lock_iothread();
2032             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2033             qemu_mutex_unlock_iothread();
2034             break;
2035         }
2036 
2037     } while (!ret);
2038 
2039     if (stepped_over_bp) {
2040         /* Restore the breakpoint we stepped over */
2041         cpu_memory_rw_debug(cpu,
2042             stepped_over_bp->address,
2043             (void *)&whpx_breakpoint_instruction,
2044             1,
2045             true);
2046     }
2047 
2048     if (exclusive_step_mode != WHPX_STEP_NONE) {
2049         g_assert(cpu_in_exclusive_context(cpu));
2050         cpu->running = false;
2051         end_exclusive();
2052 
2053         exclusive_step_mode = WHPX_STEP_NONE;
2054     } else {
2055         cpu_exec_end(cpu);
2056     }
2057 
2058     qemu_mutex_lock_iothread();
2059     current_cpu = cpu;
2060 
2061     if (--whpx->running_cpus == 0) {
2062         whpx_last_vcpu_stopping(cpu);
2063     }
2064 
2065     qatomic_set(&cpu->exit_request, false);
2066 
2067     return ret < 0;
2068 }
2069 
2070 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2071 {
2072     if (!cpu->vcpu_dirty) {
2073         whpx_get_registers(cpu);
2074         cpu->vcpu_dirty = true;
2075     }
2076 }
2077 
2078 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
2079                                                run_on_cpu_data arg)
2080 {
2081     whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
2082     cpu->vcpu_dirty = false;
2083 }
2084 
2085 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
2086                                               run_on_cpu_data arg)
2087 {
2088     whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
2089     cpu->vcpu_dirty = false;
2090 }
2091 
2092 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
2093                                                run_on_cpu_data arg)
2094 {
2095     cpu->vcpu_dirty = true;
2096 }
2097 
2098 /*
2099  * CPU support.
2100  */
2101 
2102 void whpx_cpu_synchronize_state(CPUState *cpu)
2103 {
2104     if (!cpu->vcpu_dirty) {
2105         run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
2106     }
2107 }
2108 
2109 void whpx_cpu_synchronize_post_reset(CPUState *cpu)
2110 {
2111     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2112 }
2113 
2114 void whpx_cpu_synchronize_post_init(CPUState *cpu)
2115 {
2116     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2117 }
2118 
2119 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
2120 {
2121     run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2122 }
2123 
2124 void whpx_cpu_synchronize_pre_resume(bool step_pending)
2125 {
2126     whpx_global.step_pending = step_pending;
2127 }
2128 
2129 /*
2130  * Vcpu support.
2131  */
2132 
2133 static Error *whpx_migration_blocker;
2134 
2135 static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
2136 {
2137     CPUX86State *env = opaque;
2138 
2139     if (running) {
2140         env->tsc_valid = false;
2141     }
2142 }
2143 
2144 int whpx_init_vcpu(CPUState *cpu)
2145 {
2146     HRESULT hr;
2147     struct whpx_state *whpx = &whpx_global;
2148     AccelCPUState *vcpu = NULL;
2149     Error *local_error = NULL;
2150     CPUX86State *env = cpu->env_ptr;
2151     X86CPU *x86_cpu = X86_CPU(cpu);
2152     UINT64 freq = 0;
2153     int ret;
2154 
2155     /* Add migration blockers for all unsupported features of the
2156      * Windows Hypervisor Platform
2157      */
2158     if (whpx_migration_blocker == NULL) {
2159         error_setg(&whpx_migration_blocker,
2160                "State blocked due to non-migratable CPUID feature support,"
2161                "dirty memory tracking support, and XSAVE/XRSTOR support");
2162 
2163         if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) {
2164             error_report_err(local_error);
2165             error_free(whpx_migration_blocker);
2166             ret = -EINVAL;
2167             goto error;
2168         }
2169     }
2170 
2171     vcpu = g_new0(AccelCPUState, 1);
2172 
2173     hr = whp_dispatch.WHvEmulatorCreateEmulator(
2174         &whpx_emu_callbacks,
2175         &vcpu->emulator);
2176     if (FAILED(hr)) {
2177         error_report("WHPX: Failed to setup instruction completion support,"
2178                      " hr=%08lx", hr);
2179         ret = -EINVAL;
2180         goto error;
2181     }
2182 
2183     hr = whp_dispatch.WHvCreateVirtualProcessor(
2184         whpx->partition, cpu->cpu_index, 0);
2185     if (FAILED(hr)) {
2186         error_report("WHPX: Failed to create a virtual processor,"
2187                      " hr=%08lx", hr);
2188         whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2189         ret = -EINVAL;
2190         goto error;
2191     }
2192 
2193     /*
2194      * vcpu's TSC frequency is either specified by user, or use the value
2195      * provided by Hyper-V if the former is not present. In the latter case, we
2196      * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
2197      * frequency can be migrated later via this field.
2198      */
2199     if (!env->tsc_khz) {
2200         hr = whp_dispatch.WHvGetCapability(
2201             WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
2202                 NULL);
2203         if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2204             if (FAILED(hr)) {
2205                 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
2206             } else {
2207                 env->tsc_khz = freq / 1000; /* Hz to KHz */
2208             }
2209         }
2210     }
2211 
2212     env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
2213     hr = whp_dispatch.WHvGetCapability(
2214         WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
2215     if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2216         if (FAILED(hr)) {
2217             printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
2218         } else {
2219             env->apic_bus_freq = freq;
2220         }
2221     }
2222 
2223     /*
2224      * If the vmware cpuid frequency leaf option is set, and we have a valid
2225      * tsc value, trap the corresponding cpuid's.
2226      */
2227     if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
2228         UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
2229 
2230         hr = whp_dispatch.WHvSetPartitionProperty(
2231                 whpx->partition,
2232                 WHvPartitionPropertyCodeCpuidExitList,
2233                 cpuidExitList,
2234                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2235 
2236         if (FAILED(hr)) {
2237             error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2238                         hr);
2239             ret = -EINVAL;
2240             goto error;
2241         }
2242     }
2243 
2244     vcpu->interruptable = true;
2245     cpu->vcpu_dirty = true;
2246     cpu->accel = vcpu;
2247     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
2248     qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr);
2249 
2250     return 0;
2251 
2252 error:
2253     g_free(vcpu);
2254 
2255     return ret;
2256 }
2257 
2258 int whpx_vcpu_exec(CPUState *cpu)
2259 {
2260     int ret;
2261     int fatal;
2262 
2263     for (;;) {
2264         if (cpu->exception_index >= EXCP_INTERRUPT) {
2265             ret = cpu->exception_index;
2266             cpu->exception_index = -1;
2267             break;
2268         }
2269 
2270         fatal = whpx_vcpu_run(cpu);
2271 
2272         if (fatal) {
2273             error_report("WHPX: Failed to exec a virtual processor");
2274             abort();
2275         }
2276     }
2277 
2278     return ret;
2279 }
2280 
2281 void whpx_destroy_vcpu(CPUState *cpu)
2282 {
2283     struct whpx_state *whpx = &whpx_global;
2284     AccelCPUState *vcpu = cpu->accel;
2285 
2286     whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
2287     whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2288     g_free(cpu->accel);
2289     return;
2290 }
2291 
2292 void whpx_vcpu_kick(CPUState *cpu)
2293 {
2294     struct whpx_state *whpx = &whpx_global;
2295     whp_dispatch.WHvCancelRunVirtualProcessor(
2296         whpx->partition, cpu->cpu_index, 0);
2297 }
2298 
2299 /*
2300  * Memory support.
2301  */
2302 
2303 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
2304                                 void *host_va, int add, int rom,
2305                                 const char *name)
2306 {
2307     struct whpx_state *whpx = &whpx_global;
2308     HRESULT hr;
2309 
2310     /*
2311     if (add) {
2312         printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2313                (void*)start_pa, (void*)size, host_va,
2314                (rom ? "ROM" : "RAM"), name);
2315     } else {
2316         printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
2317                (void*)start_pa, (void*)size, host_va, name);
2318     }
2319     */
2320 
2321     if (add) {
2322         hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
2323                                          host_va,
2324                                          start_pa,
2325                                          size,
2326                                          (WHvMapGpaRangeFlagRead |
2327                                           WHvMapGpaRangeFlagExecute |
2328                                           (rom ? 0 : WHvMapGpaRangeFlagWrite)));
2329     } else {
2330         hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
2331                                            start_pa,
2332                                            size);
2333     }
2334 
2335     if (FAILED(hr)) {
2336         error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2337                      " Host:%p, hr=%08lx",
2338                      (add ? "MAP" : "UNMAP"), name,
2339                      (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
2340     }
2341 }
2342 
2343 static void whpx_process_section(MemoryRegionSection *section, int add)
2344 {
2345     MemoryRegion *mr = section->mr;
2346     hwaddr start_pa = section->offset_within_address_space;
2347     ram_addr_t size = int128_get64(section->size);
2348     unsigned int delta;
2349     uint64_t host_va;
2350 
2351     if (!memory_region_is_ram(mr)) {
2352         return;
2353     }
2354 
2355     delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
2356     delta &= ~qemu_real_host_page_mask();
2357     if (delta > size) {
2358         return;
2359     }
2360     start_pa += delta;
2361     size -= delta;
2362     size &= qemu_real_host_page_mask();
2363     if (!size || (start_pa & ~qemu_real_host_page_mask())) {
2364         return;
2365     }
2366 
2367     host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
2368             + section->offset_within_region + delta;
2369 
2370     whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
2371                         memory_region_is_rom(mr), mr->name);
2372 }
2373 
2374 static void whpx_region_add(MemoryListener *listener,
2375                            MemoryRegionSection *section)
2376 {
2377     memory_region_ref(section->mr);
2378     whpx_process_section(section, 1);
2379 }
2380 
2381 static void whpx_region_del(MemoryListener *listener,
2382                            MemoryRegionSection *section)
2383 {
2384     whpx_process_section(section, 0);
2385     memory_region_unref(section->mr);
2386 }
2387 
2388 static void whpx_transaction_begin(MemoryListener *listener)
2389 {
2390 }
2391 
2392 static void whpx_transaction_commit(MemoryListener *listener)
2393 {
2394 }
2395 
2396 static void whpx_log_sync(MemoryListener *listener,
2397                          MemoryRegionSection *section)
2398 {
2399     MemoryRegion *mr = section->mr;
2400 
2401     if (!memory_region_is_ram(mr)) {
2402         return;
2403     }
2404 
2405     memory_region_set_dirty(mr, 0, int128_get64(section->size));
2406 }
2407 
2408 static MemoryListener whpx_memory_listener = {
2409     .name = "whpx",
2410     .begin = whpx_transaction_begin,
2411     .commit = whpx_transaction_commit,
2412     .region_add = whpx_region_add,
2413     .region_del = whpx_region_del,
2414     .log_sync = whpx_log_sync,
2415     .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
2416 };
2417 
2418 static void whpx_memory_init(void)
2419 {
2420     memory_listener_register(&whpx_memory_listener, &address_space_memory);
2421 }
2422 
2423 /*
2424  * Load the functions from the given library, using the given handle. If a
2425  * handle is provided, it is used, otherwise the library is opened. The
2426  * handle will be updated on return with the opened one.
2427  */
2428 static bool load_whp_dispatch_fns(HMODULE *handle,
2429     WHPFunctionList function_list)
2430 {
2431     HMODULE hLib = *handle;
2432 
2433     #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
2434     #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
2435     #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
2436         whp_dispatch.function_name = \
2437             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2438 
2439     #define WHP_LOAD_FIELD(return_type, function_name, signature) \
2440         whp_dispatch.function_name = \
2441             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2442         if (!whp_dispatch.function_name) { \
2443             error_report("Could not load function %s", #function_name); \
2444             goto error; \
2445         } \
2446 
2447     #define WHP_LOAD_LIB(lib_name, handle_lib) \
2448     if (!handle_lib) { \
2449         handle_lib = LoadLibrary(lib_name); \
2450         if (!handle_lib) { \
2451             error_report("Could not load library %s.", lib_name); \
2452             goto error; \
2453         } \
2454     } \
2455 
2456     switch (function_list) {
2457     case WINHV_PLATFORM_FNS_DEFAULT:
2458         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2459         LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
2460         break;
2461 
2462     case WINHV_EMULATION_FNS_DEFAULT:
2463         WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
2464         LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
2465         break;
2466 
2467     case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
2468         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2469         LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
2470         break;
2471     }
2472 
2473     *handle = hLib;
2474     return true;
2475 
2476 error:
2477     if (hLib) {
2478         FreeLibrary(hLib);
2479     }
2480 
2481     return false;
2482 }
2483 
2484 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2485                                    const char *name, void *opaque,
2486                                    Error **errp)
2487 {
2488     struct whpx_state *whpx = &whpx_global;
2489     OnOffSplit mode;
2490 
2491     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2492         return;
2493     }
2494 
2495     switch (mode) {
2496     case ON_OFF_SPLIT_ON:
2497         whpx->kernel_irqchip_allowed = true;
2498         whpx->kernel_irqchip_required = true;
2499         break;
2500 
2501     case ON_OFF_SPLIT_OFF:
2502         whpx->kernel_irqchip_allowed = false;
2503         whpx->kernel_irqchip_required = false;
2504         break;
2505 
2506     case ON_OFF_SPLIT_SPLIT:
2507         error_setg(errp, "WHPX: split irqchip currently not supported");
2508         error_append_hint(errp,
2509             "Try without kernel-irqchip or with kernel-irqchip=on|off");
2510         break;
2511 
2512     default:
2513         /*
2514          * The value was checked in visit_type_OnOffSplit() above. If
2515          * we get here, then something is wrong in QEMU.
2516          */
2517         abort();
2518     }
2519 }
2520 
2521 /*
2522  * Partition support
2523  */
2524 
2525 static int whpx_accel_init(MachineState *ms)
2526 {
2527     struct whpx_state *whpx;
2528     int ret;
2529     HRESULT hr;
2530     WHV_CAPABILITY whpx_cap;
2531     UINT32 whpx_cap_size;
2532     WHV_PARTITION_PROPERTY prop;
2533     UINT32 cpuidExitList[] = {1, 0x80000001};
2534     WHV_CAPABILITY_FEATURES features = {0};
2535 
2536     whpx = &whpx_global;
2537 
2538     if (!init_whp_dispatch()) {
2539         ret = -ENOSYS;
2540         goto error;
2541     }
2542 
2543     whpx->mem_quota = ms->ram_size;
2544 
2545     hr = whp_dispatch.WHvGetCapability(
2546         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2547         sizeof(whpx_cap), &whpx_cap_size);
2548     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2549         error_report("WHPX: No accelerator found, hr=%08lx", hr);
2550         ret = -ENOSPC;
2551         goto error;
2552     }
2553 
2554     hr = whp_dispatch.WHvGetCapability(
2555         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2556     if (FAILED(hr)) {
2557         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2558         ret = -EINVAL;
2559         goto error;
2560     }
2561 
2562     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2563     if (FAILED(hr)) {
2564         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2565         ret = -EINVAL;
2566         goto error;
2567     }
2568 
2569     /*
2570      * Query the XSAVE capability of the partition. Any error here is not
2571      * considered fatal.
2572      */
2573     hr = whp_dispatch.WHvGetPartitionProperty(
2574         whpx->partition,
2575         WHvPartitionPropertyCodeProcessorXsaveFeatures,
2576         &whpx_xsave_cap,
2577         sizeof(whpx_xsave_cap),
2578         &whpx_cap_size);
2579 
2580     /*
2581      * Windows version which don't support this property will return with the
2582      * specific error code.
2583      */
2584     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2585         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2586     }
2587 
2588     if (!whpx_has_xsave()) {
2589         printf("WHPX: Partition is not XSAVE capable\n");
2590     }
2591 
2592     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2593     prop.ProcessorCount = ms->smp.cpus;
2594     hr = whp_dispatch.WHvSetPartitionProperty(
2595         whpx->partition,
2596         WHvPartitionPropertyCodeProcessorCount,
2597         &prop,
2598         sizeof(WHV_PARTITION_PROPERTY));
2599 
2600     if (FAILED(hr)) {
2601         error_report("WHPX: Failed to set partition processor count to %u,"
2602                      " hr=%08lx", prop.ProcessorCount, hr);
2603         ret = -EINVAL;
2604         goto error;
2605     }
2606 
2607     /*
2608      * Error out if WHP doesn't support apic emulation and user is requiring
2609      * it.
2610      */
2611     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2612             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2613         error_report("WHPX: kernel irqchip requested, but unavailable. "
2614             "Try without kernel-irqchip or with kernel-irqchip=off");
2615         ret = -EINVAL;
2616         goto error;
2617     }
2618 
2619     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2620         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2621         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2622             WHvX64LocalApicEmulationModeXApic;
2623         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2624         hr = whp_dispatch.WHvSetPartitionProperty(
2625             whpx->partition,
2626             WHvPartitionPropertyCodeLocalApicEmulationMode,
2627             &mode,
2628             sizeof(mode));
2629         if (FAILED(hr)) {
2630             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2631             if (whpx->kernel_irqchip_required) {
2632                 error_report("WHPX: kernel irqchip requested, but unavailable");
2633                 ret = -EINVAL;
2634                 goto error;
2635             }
2636         } else {
2637             whpx->apic_in_platform = true;
2638         }
2639     }
2640 
2641     /* Register for MSR and CPUID exits */
2642     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2643     prop.ExtendedVmExits.X64MsrExit = 1;
2644     prop.ExtendedVmExits.X64CpuidExit = 1;
2645     prop.ExtendedVmExits.ExceptionExit = 1;
2646     if (whpx_apic_in_platform()) {
2647         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2648     }
2649 
2650     hr = whp_dispatch.WHvSetPartitionProperty(
2651             whpx->partition,
2652             WHvPartitionPropertyCodeExtendedVmExits,
2653             &prop,
2654             sizeof(WHV_PARTITION_PROPERTY));
2655     if (FAILED(hr)) {
2656         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2657         ret = -EINVAL;
2658         goto error;
2659     }
2660 
2661     hr = whp_dispatch.WHvSetPartitionProperty(
2662         whpx->partition,
2663         WHvPartitionPropertyCodeCpuidExitList,
2664         cpuidExitList,
2665         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2666 
2667     if (FAILED(hr)) {
2668         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2669                      hr);
2670         ret = -EINVAL;
2671         goto error;
2672     }
2673 
2674     /*
2675      * We do not want to intercept any exceptions from the guest,
2676      * until we actually start debugging with gdb.
2677      */
2678     whpx->exception_exit_bitmap = -1;
2679     hr = whpx_set_exception_exit_bitmap(0);
2680 
2681     if (FAILED(hr)) {
2682         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2683         ret = -EINVAL;
2684         goto error;
2685     }
2686 
2687     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2688     if (FAILED(hr)) {
2689         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2690         ret = -EINVAL;
2691         goto error;
2692     }
2693 
2694     whpx_memory_init();
2695 
2696     printf("Windows Hypervisor Platform accelerator is operational\n");
2697     return 0;
2698 
2699 error:
2700 
2701     if (NULL != whpx->partition) {
2702         whp_dispatch.WHvDeletePartition(whpx->partition);
2703         whpx->partition = NULL;
2704     }
2705 
2706     return ret;
2707 }
2708 
2709 int whpx_enabled(void)
2710 {
2711     return whpx_allowed;
2712 }
2713 
2714 bool whpx_apic_in_platform(void) {
2715     return whpx_global.apic_in_platform;
2716 }
2717 
2718 static void whpx_accel_class_init(ObjectClass *oc, void *data)
2719 {
2720     AccelClass *ac = ACCEL_CLASS(oc);
2721     ac->name = "WHPX";
2722     ac->init_machine = whpx_accel_init;
2723     ac->allowed = &whpx_allowed;
2724 
2725     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
2726         NULL, whpx_set_kernel_irqchip,
2727         NULL, NULL);
2728     object_class_property_set_description(oc, "kernel-irqchip",
2729         "Configure WHPX in-kernel irqchip");
2730 }
2731 
2732 static void whpx_accel_instance_init(Object *obj)
2733 {
2734     struct whpx_state *whpx = &whpx_global;
2735 
2736     memset(whpx, 0, sizeof(struct whpx_state));
2737     /* Turn on kernel-irqchip, by default */
2738     whpx->kernel_irqchip_allowed = true;
2739 }
2740 
2741 static const TypeInfo whpx_accel_type = {
2742     .name = ACCEL_CLASS_NAME("whpx"),
2743     .parent = TYPE_ACCEL,
2744     .instance_init = whpx_accel_instance_init,
2745     .class_init = whpx_accel_class_init,
2746 };
2747 
2748 static void whpx_type_init(void)
2749 {
2750     type_register_static(&whpx_accel_type);
2751 }
2752 
2753 bool init_whp_dispatch(void)
2754 {
2755     if (whp_dispatch_initialized) {
2756         return true;
2757     }
2758 
2759     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2760         goto error;
2761     }
2762 
2763     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2764         goto error;
2765     }
2766 
2767     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2768         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2769     whp_dispatch_initialized = true;
2770 
2771     return true;
2772 error:
2773     if (hWinHvPlatform) {
2774         FreeLibrary(hWinHvPlatform);
2775     }
2776 
2777     if (hWinHvEmulation) {
2778         FreeLibrary(hWinHvEmulation);
2779     }
2780 
2781     return false;
2782 }
2783 
/*
 * Hook whpx_type_init() into type_init(); presumably this makes the
 * registration run during QEMU's module/type initialization - confirm
 * against the type_init() macro definition.
 */
type_init(whpx_type_init);
2785