xref: /openbmc/qemu/target/i386/whpx/whpx-all.c (revision 63dd7bcb)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "exec/address-spaces.h"
14 #include "exec/ioport.h"
15 #include "exec/gdbstub.h"
16 #include "qemu/accel.h"
17 #include "sysemu/whpx.h"
18 #include "sysemu/cpus.h"
19 #include "sysemu/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/i386/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include <winerror.h>
30 
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
33 
34 #include <WinHvPlatform.h>
35 #include <WinHvEmulation.h>
36 
37 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
38 
/*
 * Registers transferred in one batch by whpx_get_registers() and
 * whpx_set_registers().  The ordering of entries is an invariant shared
 * with those two functions: each of them walks this list with a running
 * index and asserts the expected register name at every step, so entries
 * must not be reordered or removed without updating both walkers.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers - intentionally not synchronized */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers - intentionally not synchronized */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};
146 
/*
 * Value buffer matching whpx_register_names[] index-for-index; used as
 * the batch payload for WHvGet/SetVirtualProcessorRegisters().
 */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
150 
151 /*
152  * The current implementation of instruction stepping sets the TF flag
153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
155  *
156  * This approach has a few limitations:
157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158  *        along with the other flags, possibly restoring it later. It would
159  *        result in another INT1 when the flags are restored, triggering
160  *        a stop in gdb that could be cleared by doing another step.
161  *
162  *        Stepping over a POPF/LAHF instruction will let it overwrite the
163  *        TF flags, ending the stepping mode.
164  *
165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166  *        or anything that could result in a page fault) will save the flags
167  *        to the stack, clear the TF flag, and let the guest execute the
168  *        handler. Normally, the guest will restore the original flags,
169  *        that will continue single-stepping.
170  *
171  *     3. Debuggers running on the guest may wish to set TF to do instruction
172  *        stepping. INT1 events generated by it would be intercepted by us,
173  *        as long as the gdb is connected to QEMU.
174  *
175  * In practice this means that:
176  *     1. Stepping through flags-modifying instructions may cause gdb to
177  *        continue or stop in unexpected places. This will be fully recoverable
178  *        and will not crash the target.
179  *
180  *     2. Stepping over an instruction that triggers an exception will step
181  *        over the exception handler, not into it.
182  *
183  *     3. Debugging the guest via gdb, while running debugger on the guest
184  *        at the same time may lead to unexpected effects. Removing all
185  *        breakpoints set via QEMU will prevent any further interference
186  *        with the guest-level debuggers.
187  *
188  * The limitations can be addressed as shown below:
189  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
190  *        stepping through them. The exact semantics of the instructions is
191  *        defined in the "Combined Volume Set of Intel 64 and IA-32
192  *        Architectures Software Developer's Manuals", however it involves a
193  *        fair amount of corner cases due to compatibility with real mode,
194  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
195  *
196  *     2. We could step into the guest's exception handlers using the following
197  *        sequence:
198  *          a. Temporarily enable catching of all exception types via
199  *             whpx_set_exception_exit_bitmap().
200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
201  *             the original handler.
202  *          c. Patch the original handler, injecting an INT3 at the beginning.
203  *          d. Update the exception exit bitmap to only catch the
204  *             WHvX64ExceptionTypeBreakpointTrap exception.
205  *          e. Let the affected CPU run in the exclusive mode.
206  *          f. Restore the original handler and the exception exit bitmap.
207  *        Note that handling all corner cases related to IDT/GDT is harder
208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
209  *        rough idea.
210  *
211  *     3. In order to properly support guest-level debugging in parallel with
212  *        the QEMU-level debugging, we would need to be able to pass some INT1
213  *        events to the guest. This could be done via the following methods:
214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215  *             it seems to only work for interrupts and not software
216  *             exceptions.
217  *          b. Locating and patching the original handler by parsing IDT/GDT.
218  *             This involves relatively complex logic outlined in the previous
219  *             paragraph.
220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
221  *             RFLAGS, and pushing the old values to stack). This is even more
222  *             complicated than the previous option, since it involves checking
223  *             CPL, gate attributes, and doing various adjustments depending
224  *             on the current CPU mode, whether the CPL is changing, etc.
225  */
/* Scope of a single-step request with respect to the other vCPUs. */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
231 
/* Per-vCPU accelerator state; retrieved from a CPUState by get_whpx_vcpu(). */
struct whpx_vcpu {
    /* Instruction emulator instance used for MMIO/PIO exits. */
    WHV_EMULATOR_HANDLE emulator;
    /* NOTE(review): presumably tracks a pending interrupt-window request;
     * confirm against the run-loop code outside this chunk. */
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    /* CR8-encoded TPR as last synchronized with the emulated APIC. */
    uint64_t tpr;
    /* APIC base MSR value as last synchronized with the emulated APIC. */
    uint64_t apic_base;
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
244 
/* Accelerator-wide state. */
static bool whpx_allowed;
static bool whp_dispatch_initialized;
/* Lazily-loaded WHPX DLL handles backing the whp_dispatch function table. */
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
/* Host XSAVE capabilities; queried via whpx_has_xsave(). */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;
253 
/* True if the hypervisor reported XSAVE support for virtual processors. */
static bool whpx_has_xsave(void)
{
    return whpx_xsave_cap.XsaveSupport;
}
258 
259 /*
260  * VP support
261  */
262 
/* Return the WHPX-private vCPU state stored in the generic hax_vcpu slot. */
static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu)
{
    return (struct whpx_vcpu *)cpu->hax_vcpu;
}
267 
/*
 * Convert a QEMU SegmentCache into the hypervisor's segment register
 * layout.
 *
 * QEMU stores the descriptor attribute bits shifted up by DESC_TYPE_SHIFT
 * inside 'flags'; WHV keeps them at bit 0 of 'Attributes'.  Note that
 * 'Attributes' overlays the individual bitfields (SegmentType, Present,
 * etc.), so the v86 branch first clears it and then sets the fields.
 *
 * v86: virtual-8086 mode - synthesize the fixed attributes of a present,
 *      DPL-3, type-3 non-system (data) segment rather than using the
 *      cached flags.
 * r86: real mode - the base truncation is intentionally left disabled
 *      (see the commented-out line).
 */
static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER hs;
    unsigned flags = qs->flags;

    hs.Base = qs->base;
    hs.Limit = qs->limit;
    hs.Selector = qs->selector;

    if (v86) {
        hs.Attributes = 0;
        hs.SegmentType = 3;
        hs.Present = 1;
        hs.DescriptorPrivilegeLevel = 3;
        hs.NonSystemSegment = 1;

    } else {
        hs.Attributes = (flags >> DESC_TYPE_SHIFT);

        if (r86) {
            /* hs.Base &= 0xfffff; */
        }
    }

    return hs;
}
295 
296 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
297 {
298     SegmentCache qs;
299 
300     qs.base = hs->Base;
301     qs.limit = hs->Limit;
302     qs.selector = hs->Selector;
303 
304     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
305 
306     return qs;
307 }
308 
309 /* X64 Extended Control Registers */
310 static void whpx_set_xcrs(CPUState *cpu)
311 {
312     CPUX86State *env = cpu->env_ptr;
313     HRESULT hr;
314     struct whpx_state *whpx = &whpx_global;
315     WHV_REGISTER_VALUE xcr0;
316     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
317 
318     if (!whpx_has_xsave()) {
319         return;
320     }
321 
322     /* Only xcr0 is supported by the hypervisor currently */
323     xcr0.Reg64 = env->xcr0;
324     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
325         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
326     if (FAILED(hr)) {
327         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
328     }
329 }
330 
331 static int whpx_set_tsc(CPUState *cpu)
332 {
333     CPUX86State *env = cpu->env_ptr;
334     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
335     WHV_REGISTER_VALUE tsc_val;
336     HRESULT hr;
337     struct whpx_state *whpx = &whpx_global;
338 
339     /*
340      * Suspend the partition prior to setting the TSC to reduce the variance
341      * in TSC across vCPUs. When the first vCPU runs post suspend, the
342      * partition is automatically resumed.
343      */
344     if (whp_dispatch.WHvSuspendPartitionTime) {
345 
346         /*
347          * Unable to suspend partition while setting TSC is not a fatal
348          * error. It just increases the likelihood of TSC variance between
349          * vCPUs and some guest OS are able to handle that just fine.
350          */
351         hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
352         if (FAILED(hr)) {
353             warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
354         }
355     }
356 
357     tsc_val.Reg64 = env->tsc;
358     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
359         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
360     if (FAILED(hr)) {
361         error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
362         return -1;
363     }
364 
365     return 0;
366 }
367 
368 /*
369  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
370  * however, they use a slightly different encoding. Specifically:
371  *
372  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
373  *
374  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
375  * and IA-32 Architectures Software Developer's Manual.
376  *
377  * The functions below translate the value of CR8 to TPR and vice versa.
378  */
379 
380 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
381 {
382     return tpr >> 4;
383 }
384 
385 static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
386 {
387     return cr8 << 4;
388 }
389 
390 static void whpx_set_registers(CPUState *cpu, int level)
391 {
392     struct whpx_state *whpx = &whpx_global;
393     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
394     CPUX86State *env = cpu->env_ptr;
395     X86CPU *x86_cpu = X86_CPU(cpu);
396     struct whpx_register_set vcxt;
397     HRESULT hr;
398     int idx;
399     int idx_next;
400     int i;
401     int v86, r86;
402 
403     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
404 
405     /*
406      * Following MSRs have side effects on the guest or are too heavy for
407      * runtime. Limit them to full state update.
408      */
409     if (level >= WHPX_SET_RESET_STATE) {
410         whpx_set_tsc(cpu);
411     }
412 
413     memset(&vcxt, 0, sizeof(struct whpx_register_set));
414 
415     v86 = (env->eflags & VM_MASK);
416     r86 = !(env->cr[0] & CR0_PE_MASK);
417 
418     vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
419     vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);
420 
421     idx = 0;
422 
423     /* Indexes for first 16 registers match between HV and QEMU definitions */
424     idx_next = 16;
425     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
426         vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
427     }
428     idx = idx_next;
429 
430     /* Same goes for RIP and RFLAGS */
431     assert(whpx_register_names[idx] == WHvX64RegisterRip);
432     vcxt.values[idx++].Reg64 = env->eip;
433 
434     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
435     vcxt.values[idx++].Reg64 = env->eflags;
436 
437     /* Translate 6+4 segment registers. HV and QEMU order matches  */
438     assert(idx == WHvX64RegisterEs);
439     for (i = 0; i < 6; i += 1, idx += 1) {
440         vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
441     }
442 
443     assert(idx == WHvX64RegisterLdtr);
444     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
445 
446     assert(idx == WHvX64RegisterTr);
447     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
448 
449     assert(idx == WHvX64RegisterIdtr);
450     vcxt.values[idx].Table.Base = env->idt.base;
451     vcxt.values[idx].Table.Limit = env->idt.limit;
452     idx += 1;
453 
454     assert(idx == WHvX64RegisterGdtr);
455     vcxt.values[idx].Table.Base = env->gdt.base;
456     vcxt.values[idx].Table.Limit = env->gdt.limit;
457     idx += 1;
458 
459     /* CR0, 2, 3, 4, 8 */
460     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
461     vcxt.values[idx++].Reg64 = env->cr[0];
462     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
463     vcxt.values[idx++].Reg64 = env->cr[2];
464     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
465     vcxt.values[idx++].Reg64 = env->cr[3];
466     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
467     vcxt.values[idx++].Reg64 = env->cr[4];
468     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
469     vcxt.values[idx++].Reg64 = vcpu->tpr;
470 
471     /* 8 Debug Registers - Skipped */
472 
473     /*
474      * Extended control registers needs to be handled separately depending
475      * on whether xsave is supported/enabled or not.
476      */
477     whpx_set_xcrs(cpu);
478 
479     /* 16 XMM registers */
480     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
481     idx_next = idx + 16;
482     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
483         vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
484         vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
485     }
486     idx = idx_next;
487 
488     /* 8 FP registers */
489     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
490     for (i = 0; i < 8; i += 1, idx += 1) {
491         vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
492         /* vcxt.values[idx].Fp.AsUINT128.High64 =
493                env->fpregs[i].mmx.MMX_Q(1);
494         */
495     }
496 
497     /* FP control status register */
498     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
499     vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
500     vcxt.values[idx].FpControlStatus.FpStatus =
501         (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
502     vcxt.values[idx].FpControlStatus.FpTag = 0;
503     for (i = 0; i < 8; ++i) {
504         vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
505     }
506     vcxt.values[idx].FpControlStatus.Reserved = 0;
507     vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
508     vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
509     idx += 1;
510 
511     /* XMM control status register */
512     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
513     vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
514     vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
515     vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
516     idx += 1;
517 
518     /* MSRs */
519     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
520     vcxt.values[idx++].Reg64 = env->efer;
521 #ifdef TARGET_X86_64
522     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
523     vcxt.values[idx++].Reg64 = env->kernelgsbase;
524 #endif
525 
526     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
527     vcxt.values[idx++].Reg64 = vcpu->apic_base;
528 
529     /* WHvX64RegisterPat - Skipped */
530 
531     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
532     vcxt.values[idx++].Reg64 = env->sysenter_cs;
533     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
534     vcxt.values[idx++].Reg64 = env->sysenter_eip;
535     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
536     vcxt.values[idx++].Reg64 = env->sysenter_esp;
537     assert(whpx_register_names[idx] == WHvX64RegisterStar);
538     vcxt.values[idx++].Reg64 = env->star;
539 #ifdef TARGET_X86_64
540     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
541     vcxt.values[idx++].Reg64 = env->lstar;
542     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
543     vcxt.values[idx++].Reg64 = env->cstar;
544     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
545     vcxt.values[idx++].Reg64 = env->fmask;
546 #endif
547 
548     /* Interrupt / Event Registers - Skipped */
549 
550     assert(idx == RTL_NUMBER_OF(whpx_register_names));
551 
552     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
553         whpx->partition, cpu->cpu_index,
554         whpx_register_names,
555         RTL_NUMBER_OF(whpx_register_names),
556         &vcxt.values[0]);
557 
558     if (FAILED(hr)) {
559         error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
560                      hr);
561     }
562 
563     return;
564 }
565 
566 static int whpx_get_tsc(CPUState *cpu)
567 {
568     CPUX86State *env = cpu->env_ptr;
569     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
570     WHV_REGISTER_VALUE tsc_val;
571     HRESULT hr;
572     struct whpx_state *whpx = &whpx_global;
573 
574     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
575         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
576     if (FAILED(hr)) {
577         error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
578         return -1;
579     }
580 
581     env->tsc = tsc_val.Reg64;
582     return 0;
583 }
584 
585 /* X64 Extended Control Registers */
586 static void whpx_get_xcrs(CPUState *cpu)
587 {
588     CPUX86State *env = cpu->env_ptr;
589     HRESULT hr;
590     struct whpx_state *whpx = &whpx_global;
591     WHV_REGISTER_VALUE xcr0;
592     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
593 
594     if (!whpx_has_xsave()) {
595         return;
596     }
597 
598     /* Only xcr0 is supported by the hypervisor currently */
599     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
600         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
601     if (FAILED(hr)) {
602         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
603         return;
604     }
605 
606     env->xcr0 = xcr0.Reg64;
607 }
608 
609 static void whpx_get_registers(CPUState *cpu)
610 {
611     struct whpx_state *whpx = &whpx_global;
612     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
613     CPUX86State *env = cpu->env_ptr;
614     X86CPU *x86_cpu = X86_CPU(cpu);
615     struct whpx_register_set vcxt;
616     uint64_t tpr, apic_base;
617     HRESULT hr;
618     int idx;
619     int idx_next;
620     int i;
621 
622     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
623 
624     if (!env->tsc_valid) {
625         whpx_get_tsc(cpu);
626         env->tsc_valid = !runstate_is_running();
627     }
628 
629     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
630         whpx->partition, cpu->cpu_index,
631         whpx_register_names,
632         RTL_NUMBER_OF(whpx_register_names),
633         &vcxt.values[0]);
634     if (FAILED(hr)) {
635         error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
636                      hr);
637     }
638 
639     if (whpx_apic_in_platform()) {
640         /*
641          * Fetch the TPR value from the emulated APIC. It may get overwritten
642          * below with the value from CR8 returned by
643          * WHvGetVirtualProcessorRegisters().
644          */
645         whpx_apic_get(x86_cpu->apic_state);
646         vcpu->tpr = whpx_apic_tpr_to_cr8(
647             cpu_get_apic_tpr(x86_cpu->apic_state));
648     }
649 
650     idx = 0;
651 
652     /* Indexes for first 16 registers match between HV and QEMU definitions */
653     idx_next = 16;
654     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
655         env->regs[idx] = vcxt.values[idx].Reg64;
656     }
657     idx = idx_next;
658 
659     /* Same goes for RIP and RFLAGS */
660     assert(whpx_register_names[idx] == WHvX64RegisterRip);
661     env->eip = vcxt.values[idx++].Reg64;
662     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
663     env->eflags = vcxt.values[idx++].Reg64;
664 
665     /* Translate 6+4 segment registers. HV and QEMU order matches  */
666     assert(idx == WHvX64RegisterEs);
667     for (i = 0; i < 6; i += 1, idx += 1) {
668         env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
669     }
670 
671     assert(idx == WHvX64RegisterLdtr);
672     env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
673     assert(idx == WHvX64RegisterTr);
674     env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
675     assert(idx == WHvX64RegisterIdtr);
676     env->idt.base = vcxt.values[idx].Table.Base;
677     env->idt.limit = vcxt.values[idx].Table.Limit;
678     idx += 1;
679     assert(idx == WHvX64RegisterGdtr);
680     env->gdt.base = vcxt.values[idx].Table.Base;
681     env->gdt.limit = vcxt.values[idx].Table.Limit;
682     idx += 1;
683 
684     /* CR0, 2, 3, 4, 8 */
685     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
686     env->cr[0] = vcxt.values[idx++].Reg64;
687     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
688     env->cr[2] = vcxt.values[idx++].Reg64;
689     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
690     env->cr[3] = vcxt.values[idx++].Reg64;
691     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
692     env->cr[4] = vcxt.values[idx++].Reg64;
693     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
694     tpr = vcxt.values[idx++].Reg64;
695     if (tpr != vcpu->tpr) {
696         vcpu->tpr = tpr;
697         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
698     }
699 
700     /* 8 Debug Registers - Skipped */
701 
702     /*
703      * Extended control registers needs to be handled separately depending
704      * on whether xsave is supported/enabled or not.
705      */
706     whpx_get_xcrs(cpu);
707 
708     /* 16 XMM registers */
709     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
710     idx_next = idx + 16;
711     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
712         env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
713         env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
714     }
715     idx = idx_next;
716 
717     /* 8 FP registers */
718     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
719     for (i = 0; i < 8; i += 1, idx += 1) {
720         env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
721         /* env->fpregs[i].mmx.MMX_Q(1) =
722                vcxt.values[idx].Fp.AsUINT128.High64;
723         */
724     }
725 
726     /* FP control status register */
727     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
728     env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
729     env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
730     env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
731     for (i = 0; i < 8; ++i) {
732         env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
733     }
734     env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
735     env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
736     idx += 1;
737 
738     /* XMM control status register */
739     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
740     env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
741     idx += 1;
742 
743     /* MSRs */
744     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
745     env->efer = vcxt.values[idx++].Reg64;
746 #ifdef TARGET_X86_64
747     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
748     env->kernelgsbase = vcxt.values[idx++].Reg64;
749 #endif
750 
751     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
752     apic_base = vcxt.values[idx++].Reg64;
753     if (apic_base != vcpu->apic_base) {
754         vcpu->apic_base = apic_base;
755         cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
756     }
757 
758     /* WHvX64RegisterPat - Skipped */
759 
760     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
761     env->sysenter_cs = vcxt.values[idx++].Reg64;
762     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
763     env->sysenter_eip = vcxt.values[idx++].Reg64;
764     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
765     env->sysenter_esp = vcxt.values[idx++].Reg64;
766     assert(whpx_register_names[idx] == WHvX64RegisterStar);
767     env->star = vcxt.values[idx++].Reg64;
768 #ifdef TARGET_X86_64
769     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
770     env->lstar = vcxt.values[idx++].Reg64;
771     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
772     env->cstar = vcxt.values[idx++].Reg64;
773     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
774     env->fmask = vcxt.values[idx++].Reg64;
775 #endif
776 
777     /* Interrupt / Event Registers - Skipped */
778 
779     assert(idx == RTL_NUMBER_OF(whpx_register_names));
780 
781     if (whpx_apic_in_platform()) {
782         whpx_apic_get(x86_cpu->apic_state);
783     }
784 
785     x86_update_hflags(env);
786 
787     return;
788 }
789 
/*
 * Emulator callback: perform a guest port I/O access through QEMU's
 * I/O address space.
 */
static HRESULT CALLBACK whpx_emu_ioport_callback(
    void *ctx,
    WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
    MemTxAttrs attrs = { 0 };
    address_space_rw(&address_space_io, IoAccess->Port, attrs,
                     &IoAccess->Data, IoAccess->AccessSize,
                     IoAccess->Direction);
    return S_OK;
}
800 
/*
 * Emulator callback: perform a guest MMIO access against QEMU's
 * physical memory map.
 */
static HRESULT CALLBACK whpx_emu_mmio_callback(
    void *ctx,
    WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
    cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
                           ma->Direction);
    return S_OK;
}
809 
810 static HRESULT CALLBACK whpx_emu_getreg_callback(
811     void *ctx,
812     const WHV_REGISTER_NAME *RegisterNames,
813     UINT32 RegisterCount,
814     WHV_REGISTER_VALUE *RegisterValues)
815 {
816     HRESULT hr;
817     struct whpx_state *whpx = &whpx_global;
818     CPUState *cpu = (CPUState *)ctx;
819 
820     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
821         whpx->partition, cpu->cpu_index,
822         RegisterNames, RegisterCount,
823         RegisterValues);
824     if (FAILED(hr)) {
825         error_report("WHPX: Failed to get virtual processor registers,"
826                      " hr=%08lx", hr);
827     }
828 
829     return hr;
830 }
831 
832 static HRESULT CALLBACK whpx_emu_setreg_callback(
833     void *ctx,
834     const WHV_REGISTER_NAME *RegisterNames,
835     UINT32 RegisterCount,
836     const WHV_REGISTER_VALUE *RegisterValues)
837 {
838     HRESULT hr;
839     struct whpx_state *whpx = &whpx_global;
840     CPUState *cpu = (CPUState *)ctx;
841 
842     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
843         whpx->partition, cpu->cpu_index,
844         RegisterNames, RegisterCount,
845         RegisterValues);
846     if (FAILED(hr)) {
847         error_report("WHPX: Failed to set virtual processor registers,"
848                      " hr=%08lx", hr);
849     }
850 
851     /*
852      * The emulator just successfully wrote the register state. We clear the
853      * dirty state so we avoid the double write on resume of the VP.
854      */
855     cpu->vcpu_dirty = false;
856 
857     return hr;
858 }
859 
860 static HRESULT CALLBACK whpx_emu_translate_callback(
861     void *ctx,
862     WHV_GUEST_VIRTUAL_ADDRESS Gva,
863     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
864     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
865     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
866 {
867     HRESULT hr;
868     struct whpx_state *whpx = &whpx_global;
869     CPUState *cpu = (CPUState *)ctx;
870     WHV_TRANSLATE_GVA_RESULT res;
871 
872     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
873                                       Gva, TranslateFlags, &res, Gpa);
874     if (FAILED(hr)) {
875         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
876     } else {
877         *TranslationResult = res.ResultCode;
878     }
879 
880     return hr;
881 }
882 
/* Callback table handed to WHvEmulatorCreateEmulator() for MMIO/PIO exits. */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
891 
892 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
893 {
894     HRESULT hr;
895     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
896     WHV_EMULATOR_STATUS emu_status;
897 
898     hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
899         vcpu->emulator, cpu,
900         &vcpu->exit_ctx.VpContext, ctx,
901         &emu_status);
902     if (FAILED(hr)) {
903         error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
904         return -1;
905     }
906 
907     if (!emu_status.EmulationSuccessful) {
908         error_report("WHPX: Failed to emulate MMIO access with"
909                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
910         return -1;
911     }
912 
913     return 0;
914 }
915 
916 static int whpx_handle_portio(CPUState *cpu,
917                               WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
918 {
919     HRESULT hr;
920     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
921     WHV_EMULATOR_STATUS emu_status;
922 
923     hr = whp_dispatch.WHvEmulatorTryIoEmulation(
924         vcpu->emulator, cpu,
925         &vcpu->exit_ctx.VpContext, ctx,
926         &emu_status);
927     if (FAILED(hr)) {
928         error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
929         return -1;
930     }
931 
932     if (!emu_status.EmulationSuccessful) {
933         error_report("WHPX: Failed to emulate PortIO access with"
934                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
935         return -1;
936     }
937 
938     return 0;
939 }
940 
941 /*
942  * Controls whether we should intercept various exceptions on the guest,
943  * namely breakpoint/single-step events.
944  *
945  * The 'exceptions' argument accepts a bitmask, e.g:
946  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
947  */
948 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
949 {
950     struct whpx_state *whpx = &whpx_global;
951     WHV_PARTITION_PROPERTY prop = { 0, };
952     HRESULT hr;
953 
954     if (exceptions == whpx->exception_exit_bitmap) {
955         return S_OK;
956     }
957 
958     prop.ExceptionExitBitmap = exceptions;
959 
960     hr = whp_dispatch.WHvSetPartitionProperty(
961         whpx->partition,
962         WHvPartitionPropertyCodeExceptionExitBitmap,
963         &prop,
964         sizeof(WHV_PARTITION_PROPERTY));
965 
966     if (SUCCEEDED(hr)) {
967         whpx->exception_exit_bitmap = exceptions;
968     }
969 
970     return hr;
971 }
972 
973 
974 /*
975  * This function is called before/after stepping over a single instruction.
976  * It will update the CPU registers to arm/disarm the instruction stepping
977  * accordingly.
978  */
979 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
980     bool set,
981     uint64_t *exit_context_rflags)
982 {
983     WHV_REGISTER_NAME reg_name;
984     WHV_REGISTER_VALUE reg_value;
985     HRESULT hr;
986     struct whpx_state *whpx = &whpx_global;
987 
988     /*
989      * If we are trying to step over a single instruction, we need to set the
990      * TF bit in rflags. Otherwise, clear it.
991      */
992     reg_name = WHvX64RegisterRflags;
993     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
994         whpx->partition,
995         cpu->cpu_index,
996         &reg_name,
997         1,
998         &reg_value);
999 
1000     if (FAILED(hr)) {
1001         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
1002         return hr;
1003     }
1004 
1005     if (exit_context_rflags) {
1006         assert(*exit_context_rflags == reg_value.Reg64);
1007     }
1008 
1009     if (set) {
1010         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
1011         reg_value.Reg64 |= TF_MASK;
1012     } else {
1013         reg_value.Reg64 &= ~TF_MASK;
1014     }
1015 
1016     if (exit_context_rflags) {
1017         *exit_context_rflags = reg_value.Reg64;
1018     }
1019 
1020     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1021         whpx->partition,
1022         cpu->cpu_index,
1023         &reg_name,
1024         1,
1025         &reg_value);
1026 
1027     if (FAILED(hr)) {
1028         error_report("WHPX: Failed to set rflags,"
1029             " hr=%08lx",
1030             hr);
1031         return hr;
1032     }
1033 
1034     reg_name = WHvRegisterInterruptState;
1035     reg_value.Reg64 = 0;
1036 
1037     /* Suspend delivery of hardware interrupts during single-stepping. */
1038     reg_value.InterruptState.InterruptShadow = set != 0;
1039 
1040     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1041     whpx->partition,
1042         cpu->cpu_index,
1043         &reg_name,
1044         1,
1045         &reg_value);
1046 
1047     if (FAILED(hr)) {
1048         error_report("WHPX: Failed to set InterruptState,"
1049             " hr=%08lx",
1050             hr);
1051         return hr;
1052     }
1053 
1054     if (!set) {
1055         /*
1056          * We have just finished stepping over a single instruction,
1057          * and intercepted the INT1 generated by it.
1058          * We need to now hide the INT1 from the guest,
1059          * as it would not be expecting it.
1060          */
1061 
1062         reg_name = WHvX64RegisterPendingDebugException;
1063         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1064         whpx->partition,
1065             cpu->cpu_index,
1066             &reg_name,
1067             1,
1068             &reg_value);
1069 
1070         if (FAILED(hr)) {
1071             error_report("WHPX: Failed to get pending debug exceptions,"
1072                          "hr=%08lx", hr);
1073             return hr;
1074         }
1075 
1076         if (reg_value.PendingDebugException.SingleStep) {
1077             reg_value.PendingDebugException.SingleStep = 0;
1078 
1079             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1080                 whpx->partition,
1081                 cpu->cpu_index,
1082                 &reg_name,
1083                 1,
1084                 &reg_value);
1085 
1086             if (FAILED(hr)) {
1087                 error_report("WHPX: Failed to clear pending debug exceptions,"
1088                              "hr=%08lx", hr);
1089              return hr;
1090             }
1091         }
1092 
1093     }
1094 
1095     return S_OK;
1096 }
1097 
1098 /* Tries to find a breakpoint at the specified address. */
1099 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1100 {
1101     struct whpx_state *whpx = &whpx_global;
1102     int i;
1103 
1104     if (whpx->breakpoints.breakpoints) {
1105         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1106             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1107                 return &whpx->breakpoints.breakpoints->data[i];
1108             }
1109         }
1110     }
1111 
1112     return NULL;
1113 }
1114 
1115 /*
1116  * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
1117  * debugging user-mode applications. Since the WHPX API does not offer
1118  * an easy way to pass the intercepted exception back to the guest, we
1119  * resort to using INT1 instead, and let the guest always handle INT3.
1120  */
1121 static const uint8_t whpx_breakpoint_instruction = 0xF1;
1122 
1123 /*
1124  * The WHPX QEMU backend implements breakpoints by writing the INT1
1125  * instruction into memory (ignoring the DRx registers). This raises a few
1126  * issues that need to be carefully handled:
1127  *
1128  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1129  *    at the same location, and later remove them in arbitrary order.
1130  *    This should not cause memory corruption, and should only remove the
1131  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1132  *
1133  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1134  *    physical location. Hence, physically adding/removing a breakpoint can
1135  *    theoretically fail at any time. We need to keep track of it.
1136  *
1137  * The function below rebuilds a list of low-level breakpoints (one per
1138  * address, tracking the original instruction and any errors) from the list of
1139  * high-level breakpoints (set via cpu_breakpoint_insert()).
1140  *
1141  * In order to optimize performance, this function stores the list of
1142  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1143  * low-level ones, so that it won't be re-invoked until these breakpoints
1144  * change.
1145  *
1146  * Note that this function decides which breakpoints should be inserted into,
1147  * memory, but doesn't actually do it. The memory accessing is done in
1148  * whpx_apply_breakpoints().
1149  */
1150 static void whpx_translate_cpu_breakpoints(
1151     struct whpx_breakpoints *breakpoints,
1152     CPUState *cpu,
1153     int cpu_breakpoint_count)
1154 {
1155     CPUBreakpoint *bp;
1156     int cpu_bp_index = 0;
1157 
1158     breakpoints->original_addresses =
1159         g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
1160 
1161     breakpoints->original_address_count = cpu_breakpoint_count;
1162 
1163     int max_breakpoints = cpu_breakpoint_count +
1164         (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
1165 
1166     struct whpx_breakpoint_collection *new_breakpoints =
1167         (struct whpx_breakpoint_collection *)g_malloc0(
1168         sizeof(struct whpx_breakpoint_collection) +
1169             max_breakpoints * sizeof(struct whpx_breakpoint));
1170 
1171     new_breakpoints->allocated = max_breakpoints;
1172     new_breakpoints->used = 0;
1173 
1174     /*
1175      * 1. Preserve all old breakpoints that could not be automatically
1176      * cleared when the CPU got stopped.
1177      */
1178     if (breakpoints->breakpoints) {
1179         int i;
1180         for (i = 0; i < breakpoints->breakpoints->used; i++) {
1181             if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
1182                 new_breakpoints->data[new_breakpoints->used++] =
1183                     breakpoints->breakpoints->data[i];
1184             }
1185         }
1186     }
1187 
1188     /* 2. Map all CPU breakpoints to WHPX breakpoints */
1189     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1190         int i;
1191         bool found = false;
1192 
1193         /* This will be used to detect changed CPU breakpoints later. */
1194         breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
1195 
1196         for (i = 0; i < new_breakpoints->used; i++) {
1197             /*
1198              * WARNING: This loop has O(N^2) complexity, where N is the
1199              * number of breakpoints. It should not be a bottleneck in
1200              * real-world scenarios, since it only needs to run once after
1201              * the breakpoints have been modified.
1202              * If this ever becomes a concern, it can be optimized by storing
1203              * high-level breakpoint objects in a tree or hash map.
1204              */
1205 
1206             if (new_breakpoints->data[i].address == bp->pc) {
1207                 /* There was already a breakpoint at this address. */
1208                 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
1209                     new_breakpoints->data[i].state = WHPX_BP_SET;
1210                 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
1211                     new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
1212                 }
1213 
1214                 found = true;
1215                 break;
1216             }
1217         }
1218 
1219         if (!found && new_breakpoints->used < new_breakpoints->allocated) {
1220             /* No WHPX breakpoint at this address. Create one. */
1221             new_breakpoints->data[new_breakpoints->used].address = bp->pc;
1222             new_breakpoints->data[new_breakpoints->used].state =
1223                 WHPX_BP_SET_PENDING;
1224             new_breakpoints->used++;
1225         }
1226     }
1227 
1228     if (breakpoints->breakpoints) {
1229         /*
1230          * Free the previous breakpoint list. This can be optimized by keeping
1231          * it as shadow buffer for the next computation instead of freeing
1232          * it immediately.
1233          */
1234         g_free(breakpoints->breakpoints);
1235     }
1236 
1237     breakpoints->breakpoints = new_breakpoints;
1238 }
1239 
1240 /*
1241  * Physically inserts/removes the breakpoints by reading and writing the
1242  * physical memory, keeping a track of the failed attempts.
1243  *
1244  * Passing resuming=true  will try to set all previously unset breakpoints.
1245  * Passing resuming=false will remove all inserted ones.
1246  */
1247 static void whpx_apply_breakpoints(
1248     struct whpx_breakpoint_collection *breakpoints,
1249     CPUState *cpu,
1250     bool resuming)
1251 {
1252     int i, rc;
1253     if (!breakpoints) {
1254         return;
1255     }
1256 
1257     for (i = 0; i < breakpoints->used; i++) {
1258         /* Decide what to do right now based on the last known state. */
1259         WhpxBreakpointState state = breakpoints->data[i].state;
1260         switch (state) {
1261         case WHPX_BP_CLEARED:
1262             if (resuming) {
1263                 state = WHPX_BP_SET_PENDING;
1264             }
1265             break;
1266         case WHPX_BP_SET_PENDING:
1267             if (!resuming) {
1268                 state = WHPX_BP_CLEARED;
1269             }
1270             break;
1271         case WHPX_BP_SET:
1272             if (!resuming) {
1273                 state = WHPX_BP_CLEAR_PENDING;
1274             }
1275             break;
1276         case WHPX_BP_CLEAR_PENDING:
1277             if (resuming) {
1278                 state = WHPX_BP_SET;
1279             }
1280             break;
1281         }
1282 
1283         if (state == WHPX_BP_SET_PENDING) {
1284             /* Remember the original instruction. */
1285             rc = cpu_memory_rw_debug(cpu,
1286                 breakpoints->data[i].address,
1287                 &breakpoints->data[i].original_instruction,
1288                 1,
1289                 false);
1290 
1291             if (!rc) {
1292                 /* Write the breakpoint instruction. */
1293                 rc = cpu_memory_rw_debug(cpu,
1294                     breakpoints->data[i].address,
1295                     (void *)&whpx_breakpoint_instruction,
1296                     1,
1297                     true);
1298             }
1299 
1300             if (!rc) {
1301                 state = WHPX_BP_SET;
1302             }
1303 
1304         }
1305 
1306         if (state == WHPX_BP_CLEAR_PENDING) {
1307             /* Restore the original instruction. */
1308             rc = cpu_memory_rw_debug(cpu,
1309                 breakpoints->data[i].address,
1310                 &breakpoints->data[i].original_instruction,
1311                 1,
1312                 true);
1313 
1314             if (!rc) {
1315                 state = WHPX_BP_CLEARED;
1316             }
1317         }
1318 
1319         breakpoints->data[i].state = state;
1320     }
1321 }
1322 
1323 /*
1324  * This function is called when the a VCPU is about to start and no other
1325  * VCPUs have been started so far. Since the VCPU start order could be
1326  * arbitrary, it doesn't have to be VCPU#0.
1327  *
1328  * It is used to commit the breakpoints into memory, and configure WHPX
1329  * to intercept debug exceptions.
1330  *
1331  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1332  * more VCPUs are already running, so this is the best place to do it.
1333  */
1334 static int whpx_first_vcpu_starting(CPUState *cpu)
1335 {
1336     struct whpx_state *whpx = &whpx_global;
1337     HRESULT hr;
1338 
1339     g_assert(qemu_mutex_iothread_locked());
1340 
1341     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1342             (whpx->breakpoints.breakpoints &&
1343              whpx->breakpoints.breakpoints->used)) {
1344         CPUBreakpoint *bp;
1345         int i = 0;
1346         bool update_pending = false;
1347 
1348         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1349             if (i >= whpx->breakpoints.original_address_count ||
1350                 bp->pc != whpx->breakpoints.original_addresses[i]) {
1351                 update_pending = true;
1352             }
1353 
1354             i++;
1355         }
1356 
1357         if (i != whpx->breakpoints.original_address_count) {
1358             update_pending = true;
1359         }
1360 
1361         if (update_pending) {
1362             /*
1363              * The CPU breakpoints have changed since the last call to
1364              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1365              * now be recomputed.
1366              */
1367             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1368         }
1369 
1370         /* Actually insert the breakpoints into the memory. */
1371         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1372     }
1373 
1374     uint64_t exception_mask;
1375     if (whpx->step_pending ||
1376         (whpx->breakpoints.breakpoints &&
1377          whpx->breakpoints.breakpoints->used)) {
1378         /*
1379          * We are either attempting to single-step one or more CPUs, or
1380          * have one or more breakpoints enabled. Both require intercepting
1381          * the WHvX64ExceptionTypeBreakpointTrap exception.
1382          */
1383 
1384         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1385     } else {
1386         /* Let the guest handle all exceptions. */
1387         exception_mask = 0;
1388     }
1389 
1390     hr = whpx_set_exception_exit_bitmap(exception_mask);
1391     if (!SUCCEEDED(hr)) {
1392         error_report("WHPX: Failed to update exception exit mask,"
1393                      "hr=%08lx.", hr);
1394         return 1;
1395     }
1396 
1397     return 0;
1398 }
1399 
1400 /*
1401  * This function is called when the last VCPU has finished running.
1402  * It is used to remove any previously set breakpoints from memory.
1403  */
1404 static int whpx_last_vcpu_stopping(CPUState *cpu)
1405 {
1406     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1407     return 0;
1408 }
1409 
/* Returns the address of the next instruction that is about to be executed. */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
    if (cpu->vcpu_dirty) {
        /* The CPU registers have been modified by other parts of QEMU. */
        CPUArchState *env = (CPUArchState *)(cpu->env_ptr);
        return env->eip;
    } else if (exit_context_valid) {
        /*
         * The CPU registers have been modified neither by other parts
         * of QEMU, nor by this port via WHvSetVirtualProcessorRegisters().
         * This is the most common case.
         */
        struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
        return vcpu->exit_ctx.VpContext.Rip;
    } else {
        /*
         * The CPU registers have been modified by a call to
         * WHvSetVirtualProcessorRegisters() and must be re-queried from
         * the target.
         */
        WHV_REGISTER_VALUE reg_value;
        WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
        HRESULT hr;
        struct whpx_state *whpx = &whpx_global;

        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            /* NOTE(review): returns 0 on failure — callers must tolerate it. */
            error_report("WHPX: Failed to get PC, hr=%08lx", hr);
            return 0;
        }

        return reg_value.Reg64;
    }
}
1451 
1452 static int whpx_handle_halt(CPUState *cpu)
1453 {
1454     CPUX86State *env = cpu->env_ptr;
1455     int ret = 0;
1456 
1457     qemu_mutex_lock_iothread();
1458     if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1459           (env->eflags & IF_MASK)) &&
1460         !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1461         cpu->exception_index = EXCP_HLT;
1462         cpu->halted = true;
1463         ret = 1;
1464     }
1465     qemu_mutex_unlock_iothread();
1466 
1467     return ret;
1468 }
1469 
/*
 * Prepares the VP before the next call to WHvRunVirtualProcessor():
 * injects a pending NMI or PIC interrupt, forces an exit for INIT/TPR
 * processing, syncs the TPR into CR8, and registers an interrupt-window
 * notification when an interrupt is pending but not yet deliverable.
 * Up to three registers are then written in a single batched call.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    CPUX86State *env = cpu->env_ptr;
    X86CPU *x86_cpu = X86_CPU(cpu);
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    qemu_mutex_lock_iothread();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        /* NOTE(review): SMI is acknowledged here but never injected. */
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        /* QEMU emulates the APIC: pull the vector from the PIC ourselves. */
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        /* In-platform APIC: deliver the PIC vector as an ExtInt event. */
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
     }

    /* Sync the TPR to the CR8 if it was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    qemu_mutex_unlock_iothread();
    vcpu->ready_for_pic_interrupt = false;

    /* Commit all prepared register updates in one batched hypercall. */
    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }

    return;
}
1593 
1594 static void whpx_vcpu_post_run(CPUState *cpu)
1595 {
1596     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1597     CPUX86State *env = cpu->env_ptr;
1598     X86CPU *x86_cpu = X86_CPU(cpu);
1599 
1600     env->eflags = vcpu->exit_ctx.VpContext.Rflags;
1601 
1602     uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
1603     if (vcpu->tpr != tpr) {
1604         vcpu->tpr = tpr;
1605         qemu_mutex_lock_iothread();
1606         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
1607         qemu_mutex_unlock_iothread();
1608     }
1609 
1610     vcpu->interruption_pending =
1611         vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
1612 
1613     vcpu->interruptable =
1614         !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
1615 
1616     return;
1617 }
1618 
1619 static void whpx_vcpu_process_async_events(CPUState *cpu)
1620 {
1621     CPUX86State *env = cpu->env_ptr;
1622     X86CPU *x86_cpu = X86_CPU(cpu);
1623     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1624 
1625     if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1626         !(env->hflags & HF_SMM_MASK)) {
1627         whpx_cpu_synchronize_state(cpu);
1628         do_cpu_init(x86_cpu);
1629         vcpu->interruptable = true;
1630     }
1631 
1632     if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
1633         cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
1634         apic_poll_irq(x86_cpu->apic_state);
1635     }
1636 
1637     if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1638          (env->eflags & IF_MASK)) ||
1639         (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1640         cpu->halted = false;
1641     }
1642 
1643     if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
1644         whpx_cpu_synchronize_state(cpu);
1645         do_cpu_sipi(x86_cpu);
1646     }
1647 
1648     if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1649         cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
1650         whpx_cpu_synchronize_state(cpu);
1651         apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
1652                                       env->tpr_access_type);
1653     }
1654 
1655     return;
1656 }
1657 
1658 static int whpx_vcpu_run(CPUState *cpu)
1659 {
1660     HRESULT hr;
1661     struct whpx_state *whpx = &whpx_global;
1662     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1663     struct whpx_breakpoint *stepped_over_bp = NULL;
1664     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1665     int ret;
1666 
1667     g_assert(qemu_mutex_iothread_locked());
1668 
1669     if (whpx->running_cpus++ == 0) {
1670         /* Insert breakpoints into memory, update exception exit bitmap. */
1671         ret = whpx_first_vcpu_starting(cpu);
1672         if (ret != 0) {
1673             return ret;
1674         }
1675     }
1676 
1677     if (whpx->breakpoints.breakpoints &&
1678         whpx->breakpoints.breakpoints->used > 0)
1679     {
1680         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1681         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1682         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1683             stepped_over_bp = NULL;
1684         }
1685 
1686         if (stepped_over_bp) {
1687             /*
1688              * We are trying to run the instruction overwritten by an active
1689              * breakpoint. We will temporarily disable the breakpoint, suspend
1690              * other CPUs, and step over the instruction.
1691              */
1692             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1693         }
1694     }
1695 
1696     if (exclusive_step_mode == WHPX_STEP_NONE) {
1697         whpx_vcpu_process_async_events(cpu);
1698         if (cpu->halted && !whpx_apic_in_platform()) {
1699             cpu->exception_index = EXCP_HLT;
1700             qatomic_set(&cpu->exit_request, false);
1701             return 0;
1702         }
1703     }
1704 
1705     qemu_mutex_unlock_iothread();
1706 
1707     if (exclusive_step_mode != WHPX_STEP_NONE) {
1708         start_exclusive();
1709         g_assert(cpu == current_cpu);
1710         g_assert(!cpu->running);
1711         cpu->running = true;
1712 
1713         hr = whpx_set_exception_exit_bitmap(
1714             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1715         if (!SUCCEEDED(hr)) {
1716             error_report("WHPX: Failed to update exception exit mask, "
1717                          "hr=%08lx.", hr);
1718             return 1;
1719         }
1720 
1721         if (stepped_over_bp) {
1722             /* Temporarily disable the triggered breakpoint. */
1723             cpu_memory_rw_debug(cpu,
1724                 stepped_over_bp->address,
1725                 &stepped_over_bp->original_instruction,
1726                 1,
1727                 true);
1728         }
1729     } else {
1730         cpu_exec_start(cpu);
1731     }
1732 
1733     do {
1734         if (cpu->vcpu_dirty) {
1735             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1736             cpu->vcpu_dirty = false;
1737         }
1738 
1739         if (exclusive_step_mode == WHPX_STEP_NONE) {
1740             whpx_vcpu_pre_run(cpu);
1741 
1742             if (qatomic_read(&cpu->exit_request)) {
1743                 whpx_vcpu_kick(cpu);
1744             }
1745         }
1746 
1747         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1748             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1749         }
1750 
1751         hr = whp_dispatch.WHvRunVirtualProcessor(
1752             whpx->partition, cpu->cpu_index,
1753             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1754 
1755         if (FAILED(hr)) {
1756             error_report("WHPX: Failed to exec a virtual processor,"
1757                          " hr=%08lx", hr);
1758             ret = -1;
1759             break;
1760         }
1761 
1762         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1763             whpx_vcpu_configure_single_stepping(cpu,
1764                 false,
1765                 &vcpu->exit_ctx.VpContext.Rflags);
1766         }
1767 
1768         whpx_vcpu_post_run(cpu);
1769 
1770         switch (vcpu->exit_ctx.ExitReason) {
1771         case WHvRunVpExitReasonMemoryAccess:
1772             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1773             break;
1774 
1775         case WHvRunVpExitReasonX64IoPortAccess:
1776             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1777             break;
1778 
1779         case WHvRunVpExitReasonX64InterruptWindow:
1780             vcpu->ready_for_pic_interrupt = 1;
1781             vcpu->window_registered = 0;
1782             ret = 0;
1783             break;
1784 
1785         case WHvRunVpExitReasonX64ApicEoi:
1786             assert(whpx_apic_in_platform());
1787             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1788             break;
1789 
1790         case WHvRunVpExitReasonX64Halt:
1791             /*
1792              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1793              * longer used.
1794              */
1795             ret = whpx_handle_halt(cpu);
1796             break;
1797 
1798         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1799             WHV_INTERRUPT_CONTROL ipi = {0};
1800             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1801             uint32_t delivery_mode =
1802                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1803             int dest_shorthand =
1804                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1805             bool broadcast = false;
1806             bool include_self = false;
1807             uint32_t i;
1808 
1809             /* We only registered for INIT and SIPI exits. */
1810             if ((delivery_mode != APIC_DM_INIT) &&
1811                 (delivery_mode != APIC_DM_SIPI)) {
1812                 error_report(
1813                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1814                 break;
1815             }
1816 
1817             if (delivery_mode == APIC_DM_INIT) {
1818                 ipi.Type = WHvX64InterruptTypeInit;
1819             } else {
1820                 ipi.Type = WHvX64InterruptTypeSipi;
1821             }
1822 
1823             ipi.DestinationMode =
1824                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1825                     WHvX64InterruptDestinationModeLogical :
1826                     WHvX64InterruptDestinationModePhysical;
1827 
1828             ipi.TriggerMode =
1829                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1830                     WHvX64InterruptTriggerModeLevel :
1831                     WHvX64InterruptTriggerModeEdge;
1832 
1833             ipi.Vector = icr & APIC_VECTOR_MASK;
1834             switch (dest_shorthand) {
1835             /* no shorthand. Bits 56-63 contain the destination. */
1836             case 0:
1837                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1838                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1839                         &ipi, sizeof(ipi));
1840                 if (FAILED(hr)) {
1841                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1842                         hr);
1843                 }
1844 
1845                 break;
1846 
1847             /* self */
1848             case 1:
1849                 include_self = true;
1850                 break;
1851 
1852             /* broadcast, including self */
1853             case 2:
1854                 broadcast = true;
1855                 include_self = true;
1856                 break;
1857 
1858             /* broadcast, excluding self */
1859             case 3:
1860                 broadcast = true;
1861                 break;
1862             }
1863 
1864             if (!broadcast && !include_self) {
1865                 break;
1866             }
1867 
1868             for (i = 0; i <= max_vcpu_index; i++) {
1869                 if (i == cpu->cpu_index && !include_self) {
1870                     continue;
1871                 }
1872 
1873                 /*
1874                  * Assuming that APIC Ids are identity mapped since
1875                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1876                  * are not handled yet and the hypervisor doesn't allow the
1877                  * guest to modify the APIC ID.
1878                  */
1879                 ipi.Destination = i;
1880                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1881                         &ipi, sizeof(ipi));
1882                 if (FAILED(hr)) {
1883                     error_report(
1884                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1885                         i, hr);
1886                 }
1887             }
1888 
1889             break;
1890         }
1891 
1892         case WHvRunVpExitReasonCanceled:
1893             if (exclusive_step_mode != WHPX_STEP_NONE) {
1894                 /*
1895                  * We are trying to step over a single instruction, and
1896                  * likely got a request to stop from another thread.
1897                  * Delay it until we are done stepping
1898                  * over.
1899                  */
1900                 ret = 0;
1901             } else {
1902                 cpu->exception_index = EXCP_INTERRUPT;
1903                 ret = 1;
1904             }
1905             break;
1906         case WHvRunVpExitReasonX64MsrAccess: {
1907             WHV_REGISTER_VALUE reg_values[3] = {0};
1908             WHV_REGISTER_NAME reg_names[3];
1909             UINT32 reg_count;
1910 
1911             reg_names[0] = WHvX64RegisterRip;
1912             reg_names[1] = WHvX64RegisterRax;
1913             reg_names[2] = WHvX64RegisterRdx;
1914 
1915             reg_values[0].Reg64 =
1916                 vcpu->exit_ctx.VpContext.Rip +
1917                 vcpu->exit_ctx.VpContext.InstructionLength;
1918 
1919             /*
1920              * For all unsupported MSR access we:
1921              *     ignore writes
1922              *     return 0 on read.
1923              */
1924             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1925                         1 : 3;
1926 
1927             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1928                 whpx->partition,
1929                 cpu->cpu_index,
1930                 reg_names, reg_count,
1931                 reg_values);
1932 
1933             if (FAILED(hr)) {
1934                 error_report("WHPX: Failed to set MsrAccess state "
1935                              " registers, hr=%08lx", hr);
1936             }
1937             ret = 0;
1938             break;
1939         }
1940         case WHvRunVpExitReasonX64Cpuid: {
1941             WHV_REGISTER_VALUE reg_values[5];
1942             WHV_REGISTER_NAME reg_names[5];
1943             UINT32 reg_count = 5;
1944             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1945             X86CPU *x86_cpu = X86_CPU(cpu);
1946             CPUX86State *env = &x86_cpu->env;
1947 
1948             memset(reg_values, 0, sizeof(reg_values));
1949 
1950             rip = vcpu->exit_ctx.VpContext.Rip +
1951                   vcpu->exit_ctx.VpContext.InstructionLength;
1952             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1953 
1954             /*
1955              * Ideally, these should be supplied to the hypervisor during VCPU
1956              * initialization and it should be able to satisfy this request.
1957              * But, currently, WHPX doesn't support setting CPUID values in the
1958              * hypervisor once the partition has been setup, which is too late
1959              * since VCPUs are realized later. For now, use the values from
1960              * QEMU to satisfy these requests, until WHPX adds support for
1961              * being able to set these values in the hypervisor at runtime.
1962              */
1963             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1964                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1965             switch (cpuid_fn) {
1966             case 0x40000000:
1967                 /* Expose the vmware cpu frequency cpuid leaf */
1968                 rax = 0x40000010;
1969                 rbx = rcx = rdx = 0;
1970                 break;
1971 
1972             case 0x40000010:
1973                 rax = env->tsc_khz;
1974                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1975                 rcx = rdx = 0;
1976                 break;
1977 
1978             case 0x80000001:
1979                 /* Remove any support of OSVW */
1980                 rcx &= ~CPUID_EXT3_OSVW;
1981                 break;
1982             }
1983 
1984             reg_names[0] = WHvX64RegisterRip;
1985             reg_names[1] = WHvX64RegisterRax;
1986             reg_names[2] = WHvX64RegisterRcx;
1987             reg_names[3] = WHvX64RegisterRdx;
1988             reg_names[4] = WHvX64RegisterRbx;
1989 
1990             reg_values[0].Reg64 = rip;
1991             reg_values[1].Reg64 = rax;
1992             reg_values[2].Reg64 = rcx;
1993             reg_values[3].Reg64 = rdx;
1994             reg_values[4].Reg64 = rbx;
1995 
1996             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1997                 whpx->partition, cpu->cpu_index,
1998                 reg_names,
1999                 reg_count,
2000                 reg_values);
2001 
2002             if (FAILED(hr)) {
2003                 error_report("WHPX: Failed to set CpuidAccess state registers,"
2004                              " hr=%08lx", hr);
2005             }
2006             ret = 0;
2007             break;
2008         }
2009         case WHvRunVpExitReasonException:
2010             whpx_get_registers(cpu);
2011 
2012             if ((vcpu->exit_ctx.VpException.ExceptionType ==
2013                  WHvX64ExceptionTypeDebugTrapOrFault) &&
2014                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
2015                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
2016                  whpx_breakpoint_instruction)) {
2017                 /* Stopped at a software breakpoint. */
2018                 cpu->exception_index = EXCP_DEBUG;
2019             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
2020                         WHvX64ExceptionTypeDebugTrapOrFault) &&
2021                        !cpu->singlestep_enabled) {
2022                 /*
2023                  * Just finished stepping over a breakpoint, but the
2024                  * gdb does not expect us to do single-stepping.
2025                  * Don't do anything special.
2026                  */
2027                 cpu->exception_index = EXCP_INTERRUPT;
2028             } else {
2029                 /* Another exception or debug event. Report it to GDB. */
2030                 cpu->exception_index = EXCP_DEBUG;
2031             }
2032 
2033             ret = 1;
2034             break;
2035         case WHvRunVpExitReasonNone:
2036         case WHvRunVpExitReasonUnrecoverableException:
2037         case WHvRunVpExitReasonInvalidVpRegisterValue:
2038         case WHvRunVpExitReasonUnsupportedFeature:
2039         default:
2040             error_report("WHPX: Unexpected VP exit code %d",
2041                          vcpu->exit_ctx.ExitReason);
2042             whpx_get_registers(cpu);
2043             qemu_mutex_lock_iothread();
2044             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2045             qemu_mutex_unlock_iothread();
2046             break;
2047         }
2048 
2049     } while (!ret);
2050 
2051     if (stepped_over_bp) {
2052         /* Restore the breakpoint we stepped over */
2053         cpu_memory_rw_debug(cpu,
2054             stepped_over_bp->address,
2055             (void *)&whpx_breakpoint_instruction,
2056             1,
2057             true);
2058     }
2059 
2060     if (exclusive_step_mode != WHPX_STEP_NONE) {
2061         g_assert(cpu_in_exclusive_context(cpu));
2062         cpu->running = false;
2063         end_exclusive();
2064 
2065         exclusive_step_mode = WHPX_STEP_NONE;
2066     } else {
2067         cpu_exec_end(cpu);
2068     }
2069 
2070     qemu_mutex_lock_iothread();
2071     current_cpu = cpu;
2072 
2073     if (--whpx->running_cpus == 0) {
2074         whpx_last_vcpu_stopping(cpu);
2075     }
2076 
2077     qatomic_set(&cpu->exit_request, false);
2078 
2079     return ret < 0;
2080 }
2081 
2082 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2083 {
2084     if (!cpu->vcpu_dirty) {
2085         whpx_get_registers(cpu);
2086         cpu->vcpu_dirty = true;
2087     }
2088 }
2089 
/*
 * Push the post-reset register state to the hypervisor and mark QEMU's
 * cached copy as clean.
 */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->vcpu_dirty = false;
}
2096 
/*
 * Push the complete register state to the hypervisor after vcpu init and
 * mark QEMU's cached copy as clean.
 */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->vcpu_dirty = false;
}
2103 
/*
 * Mark the cached register state dirty so that the state QEMU is about to
 * load from a snapshot gets pushed to the hypervisor before the next run.
 */
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
2109 
2110 /*
2111  * CPU support.
2112  */
2113 
2114 void whpx_cpu_synchronize_state(CPUState *cpu)
2115 {
2116     if (!cpu->vcpu_dirty) {
2117         run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
2118     }
2119 }
2120 
/* Write the reset register state to the hypervisor on the vcpu thread. */
void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
2125 
/* Write the full register state to the hypervisor on the vcpu thread. */
void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
2130 
/* Invalidate the cached register state ahead of loading a VM snapshot. */
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
2135 
/*
 * Record whether the debugger expects single-stepping on the upcoming
 * resume; consumed later via whpx_global.step_pending.
 */
void whpx_cpu_synchronize_pre_resume(bool step_pending)
{
    whpx_global.step_pending = step_pending;
}
2140 
2141 /*
2142  * Vcpu support.
2143  */
2144 
2145 static Error *whpx_migration_blocker;
2146 
2147 static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
2148 {
2149     CPUX86State *env = opaque;
2150 
2151     if (running) {
2152         env->tsc_valid = false;
2153     }
2154 }
2155 
2156 int whpx_init_vcpu(CPUState *cpu)
2157 {
2158     HRESULT hr;
2159     struct whpx_state *whpx = &whpx_global;
2160     struct whpx_vcpu *vcpu = NULL;
2161     Error *local_error = NULL;
2162     CPUX86State *env = cpu->env_ptr;
2163     X86CPU *x86_cpu = X86_CPU(cpu);
2164     UINT64 freq = 0;
2165     int ret;
2166 
2167     /* Add migration blockers for all unsupported features of the
2168      * Windows Hypervisor Platform
2169      */
2170     if (whpx_migration_blocker == NULL) {
2171         error_setg(&whpx_migration_blocker,
2172                "State blocked due to non-migratable CPUID feature support,"
2173                "dirty memory tracking support, and XSAVE/XRSTOR support");
2174 
2175         if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) {
2176             error_report_err(local_error);
2177             error_free(whpx_migration_blocker);
2178             ret = -EINVAL;
2179             goto error;
2180         }
2181     }
2182 
2183     vcpu = g_new0(struct whpx_vcpu, 1);
2184 
2185     if (!vcpu) {
2186         error_report("WHPX: Failed to allocte VCPU context.");
2187         ret = -ENOMEM;
2188         goto error;
2189     }
2190 
2191     hr = whp_dispatch.WHvEmulatorCreateEmulator(
2192         &whpx_emu_callbacks,
2193         &vcpu->emulator);
2194     if (FAILED(hr)) {
2195         error_report("WHPX: Failed to setup instruction completion support,"
2196                      " hr=%08lx", hr);
2197         ret = -EINVAL;
2198         goto error;
2199     }
2200 
2201     hr = whp_dispatch.WHvCreateVirtualProcessor(
2202         whpx->partition, cpu->cpu_index, 0);
2203     if (FAILED(hr)) {
2204         error_report("WHPX: Failed to create a virtual processor,"
2205                      " hr=%08lx", hr);
2206         whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2207         ret = -EINVAL;
2208         goto error;
2209     }
2210 
2211     /*
2212      * vcpu's TSC frequency is either specified by user, or use the value
2213      * provided by Hyper-V if the former is not present. In the latter case, we
2214      * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
2215      * frequency can be migrated later via this field.
2216      */
2217     if (!env->tsc_khz) {
2218         hr = whp_dispatch.WHvGetCapability(
2219             WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
2220                 NULL);
2221         if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2222             if (FAILED(hr)) {
2223                 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
2224             } else {
2225                 env->tsc_khz = freq / 1000; /* Hz to KHz */
2226             }
2227         }
2228     }
2229 
2230     env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
2231     hr = whp_dispatch.WHvGetCapability(
2232         WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
2233     if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2234         if (FAILED(hr)) {
2235             printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
2236         } else {
2237             env->apic_bus_freq = freq;
2238         }
2239     }
2240 
2241     /*
2242      * If the vmware cpuid frequency leaf option is set, and we have a valid
2243      * tsc value, trap the corresponding cpuid's.
2244      */
2245     if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
2246         UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
2247 
2248         hr = whp_dispatch.WHvSetPartitionProperty(
2249                 whpx->partition,
2250                 WHvPartitionPropertyCodeCpuidExitList,
2251                 cpuidExitList,
2252                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2253 
2254         if (FAILED(hr)) {
2255             error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2256                         hr);
2257             ret = -EINVAL;
2258             goto error;
2259         }
2260     }
2261 
2262     vcpu->interruptable = true;
2263     cpu->vcpu_dirty = true;
2264     cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu;
2265     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
2266     qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr);
2267 
2268     return 0;
2269 
2270 error:
2271     g_free(vcpu);
2272 
2273     return ret;
2274 }
2275 
2276 int whpx_vcpu_exec(CPUState *cpu)
2277 {
2278     int ret;
2279     int fatal;
2280 
2281     for (;;) {
2282         if (cpu->exception_index >= EXCP_INTERRUPT) {
2283             ret = cpu->exception_index;
2284             cpu->exception_index = -1;
2285             break;
2286         }
2287 
2288         fatal = whpx_vcpu_run(cpu);
2289 
2290         if (fatal) {
2291             error_report("WHPX: Failed to exec a virtual processor");
2292             abort();
2293         }
2294     }
2295 
2296     return ret;
2297 }
2298 
2299 void whpx_destroy_vcpu(CPUState *cpu)
2300 {
2301     struct whpx_state *whpx = &whpx_global;
2302     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
2303 
2304     whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
2305     whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2306     g_free(cpu->hax_vcpu);
2307     return;
2308 }
2309 
/*
 * Force the vcpu out of WHvRunVirtualProcessor() so pending work (such as
 * an exit request) can be serviced.
 */
void whpx_vcpu_kick(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    whp_dispatch.WHvCancelRunVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
}
2316 
2317 /*
2318  * Memory support.
2319  */
2320 
2321 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
2322                                 void *host_va, int add, int rom,
2323                                 const char *name)
2324 {
2325     struct whpx_state *whpx = &whpx_global;
2326     HRESULT hr;
2327 
2328     /*
2329     if (add) {
2330         printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2331                (void*)start_pa, (void*)size, host_va,
2332                (rom ? "ROM" : "RAM"), name);
2333     } else {
2334         printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
2335                (void*)start_pa, (void*)size, host_va, name);
2336     }
2337     */
2338 
2339     if (add) {
2340         hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
2341                                          host_va,
2342                                          start_pa,
2343                                          size,
2344                                          (WHvMapGpaRangeFlagRead |
2345                                           WHvMapGpaRangeFlagExecute |
2346                                           (rom ? 0 : WHvMapGpaRangeFlagWrite)));
2347     } else {
2348         hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
2349                                            start_pa,
2350                                            size);
2351     }
2352 
2353     if (FAILED(hr)) {
2354         error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2355                      " Host:%p, hr=%08lx",
2356                      (add ? "MAP" : "UNMAP"), name,
2357                      (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
2358     }
2359 }
2360 
2361 static void whpx_process_section(MemoryRegionSection *section, int add)
2362 {
2363     MemoryRegion *mr = section->mr;
2364     hwaddr start_pa = section->offset_within_address_space;
2365     ram_addr_t size = int128_get64(section->size);
2366     unsigned int delta;
2367     uint64_t host_va;
2368 
2369     if (!memory_region_is_ram(mr)) {
2370         return;
2371     }
2372 
2373     delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
2374     delta &= ~qemu_real_host_page_mask();
2375     if (delta > size) {
2376         return;
2377     }
2378     start_pa += delta;
2379     size -= delta;
2380     size &= qemu_real_host_page_mask();
2381     if (!size || (start_pa & ~qemu_real_host_page_mask())) {
2382         return;
2383     }
2384 
2385     host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
2386             + section->offset_within_region + delta;
2387 
2388     whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
2389                         memory_region_is_rom(mr), mr->name);
2390 }
2391 
/* MemoryListener callback: map a newly added section into the partition. */
static void whpx_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}
2398 
/* MemoryListener callback: unmap a removed section and drop its reference. */
static void whpx_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}
2405 
/* MemoryListener callback: no work needed at transaction start. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}
2409 
/* MemoryListener callback: no work needed at transaction commit. */
static void whpx_transaction_commit(MemoryListener *listener)
{
}
2413 
2414 static void whpx_log_sync(MemoryListener *listener,
2415                          MemoryRegionSection *section)
2416 {
2417     MemoryRegion *mr = section->mr;
2418 
2419     if (!memory_region_is_ram(mr)) {
2420         return;
2421     }
2422 
2423     memory_region_set_dirty(mr, 0, int128_get64(section->size));
2424 }
2425 
/* Listener wiring address-space changes into WHPX GPA mappings. */
static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = 10,
};
2435 
/* Register the WHPX memory listener on the system address space. */
static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}
2440 
2441 /*
2442  * Load the functions from the given library, using the given handle. If a
2443  * handle is provided, it is used, otherwise the library is opened. The
2444  * handle will be updated on return with the opened one.
2445  */
/*
 * Load the functions for the requested @function_list into whp_dispatch,
 * using the given library handle.  If *handle is NULL the library is
 * opened here and *handle is updated with the opened handle on success.
 * Returns true on success, false if the library or a mandatory function
 * could not be loaded.
 */
static bool load_whp_dispatch_fns(HMODULE *handle,
    WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

    #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
    #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
    /* Optional entry point: a NULL result is tolerated by callers. */
    #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \

    /* Mandatory entry point: failure to resolve aborts the whole load. */
    #define WHP_LOAD_FIELD(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \
        if (!whp_dispatch.function_name) { \
            error_report("Could not load function %s", #function_name); \
            goto error; \
        } \

    /* Open the DLL only if the caller did not pass an existing handle. */
    #define WHP_LOAD_LIB(lib_name, handle_lib) \
    if (!handle_lib) { \
        handle_lib = LoadLibrary(lib_name); \
        if (!handle_lib) { \
            error_report("Could not load library %s.", lib_name); \
            goto error; \
        } \
    } \

    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    /*
     * NOTE(review): on failure the library is freed but *handle is not
     * cleared; if the caller passed in an already-open handle it now
     * dangles.  Confirm all callers treat a false return as fatal.
     */
    if (hLib) {
        FreeLibrary(hLib);
    }

    return false;
}
2501 
2502 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2503                                    const char *name, void *opaque,
2504                                    Error **errp)
2505 {
2506     struct whpx_state *whpx = &whpx_global;
2507     OnOffSplit mode;
2508 
2509     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2510         return;
2511     }
2512 
2513     switch (mode) {
2514     case ON_OFF_SPLIT_ON:
2515         whpx->kernel_irqchip_allowed = true;
2516         whpx->kernel_irqchip_required = true;
2517         break;
2518 
2519     case ON_OFF_SPLIT_OFF:
2520         whpx->kernel_irqchip_allowed = false;
2521         whpx->kernel_irqchip_required = false;
2522         break;
2523 
2524     case ON_OFF_SPLIT_SPLIT:
2525         error_setg(errp, "WHPX: split irqchip currently not supported");
2526         error_append_hint(errp,
2527             "Try without kernel-irqchip or with kernel-irqchip=on|off");
2528         break;
2529 
2530     default:
2531         /*
2532          * The value was checked in visit_type_OnOffSplit() above. If
2533          * we get here, then something is wrong in QEMU.
2534          */
2535         abort();
2536     }
2537 }
2538 
2539 /*
2540  * Partition support
2541  */
2542 
2543 static int whpx_accel_init(MachineState *ms)
2544 {
2545     struct whpx_state *whpx;
2546     int ret;
2547     HRESULT hr;
2548     WHV_CAPABILITY whpx_cap;
2549     UINT32 whpx_cap_size;
2550     WHV_PARTITION_PROPERTY prop;
2551     UINT32 cpuidExitList[] = {1, 0x80000001};
2552     WHV_CAPABILITY_FEATURES features = {0};
2553 
2554     whpx = &whpx_global;
2555 
2556     if (!init_whp_dispatch()) {
2557         ret = -ENOSYS;
2558         goto error;
2559     }
2560 
2561     whpx->mem_quota = ms->ram_size;
2562 
2563     hr = whp_dispatch.WHvGetCapability(
2564         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2565         sizeof(whpx_cap), &whpx_cap_size);
2566     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2567         error_report("WHPX: No accelerator found, hr=%08lx", hr);
2568         ret = -ENOSPC;
2569         goto error;
2570     }
2571 
2572     hr = whp_dispatch.WHvGetCapability(
2573         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2574     if (FAILED(hr)) {
2575         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2576         ret = -EINVAL;
2577         goto error;
2578     }
2579 
2580     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2581     if (FAILED(hr)) {
2582         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2583         ret = -EINVAL;
2584         goto error;
2585     }
2586 
2587     /*
2588      * Query the XSAVE capability of the partition. Any error here is not
2589      * considered fatal.
2590      */
2591     hr = whp_dispatch.WHvGetPartitionProperty(
2592         whpx->partition,
2593         WHvPartitionPropertyCodeProcessorXsaveFeatures,
2594         &whpx_xsave_cap,
2595         sizeof(whpx_xsave_cap),
2596         &whpx_cap_size);
2597 
2598     /*
2599      * Windows version which don't support this property will return with the
2600      * specific error code.
2601      */
2602     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2603         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2604     }
2605 
2606     if (!whpx_has_xsave()) {
2607         printf("WHPX: Partition is not XSAVE capable\n");
2608     }
2609 
2610     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2611     prop.ProcessorCount = ms->smp.cpus;
2612     hr = whp_dispatch.WHvSetPartitionProperty(
2613         whpx->partition,
2614         WHvPartitionPropertyCodeProcessorCount,
2615         &prop,
2616         sizeof(WHV_PARTITION_PROPERTY));
2617 
2618     if (FAILED(hr)) {
2619         error_report("WHPX: Failed to set partition core count to %d,"
2620                      " hr=%08lx", ms->smp.cores, hr);
2621         ret = -EINVAL;
2622         goto error;
2623     }
2624 
2625     /*
2626      * Error out if WHP doesn't support apic emulation and user is requiring
2627      * it.
2628      */
2629     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2630             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2631         error_report("WHPX: kernel irqchip requested, but unavailable. "
2632             "Try without kernel-irqchip or with kernel-irqchip=off");
2633         ret = -EINVAL;
2634         goto error;
2635     }
2636 
2637     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2638         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2639         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2640             WHvX64LocalApicEmulationModeXApic;
2641         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2642         hr = whp_dispatch.WHvSetPartitionProperty(
2643             whpx->partition,
2644             WHvPartitionPropertyCodeLocalApicEmulationMode,
2645             &mode,
2646             sizeof(mode));
2647         if (FAILED(hr)) {
2648             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2649             if (whpx->kernel_irqchip_required) {
2650                 error_report("WHPX: kernel irqchip requested, but unavailable");
2651                 ret = -EINVAL;
2652                 goto error;
2653             }
2654         } else {
2655             whpx->apic_in_platform = true;
2656         }
2657     }
2658 
2659     /* Register for MSR and CPUID exits */
2660     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2661     prop.ExtendedVmExits.X64MsrExit = 1;
2662     prop.ExtendedVmExits.X64CpuidExit = 1;
2663     prop.ExtendedVmExits.ExceptionExit = 1;
2664     if (whpx_apic_in_platform()) {
2665         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2666     }
2667 
2668     hr = whp_dispatch.WHvSetPartitionProperty(
2669             whpx->partition,
2670             WHvPartitionPropertyCodeExtendedVmExits,
2671             &prop,
2672             sizeof(WHV_PARTITION_PROPERTY));
2673     if (FAILED(hr)) {
2674         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2675         ret = -EINVAL;
2676         goto error;
2677     }
2678 
2679     hr = whp_dispatch.WHvSetPartitionProperty(
2680         whpx->partition,
2681         WHvPartitionPropertyCodeCpuidExitList,
2682         cpuidExitList,
2683         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2684 
2685     if (FAILED(hr)) {
2686         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2687                      hr);
2688         ret = -EINVAL;
2689         goto error;
2690     }
2691 
2692     /*
2693      * We do not want to intercept any exceptions from the guest,
2694      * until we actually start debugging with gdb.
2695      */
2696     whpx->exception_exit_bitmap = -1;
2697     hr = whpx_set_exception_exit_bitmap(0);
2698 
2699     if (FAILED(hr)) {
2700         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2701         ret = -EINVAL;
2702         goto error;
2703     }
2704 
2705     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2706     if (FAILED(hr)) {
2707         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2708         ret = -EINVAL;
2709         goto error;
2710     }
2711 
2712     whpx_memory_init();
2713 
2714     printf("Windows Hypervisor Platform accelerator is operational\n");
2715     return 0;
2716 
2717 error:
2718 
2719     if (NULL != whpx->partition) {
2720         whp_dispatch.WHvDeletePartition(whpx->partition);
2721         whpx->partition = NULL;
2722     }
2723 
2724     return ret;
2725 }
2726 
2727 int whpx_enabled(void)
2728 {
2729     return whpx_allowed;
2730 }
2731 
2732 bool whpx_apic_in_platform(void) {
2733     return whpx_global.apic_in_platform;
2734 }
2735 
2736 static void whpx_accel_class_init(ObjectClass *oc, void *data)
2737 {
2738     AccelClass *ac = ACCEL_CLASS(oc);
2739     ac->name = "WHPX";
2740     ac->init_machine = whpx_accel_init;
2741     ac->allowed = &whpx_allowed;
2742 
2743     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
2744         NULL, whpx_set_kernel_irqchip,
2745         NULL, NULL);
2746     object_class_property_set_description(oc, "kernel-irqchip",
2747         "Configure WHPX in-kernel irqchip");
2748 }
2749 
2750 static void whpx_accel_instance_init(Object *obj)
2751 {
2752     struct whpx_state *whpx = &whpx_global;
2753 
2754     memset(whpx, 0, sizeof(struct whpx_state));
2755     /* Turn on kernel-irqchip, by default */
2756     whpx->kernel_irqchip_allowed = true;
2757 }
2758 
2759 static const TypeInfo whpx_accel_type = {
2760     .name = ACCEL_CLASS_NAME("whpx"),
2761     .parent = TYPE_ACCEL,
2762     .instance_init = whpx_accel_instance_init,
2763     .class_init = whpx_accel_class_init,
2764 };
2765 
2766 static void whpx_type_init(void)
2767 {
2768     type_register_static(&whpx_accel_type);
2769 }
2770 
2771 bool init_whp_dispatch(void)
2772 {
2773     if (whp_dispatch_initialized) {
2774         return true;
2775     }
2776 
2777     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2778         goto error;
2779     }
2780 
2781     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2782         goto error;
2783     }
2784 
2785     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2786         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2787     whp_dispatch_initialized = true;
2788 
2789     return true;
2790 error:
2791     if (hWinHvPlatform) {
2792         FreeLibrary(hWinHvPlatform);
2793     }
2794 
2795     if (hWinHvEmulation) {
2796         FreeLibrary(hWinHvEmulation);
2797     }
2798 
2799     return false;
2800 }
2801 
/* Run whpx_type_init() during QEMU module registration at startup. */
type_init(whpx_type_init);
2803