xref: /openbmc/qemu/target/i386/whpx/whpx-all.c (revision 3dba0a33)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "exec/address-spaces.h"
14 #include "exec/ioport.h"
15 #include "exec/gdbstub.h"
16 #include "qemu/accel.h"
17 #include "sysemu/whpx.h"
18 #include "sysemu/cpus.h"
19 #include "sysemu/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/i386/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include <winerror.h>
30 
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
33 
34 #include <WinHvPlatform.h>
35 #include <WinHvEmulation.h>
36 
37 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
38 
39 static const WHV_REGISTER_NAME whpx_register_names[] = {
40 
41     /* X64 General purpose registers */
42     WHvX64RegisterRax,
43     WHvX64RegisterRcx,
44     WHvX64RegisterRdx,
45     WHvX64RegisterRbx,
46     WHvX64RegisterRsp,
47     WHvX64RegisterRbp,
48     WHvX64RegisterRsi,
49     WHvX64RegisterRdi,
50     WHvX64RegisterR8,
51     WHvX64RegisterR9,
52     WHvX64RegisterR10,
53     WHvX64RegisterR11,
54     WHvX64RegisterR12,
55     WHvX64RegisterR13,
56     WHvX64RegisterR14,
57     WHvX64RegisterR15,
58     WHvX64RegisterRip,
59     WHvX64RegisterRflags,
60 
61     /* X64 Segment registers */
62     WHvX64RegisterEs,
63     WHvX64RegisterCs,
64     WHvX64RegisterSs,
65     WHvX64RegisterDs,
66     WHvX64RegisterFs,
67     WHvX64RegisterGs,
68     WHvX64RegisterLdtr,
69     WHvX64RegisterTr,
70 
71     /* X64 Table registers */
72     WHvX64RegisterIdtr,
73     WHvX64RegisterGdtr,
74 
75     /* X64 Control Registers */
76     WHvX64RegisterCr0,
77     WHvX64RegisterCr2,
78     WHvX64RegisterCr3,
79     WHvX64RegisterCr4,
80     WHvX64RegisterCr8,
81 
82     /* X64 Debug Registers */
83     /*
84      * WHvX64RegisterDr0,
85      * WHvX64RegisterDr1,
86      * WHvX64RegisterDr2,
87      * WHvX64RegisterDr3,
88      * WHvX64RegisterDr6,
89      * WHvX64RegisterDr7,
90      */
91 
92     /* X64 Floating Point and Vector Registers */
93     WHvX64RegisterXmm0,
94     WHvX64RegisterXmm1,
95     WHvX64RegisterXmm2,
96     WHvX64RegisterXmm3,
97     WHvX64RegisterXmm4,
98     WHvX64RegisterXmm5,
99     WHvX64RegisterXmm6,
100     WHvX64RegisterXmm7,
101     WHvX64RegisterXmm8,
102     WHvX64RegisterXmm9,
103     WHvX64RegisterXmm10,
104     WHvX64RegisterXmm11,
105     WHvX64RegisterXmm12,
106     WHvX64RegisterXmm13,
107     WHvX64RegisterXmm14,
108     WHvX64RegisterXmm15,
109     WHvX64RegisterFpMmx0,
110     WHvX64RegisterFpMmx1,
111     WHvX64RegisterFpMmx2,
112     WHvX64RegisterFpMmx3,
113     WHvX64RegisterFpMmx4,
114     WHvX64RegisterFpMmx5,
115     WHvX64RegisterFpMmx6,
116     WHvX64RegisterFpMmx7,
117     WHvX64RegisterFpControlStatus,
118     WHvX64RegisterXmmControlStatus,
119 
120     /* X64 MSRs */
121     WHvX64RegisterEfer,
122 #ifdef TARGET_X86_64
123     WHvX64RegisterKernelGsBase,
124 #endif
125     WHvX64RegisterApicBase,
126     /* WHvX64RegisterPat, */
127     WHvX64RegisterSysenterCs,
128     WHvX64RegisterSysenterEip,
129     WHvX64RegisterSysenterEsp,
130     WHvX64RegisterStar,
131 #ifdef TARGET_X86_64
132     WHvX64RegisterLstar,
133     WHvX64RegisterCstar,
134     WHvX64RegisterSfmask,
135 #endif
136 
137     /* Interrupt / Event Registers */
138     /*
139      * WHvRegisterPendingInterruption,
140      * WHvRegisterInterruptState,
141      * WHvRegisterPendingEvent0,
142      * WHvRegisterPendingEvent1
143      * WHvX64RegisterDeliverabilityNotifications,
144      */
145 };
146 
/*
 * Staging buffer for one bulk register transfer: values[i] holds the value
 * of the register named by whpx_register_names[i].
 */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
150 
151 /*
152  * The current implementation of instruction stepping sets the TF flag
153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
155  *
156  * This approach has a few limitations:
157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158  *        along with the other flags, possibly restoring it later. It would
159  *        result in another INT1 when the flags are restored, triggering
160  *        a stop in gdb that could be cleared by doing another step.
161  *
 *        Stepping over a POPF/LAHF instruction will let it overwrite the
 *        TF flag, ending the stepping mode.
164  *
165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166  *        or anything that could result in a page fault) will save the flags
167  *        to the stack, clear the TF flag, and let the guest execute the
168  *        handler. Normally, the guest will restore the original flags,
169  *        that will continue single-stepping.
170  *
171  *     3. Debuggers running on the guest may wish to set TF to do instruction
172  *        stepping. INT1 events generated by it would be intercepted by us,
173  *        as long as the gdb is connected to QEMU.
174  *
175  * In practice this means that:
176  *     1. Stepping through flags-modifying instructions may cause gdb to
177  *        continue or stop in unexpected places. This will be fully recoverable
178  *        and will not crash the target.
179  *
180  *     2. Stepping over an instruction that triggers an exception will step
181  *        over the exception handler, not into it.
182  *
183  *     3. Debugging the guest via gdb, while running debugger on the guest
184  *        at the same time may lead to unexpected effects. Removing all
185  *        breakpoints set via QEMU will prevent any further interference
186  *        with the guest-level debuggers.
187  *
188  * The limitations can be addressed as shown below:
189  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
190  *        stepping through them. The exact semantics of the instructions is
191  *        defined in the "Combined Volume Set of Intel 64 and IA-32
192  *        Architectures Software Developer's Manuals", however it involves a
193  *        fair amount of corner cases due to compatibility with real mode,
194  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
195  *
196  *     2. We could step into the guest's exception handlers using the following
197  *        sequence:
198  *          a. Temporarily enable catching of all exception types via
199  *             whpx_set_exception_exit_bitmap().
200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
201  *             the original handler.
202  *          c. Patch the original handler, injecting an INT3 at the beginning.
203  *          d. Update the exception exit bitmap to only catch the
204  *             WHvX64ExceptionTypeBreakpointTrap exception.
205  *          e. Let the affected CPU run in the exclusive mode.
206  *          f. Restore the original handler and the exception exit bitmap.
207  *        Note that handling all corner cases related to IDT/GDT is harder
208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
209  *        rough idea.
210  *
211  *     3. In order to properly support guest-level debugging in parallel with
212  *        the QEMU-level debugging, we would need to be able to pass some INT1
213  *        events to the guest. This could be done via the following methods:
214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215  *             it seems to only work for interrupts and not software
216  *             exceptions.
217  *          b. Locating and patching the original handler by parsing IDT/GDT.
218  *             This involves relatively complex logic outlined in the previous
219  *             paragraph.
220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
221  *             RFLAGS, and pushing the old values to stack). This is even more
222  *             complicated than the previous option, since it involves checking
223  *             CPL, gate attributes, and doing various adjustments depending
224  *             on the current CPU mode, whether the CPL is changing, etc.
225  */
typedef enum WhpxStepMode {
    /* No stepping mode selected (inferred from the name; confirm at users) */
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
231 
/* Per-vCPU WHPX state, obtained from a CPUState via get_whpx_vcpu(). */
struct whpx_vcpu {
    /* Instruction emulator handle used for MMIO and port I/O emulation */
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    /* Last known task priority, kept in CR8 encoding (see the TPR helpers) */
    uint64_t tpr;
    /* Last known APIC base, as synchronized with the APIC base MSR */
    uint64_t apic_base;
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
244 
/* File-local accelerator state; the users are mostly outside this excerpt. */
static bool whpx_allowed;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
/* XSAVE capability flags consulted by whpx_has_xsave() */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

/* Global accelerator instance; its partition handle is used for all calls */
struct whpx_state whpx_global;
/* Table of WinHv* entry points through which the platform API is called */
struct WHPDispatch whp_dispatch;
253 
254 static bool whpx_has_xsave(void)
255 {
256     return whpx_xsave_cap.XsaveSupport;
257 }
258 
259 /*
260  * VP support
261  */
262 
263 static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu)
264 {
265     return (struct whpx_vcpu *)cpu->hax_vcpu;
266 }
267 
268 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
269                                              int r86)
270 {
271     WHV_X64_SEGMENT_REGISTER hs;
272     unsigned flags = qs->flags;
273 
274     hs.Base = qs->base;
275     hs.Limit = qs->limit;
276     hs.Selector = qs->selector;
277 
278     if (v86) {
279         hs.Attributes = 0;
280         hs.SegmentType = 3;
281         hs.Present = 1;
282         hs.DescriptorPrivilegeLevel = 3;
283         hs.NonSystemSegment = 1;
284 
285     } else {
286         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
287 
288         if (r86) {
289             /* hs.Base &= 0xfffff; */
290         }
291     }
292 
293     return hs;
294 }
295 
296 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
297 {
298     SegmentCache qs;
299 
300     qs.base = hs->Base;
301     qs.limit = hs->Limit;
302     qs.selector = hs->Selector;
303 
304     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
305 
306     return qs;
307 }
308 
309 /* X64 Extended Control Registers */
310 static void whpx_set_xcrs(CPUState *cpu)
311 {
312     CPUX86State *env = cpu->env_ptr;
313     HRESULT hr;
314     struct whpx_state *whpx = &whpx_global;
315     WHV_REGISTER_VALUE xcr0;
316     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
317 
318     if (!whpx_has_xsave()) {
319         return;
320     }
321 
322     /* Only xcr0 is supported by the hypervisor currently */
323     xcr0.Reg64 = env->xcr0;
324     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
325         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
326     if (FAILED(hr)) {
327         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
328     }
329 }
330 
331 static int whpx_set_tsc(CPUState *cpu)
332 {
333     CPUX86State *env = cpu->env_ptr;
334     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
335     WHV_REGISTER_VALUE tsc_val;
336     HRESULT hr;
337     struct whpx_state *whpx = &whpx_global;
338 
339     /*
340      * Suspend the partition prior to setting the TSC to reduce the variance
341      * in TSC across vCPUs. When the first vCPU runs post suspend, the
342      * partition is automatically resumed.
343      */
344     if (whp_dispatch.WHvSuspendPartitionTime) {
345 
346         /*
347          * Unable to suspend partition while setting TSC is not a fatal
348          * error. It just increases the likelihood of TSC variance between
349          * vCPUs and some guest OS are able to handle that just fine.
350          */
351         hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
352         if (FAILED(hr)) {
353             warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
354         }
355     }
356 
357     tsc_val.Reg64 = env->tsc;
358     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
359         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
360     if (FAILED(hr)) {
361         error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
362         return -1;
363     }
364 
365     return 0;
366 }
367 
368 /*
369  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
370  * however, they use a slightly different encoding. Specifically:
371  *
372  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
373  *
374  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
375  * and IA-32 Architectures Software Developer's Manual.
376  *
377  * The functions below translate the value of CR8 to TPR and vice versa.
378  */
379 
/* Translate an APIC TPR value into CR8 encoding (TPR[7:4] -> CR8[3:0]). */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    const unsigned int priority_class_shift = 4;

    return tpr >> priority_class_shift;
}
384 
/* Translate a CR8 value into APIC TPR encoding (CR8[3:0] -> TPR[7:4]). */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    const unsigned int priority_class_shift = 4;

    return cr8 << priority_class_shift;
}
389 
390 static void whpx_set_registers(CPUState *cpu, int level)
391 {
392     struct whpx_state *whpx = &whpx_global;
393     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
394     CPUX86State *env = cpu->env_ptr;
395     X86CPU *x86_cpu = X86_CPU(cpu);
396     struct whpx_register_set vcxt;
397     HRESULT hr;
398     int idx;
399     int idx_next;
400     int i;
401     int v86, r86;
402 
403     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
404 
405     /*
406      * Following MSRs have side effects on the guest or are too heavy for
407      * runtime. Limit them to full state update.
408      */
409     if (level >= WHPX_SET_RESET_STATE) {
410         whpx_set_tsc(cpu);
411     }
412 
413     memset(&vcxt, 0, sizeof(struct whpx_register_set));
414 
415     v86 = (env->eflags & VM_MASK);
416     r86 = !(env->cr[0] & CR0_PE_MASK);
417 
418     vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
419     vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);
420 
421     idx = 0;
422 
423     /* Indexes for first 16 registers match between HV and QEMU definitions */
424     idx_next = 16;
425     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
426         vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
427     }
428     idx = idx_next;
429 
430     /* Same goes for RIP and RFLAGS */
431     assert(whpx_register_names[idx] == WHvX64RegisterRip);
432     vcxt.values[idx++].Reg64 = env->eip;
433 
434     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
435     vcxt.values[idx++].Reg64 = env->eflags;
436 
437     /* Translate 6+4 segment registers. HV and QEMU order matches  */
438     assert(idx == WHvX64RegisterEs);
439     for (i = 0; i < 6; i += 1, idx += 1) {
440         vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
441     }
442 
443     assert(idx == WHvX64RegisterLdtr);
444     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
445 
446     assert(idx == WHvX64RegisterTr);
447     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
448 
449     assert(idx == WHvX64RegisterIdtr);
450     vcxt.values[idx].Table.Base = env->idt.base;
451     vcxt.values[idx].Table.Limit = env->idt.limit;
452     idx += 1;
453 
454     assert(idx == WHvX64RegisterGdtr);
455     vcxt.values[idx].Table.Base = env->gdt.base;
456     vcxt.values[idx].Table.Limit = env->gdt.limit;
457     idx += 1;
458 
459     /* CR0, 2, 3, 4, 8 */
460     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
461     vcxt.values[idx++].Reg64 = env->cr[0];
462     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
463     vcxt.values[idx++].Reg64 = env->cr[2];
464     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
465     vcxt.values[idx++].Reg64 = env->cr[3];
466     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
467     vcxt.values[idx++].Reg64 = env->cr[4];
468     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
469     vcxt.values[idx++].Reg64 = vcpu->tpr;
470 
471     /* 8 Debug Registers - Skipped */
472 
473     /*
474      * Extended control registers needs to be handled separately depending
475      * on whether xsave is supported/enabled or not.
476      */
477     whpx_set_xcrs(cpu);
478 
479     /* 16 XMM registers */
480     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
481     idx_next = idx + 16;
482     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
483         vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
484         vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
485     }
486     idx = idx_next;
487 
488     /* 8 FP registers */
489     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
490     for (i = 0; i < 8; i += 1, idx += 1) {
491         vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
492         /* vcxt.values[idx].Fp.AsUINT128.High64 =
493                env->fpregs[i].mmx.MMX_Q(1);
494         */
495     }
496 
497     /* FP control status register */
498     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
499     vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
500     vcxt.values[idx].FpControlStatus.FpStatus =
501         (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
502     vcxt.values[idx].FpControlStatus.FpTag = 0;
503     for (i = 0; i < 8; ++i) {
504         vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
505     }
506     vcxt.values[idx].FpControlStatus.Reserved = 0;
507     vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
508     vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
509     idx += 1;
510 
511     /* XMM control status register */
512     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
513     vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
514     vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
515     vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
516     idx += 1;
517 
518     /* MSRs */
519     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
520     vcxt.values[idx++].Reg64 = env->efer;
521 #ifdef TARGET_X86_64
522     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
523     vcxt.values[idx++].Reg64 = env->kernelgsbase;
524 #endif
525 
526     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
527     vcxt.values[idx++].Reg64 = vcpu->apic_base;
528 
529     /* WHvX64RegisterPat - Skipped */
530 
531     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
532     vcxt.values[idx++].Reg64 = env->sysenter_cs;
533     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
534     vcxt.values[idx++].Reg64 = env->sysenter_eip;
535     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
536     vcxt.values[idx++].Reg64 = env->sysenter_esp;
537     assert(whpx_register_names[idx] == WHvX64RegisterStar);
538     vcxt.values[idx++].Reg64 = env->star;
539 #ifdef TARGET_X86_64
540     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
541     vcxt.values[idx++].Reg64 = env->lstar;
542     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
543     vcxt.values[idx++].Reg64 = env->cstar;
544     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
545     vcxt.values[idx++].Reg64 = env->fmask;
546 #endif
547 
548     /* Interrupt / Event Registers - Skipped */
549 
550     assert(idx == RTL_NUMBER_OF(whpx_register_names));
551 
552     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
553         whpx->partition, cpu->cpu_index,
554         whpx_register_names,
555         RTL_NUMBER_OF(whpx_register_names),
556         &vcxt.values[0]);
557 
558     if (FAILED(hr)) {
559         error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
560                      hr);
561     }
562 
563     return;
564 }
565 
566 static int whpx_get_tsc(CPUState *cpu)
567 {
568     CPUX86State *env = cpu->env_ptr;
569     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
570     WHV_REGISTER_VALUE tsc_val;
571     HRESULT hr;
572     struct whpx_state *whpx = &whpx_global;
573 
574     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
575         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
576     if (FAILED(hr)) {
577         error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
578         return -1;
579     }
580 
581     env->tsc = tsc_val.Reg64;
582     return 0;
583 }
584 
585 /* X64 Extended Control Registers */
586 static void whpx_get_xcrs(CPUState *cpu)
587 {
588     CPUX86State *env = cpu->env_ptr;
589     HRESULT hr;
590     struct whpx_state *whpx = &whpx_global;
591     WHV_REGISTER_VALUE xcr0;
592     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
593 
594     if (!whpx_has_xsave()) {
595         return;
596     }
597 
598     /* Only xcr0 is supported by the hypervisor currently */
599     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
600         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
601     if (FAILED(hr)) {
602         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
603         return;
604     }
605 
606     env->xcr0 = xcr0.Reg64;
607 }
608 
609 static void whpx_get_registers(CPUState *cpu)
610 {
611     struct whpx_state *whpx = &whpx_global;
612     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
613     CPUX86State *env = cpu->env_ptr;
614     X86CPU *x86_cpu = X86_CPU(cpu);
615     struct whpx_register_set vcxt;
616     uint64_t tpr, apic_base;
617     HRESULT hr;
618     int idx;
619     int idx_next;
620     int i;
621 
622     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
623 
624     if (!env->tsc_valid) {
625         whpx_get_tsc(cpu);
626         env->tsc_valid = !runstate_is_running();
627     }
628 
629     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
630         whpx->partition, cpu->cpu_index,
631         whpx_register_names,
632         RTL_NUMBER_OF(whpx_register_names),
633         &vcxt.values[0]);
634     if (FAILED(hr)) {
635         error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
636                      hr);
637     }
638 
639     if (whpx_apic_in_platform()) {
640         /*
641          * Fetch the TPR value from the emulated APIC. It may get overwritten
642          * below with the value from CR8 returned by
643          * WHvGetVirtualProcessorRegisters().
644          */
645         whpx_apic_get(x86_cpu->apic_state);
646         vcpu->tpr = whpx_apic_tpr_to_cr8(
647             cpu_get_apic_tpr(x86_cpu->apic_state));
648     }
649 
650     idx = 0;
651 
652     /* Indexes for first 16 registers match between HV and QEMU definitions */
653     idx_next = 16;
654     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
655         env->regs[idx] = vcxt.values[idx].Reg64;
656     }
657     idx = idx_next;
658 
659     /* Same goes for RIP and RFLAGS */
660     assert(whpx_register_names[idx] == WHvX64RegisterRip);
661     env->eip = vcxt.values[idx++].Reg64;
662     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
663     env->eflags = vcxt.values[idx++].Reg64;
664 
665     /* Translate 6+4 segment registers. HV and QEMU order matches  */
666     assert(idx == WHvX64RegisterEs);
667     for (i = 0; i < 6; i += 1, idx += 1) {
668         env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
669     }
670 
671     assert(idx == WHvX64RegisterLdtr);
672     env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
673     assert(idx == WHvX64RegisterTr);
674     env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
675     assert(idx == WHvX64RegisterIdtr);
676     env->idt.base = vcxt.values[idx].Table.Base;
677     env->idt.limit = vcxt.values[idx].Table.Limit;
678     idx += 1;
679     assert(idx == WHvX64RegisterGdtr);
680     env->gdt.base = vcxt.values[idx].Table.Base;
681     env->gdt.limit = vcxt.values[idx].Table.Limit;
682     idx += 1;
683 
684     /* CR0, 2, 3, 4, 8 */
685     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
686     env->cr[0] = vcxt.values[idx++].Reg64;
687     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
688     env->cr[2] = vcxt.values[idx++].Reg64;
689     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
690     env->cr[3] = vcxt.values[idx++].Reg64;
691     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
692     env->cr[4] = vcxt.values[idx++].Reg64;
693     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
694     tpr = vcxt.values[idx++].Reg64;
695     if (tpr != vcpu->tpr) {
696         vcpu->tpr = tpr;
697         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
698     }
699 
700     /* 8 Debug Registers - Skipped */
701 
702     /*
703      * Extended control registers needs to be handled separately depending
704      * on whether xsave is supported/enabled or not.
705      */
706     whpx_get_xcrs(cpu);
707 
708     /* 16 XMM registers */
709     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
710     idx_next = idx + 16;
711     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
712         env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
713         env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
714     }
715     idx = idx_next;
716 
717     /* 8 FP registers */
718     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
719     for (i = 0; i < 8; i += 1, idx += 1) {
720         env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
721         /* env->fpregs[i].mmx.MMX_Q(1) =
722                vcxt.values[idx].Fp.AsUINT128.High64;
723         */
724     }
725 
726     /* FP control status register */
727     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
728     env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
729     env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
730     env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
731     for (i = 0; i < 8; ++i) {
732         env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
733     }
734     env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
735     env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
736     idx += 1;
737 
738     /* XMM control status register */
739     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
740     env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
741     idx += 1;
742 
743     /* MSRs */
744     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
745     env->efer = vcxt.values[idx++].Reg64;
746 #ifdef TARGET_X86_64
747     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
748     env->kernelgsbase = vcxt.values[idx++].Reg64;
749 #endif
750 
751     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
752     apic_base = vcxt.values[idx++].Reg64;
753     if (apic_base != vcpu->apic_base) {
754         vcpu->apic_base = apic_base;
755         cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
756     }
757 
758     /* WHvX64RegisterPat - Skipped */
759 
760     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
761     env->sysenter_cs = vcxt.values[idx++].Reg64;
762     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
763     env->sysenter_eip = vcxt.values[idx++].Reg64;
764     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
765     env->sysenter_esp = vcxt.values[idx++].Reg64;
766     assert(whpx_register_names[idx] == WHvX64RegisterStar);
767     env->star = vcxt.values[idx++].Reg64;
768 #ifdef TARGET_X86_64
769     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
770     env->lstar = vcxt.values[idx++].Reg64;
771     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
772     env->cstar = vcxt.values[idx++].Reg64;
773     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
774     env->fmask = vcxt.values[idx++].Reg64;
775 #endif
776 
777     /* Interrupt / Event Registers - Skipped */
778 
779     assert(idx == RTL_NUMBER_OF(whpx_register_names));
780 
781     if (whpx_apic_in_platform()) {
782         whpx_apic_get(x86_cpu->apic_state);
783     }
784 
785     x86_update_hflags(env);
786 
787     return;
788 }
789 
790 static HRESULT CALLBACK whpx_emu_ioport_callback(
791     void *ctx,
792     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
793 {
794     MemTxAttrs attrs = { 0 };
795     address_space_rw(&address_space_io, IoAccess->Port, attrs,
796                      &IoAccess->Data, IoAccess->AccessSize,
797                      IoAccess->Direction);
798     return S_OK;
799 }
800 
801 static HRESULT CALLBACK whpx_emu_mmio_callback(
802     void *ctx,
803     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
804 {
805     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
806                            ma->Direction);
807     return S_OK;
808 }
809 
810 static HRESULT CALLBACK whpx_emu_getreg_callback(
811     void *ctx,
812     const WHV_REGISTER_NAME *RegisterNames,
813     UINT32 RegisterCount,
814     WHV_REGISTER_VALUE *RegisterValues)
815 {
816     HRESULT hr;
817     struct whpx_state *whpx = &whpx_global;
818     CPUState *cpu = (CPUState *)ctx;
819 
820     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
821         whpx->partition, cpu->cpu_index,
822         RegisterNames, RegisterCount,
823         RegisterValues);
824     if (FAILED(hr)) {
825         error_report("WHPX: Failed to get virtual processor registers,"
826                      " hr=%08lx", hr);
827     }
828 
829     return hr;
830 }
831 
832 static HRESULT CALLBACK whpx_emu_setreg_callback(
833     void *ctx,
834     const WHV_REGISTER_NAME *RegisterNames,
835     UINT32 RegisterCount,
836     const WHV_REGISTER_VALUE *RegisterValues)
837 {
838     HRESULT hr;
839     struct whpx_state *whpx = &whpx_global;
840     CPUState *cpu = (CPUState *)ctx;
841 
842     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
843         whpx->partition, cpu->cpu_index,
844         RegisterNames, RegisterCount,
845         RegisterValues);
846     if (FAILED(hr)) {
847         error_report("WHPX: Failed to set virtual processor registers,"
848                      " hr=%08lx", hr);
849     }
850 
851     /*
852      * The emulator just successfully wrote the register state. We clear the
853      * dirty state so we avoid the double write on resume of the VP.
854      */
855     cpu->vcpu_dirty = false;
856 
857     return hr;
858 }
859 
860 static HRESULT CALLBACK whpx_emu_translate_callback(
861     void *ctx,
862     WHV_GUEST_VIRTUAL_ADDRESS Gva,
863     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
864     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
865     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
866 {
867     HRESULT hr;
868     struct whpx_state *whpx = &whpx_global;
869     CPUState *cpu = (CPUState *)ctx;
870     WHV_TRANSLATE_GVA_RESULT res;
871 
872     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
873                                       Gva, TranslateFlags, &res, Gpa);
874     if (FAILED(hr)) {
875         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
876     } else {
877         *TranslationResult = res.ResultCode;
878     }
879 
880     return hr;
881 }
882 
/*
 * Callback table for the instruction emulator, wiring its port I/O, memory,
 * register and address-translation requests to the functions above.
 */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
891 
892 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
893 {
894     HRESULT hr;
895     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
896     WHV_EMULATOR_STATUS emu_status;
897 
898     hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
899         vcpu->emulator, cpu,
900         &vcpu->exit_ctx.VpContext, ctx,
901         &emu_status);
902     if (FAILED(hr)) {
903         error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
904         return -1;
905     }
906 
907     if (!emu_status.EmulationSuccessful) {
908         error_report("WHPX: Failed to emulate MMIO access with"
909                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
910         return -1;
911     }
912 
913     return 0;
914 }
915 
916 static int whpx_handle_portio(CPUState *cpu,
917                               WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
918 {
919     HRESULT hr;
920     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
921     WHV_EMULATOR_STATUS emu_status;
922 
923     hr = whp_dispatch.WHvEmulatorTryIoEmulation(
924         vcpu->emulator, cpu,
925         &vcpu->exit_ctx.VpContext, ctx,
926         &emu_status);
927     if (FAILED(hr)) {
928         error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
929         return -1;
930     }
931 
932     if (!emu_status.EmulationSuccessful) {
933         error_report("WHPX: Failed to emulate PortIO access with"
934                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
935         return -1;
936     }
937 
938     return 0;
939 }
940 
941 /*
942  * Controls whether we should intercept various exceptions on the guest,
943  * namely breakpoint/single-step events.
944  *
945  * The 'exceptions' argument accepts a bitmask, e.g:
946  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
947  */
948 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
949 {
950     struct whpx_state *whpx = &whpx_global;
951     WHV_PARTITION_PROPERTY prop = { 0, };
952     HRESULT hr;
953 
954     if (exceptions == whpx->exception_exit_bitmap) {
955         return S_OK;
956     }
957 
958     prop.ExceptionExitBitmap = exceptions;
959 
960     hr = whp_dispatch.WHvSetPartitionProperty(
961         whpx->partition,
962         WHvPartitionPropertyCodeExceptionExitBitmap,
963         &prop,
964         sizeof(WHV_PARTITION_PROPERTY));
965 
966     if (SUCCEEDED(hr)) {
967         whpx->exception_exit_bitmap = exceptions;
968     }
969 
970     return hr;
971 }
972 
973 
974 /*
975  * This function is called before/after stepping over a single instruction.
976  * It will update the CPU registers to arm/disarm the instruction stepping
977  * accordingly.
978  */
979 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
980     bool set,
981     uint64_t *exit_context_rflags)
982 {
983     WHV_REGISTER_NAME reg_name;
984     WHV_REGISTER_VALUE reg_value;
985     HRESULT hr;
986     struct whpx_state *whpx = &whpx_global;
987 
988     /*
989      * If we are trying to step over a single instruction, we need to set the
990      * TF bit in rflags. Otherwise, clear it.
991      */
992     reg_name = WHvX64RegisterRflags;
993     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
994         whpx->partition,
995         cpu->cpu_index,
996         &reg_name,
997         1,
998         &reg_value);
999 
1000     if (FAILED(hr)) {
1001         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
1002         return hr;
1003     }
1004 
1005     if (exit_context_rflags) {
1006         assert(*exit_context_rflags == reg_value.Reg64);
1007     }
1008 
1009     if (set) {
1010         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
1011         reg_value.Reg64 |= TF_MASK;
1012     } else {
1013         reg_value.Reg64 &= ~TF_MASK;
1014     }
1015 
1016     if (exit_context_rflags) {
1017         *exit_context_rflags = reg_value.Reg64;
1018     }
1019 
1020     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1021         whpx->partition,
1022         cpu->cpu_index,
1023         &reg_name,
1024         1,
1025         &reg_value);
1026 
1027     if (FAILED(hr)) {
1028         error_report("WHPX: Failed to set rflags,"
1029             " hr=%08lx",
1030             hr);
1031         return hr;
1032     }
1033 
1034     reg_name = WHvRegisterInterruptState;
1035     reg_value.Reg64 = 0;
1036 
1037     /* Suspend delivery of hardware interrupts during single-stepping. */
1038     reg_value.InterruptState.InterruptShadow = set != 0;
1039 
1040     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1041     whpx->partition,
1042         cpu->cpu_index,
1043         &reg_name,
1044         1,
1045         &reg_value);
1046 
1047     if (FAILED(hr)) {
1048         error_report("WHPX: Failed to set InterruptState,"
1049             " hr=%08lx",
1050             hr);
1051         return hr;
1052     }
1053 
1054     if (!set) {
1055         /*
1056          * We have just finished stepping over a single instruction,
1057          * and intercepted the INT1 generated by it.
1058          * We need to now hide the INT1 from the guest,
1059          * as it would not be expecting it.
1060          */
1061 
1062         reg_name = WHvX64RegisterPendingDebugException;
1063         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1064         whpx->partition,
1065             cpu->cpu_index,
1066             &reg_name,
1067             1,
1068             &reg_value);
1069 
1070         if (FAILED(hr)) {
1071             error_report("WHPX: Failed to get pending debug exceptions,"
1072                          "hr=%08lx", hr);
1073             return hr;
1074         }
1075 
1076         if (reg_value.PendingDebugException.SingleStep) {
1077             reg_value.PendingDebugException.SingleStep = 0;
1078 
1079             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1080                 whpx->partition,
1081                 cpu->cpu_index,
1082                 &reg_name,
1083                 1,
1084                 &reg_value);
1085 
1086             if (FAILED(hr)) {
1087                 error_report("WHPX: Failed to clear pending debug exceptions,"
1088                              "hr=%08lx", hr);
1089              return hr;
1090             }
1091         }
1092 
1093     }
1094 
1095     return S_OK;
1096 }
1097 
1098 /* Tries to find a breakpoint at the specified address. */
1099 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1100 {
1101     struct whpx_state *whpx = &whpx_global;
1102     int i;
1103 
1104     if (whpx->breakpoints.breakpoints) {
1105         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1106             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1107                 return &whpx->breakpoints.breakpoints->data[i];
1108             }
1109         }
1110     }
1111 
1112     return NULL;
1113 }
1114 
/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 *
 * This single byte is what whpx_apply_breakpoints() writes into guest
 * memory at each breakpoint address.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;
1122 
1123 /*
1124  * The WHPX QEMU backend implements breakpoints by writing the INT1
1125  * instruction into memory (ignoring the DRx registers). This raises a few
1126  * issues that need to be carefully handled:
1127  *
1128  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1129  *    at the same location, and later remove them in arbitrary order.
1130  *    This should not cause memory corruption, and should only remove the
1131  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1132  *
1133  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1134  *    physical location. Hence, physically adding/removing a breakpoint can
1135  *    theoretically fail at any time. We need to keep track of it.
1136  *
1137  * The function below rebuilds a list of low-level breakpoints (one per
1138  * address, tracking the original instruction and any errors) from the list of
1139  * high-level breakpoints (set via cpu_breakpoint_insert()).
1140  *
1141  * In order to optimize performance, this function stores the list of
1142  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1143  * low-level ones, so that it won't be re-invoked until these breakpoints
1144  * change.
1145  *
1146  * Note that this function decides which breakpoints should be inserted into,
1147  * memory, but doesn't actually do it. The memory accessing is done in
1148  * whpx_apply_breakpoints().
1149  */
1150 static void whpx_translate_cpu_breakpoints(
1151     struct whpx_breakpoints *breakpoints,
1152     CPUState *cpu,
1153     int cpu_breakpoint_count)
1154 {
1155     CPUBreakpoint *bp;
1156     int cpu_bp_index = 0;
1157 
1158     breakpoints->original_addresses =
1159         g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
1160 
1161     breakpoints->original_address_count = cpu_breakpoint_count;
1162 
1163     int max_breakpoints = cpu_breakpoint_count +
1164         (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
1165 
1166     struct whpx_breakpoint_collection *new_breakpoints =
1167         (struct whpx_breakpoint_collection *)g_malloc0(
1168         sizeof(struct whpx_breakpoint_collection) +
1169             max_breakpoints * sizeof(struct whpx_breakpoint));
1170 
1171     new_breakpoints->allocated = max_breakpoints;
1172     new_breakpoints->used = 0;
1173 
1174     /*
1175      * 1. Preserve all old breakpoints that could not be automatically
1176      * cleared when the CPU got stopped.
1177      */
1178     if (breakpoints->breakpoints) {
1179         int i;
1180         for (i = 0; i < breakpoints->breakpoints->used; i++) {
1181             if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
1182                 new_breakpoints->data[new_breakpoints->used++] =
1183                     breakpoints->breakpoints->data[i];
1184             }
1185         }
1186     }
1187 
1188     /* 2. Map all CPU breakpoints to WHPX breakpoints */
1189     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1190         int i;
1191         bool found = false;
1192 
1193         /* This will be used to detect changed CPU breakpoints later. */
1194         breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
1195 
1196         for (i = 0; i < new_breakpoints->used; i++) {
1197             /*
1198              * WARNING: This loop has O(N^2) complexity, where N is the
1199              * number of breakpoints. It should not be a bottleneck in
1200              * real-world scenarios, since it only needs to run once after
1201              * the breakpoints have been modified.
1202              * If this ever becomes a concern, it can be optimized by storing
1203              * high-level breakpoint objects in a tree or hash map.
1204              */
1205 
1206             if (new_breakpoints->data[i].address == bp->pc) {
1207                 /* There was already a breakpoint at this address. */
1208                 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
1209                     new_breakpoints->data[i].state = WHPX_BP_SET;
1210                 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
1211                     new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
1212                 }
1213 
1214                 found = true;
1215                 break;
1216             }
1217         }
1218 
1219         if (!found && new_breakpoints->used < new_breakpoints->allocated) {
1220             /* No WHPX breakpoint at this address. Create one. */
1221             new_breakpoints->data[new_breakpoints->used].address = bp->pc;
1222             new_breakpoints->data[new_breakpoints->used].state =
1223                 WHPX_BP_SET_PENDING;
1224             new_breakpoints->used++;
1225         }
1226     }
1227 
1228     /*
1229      * Free the previous breakpoint list. This can be optimized by keeping
1230      * it as shadow buffer for the next computation instead of freeing
1231      * it immediately.
1232      */
1233     g_free(breakpoints->breakpoints);
1234 
1235     breakpoints->breakpoints = new_breakpoints;
1236 }
1237 
1238 /*
1239  * Physically inserts/removes the breakpoints by reading and writing the
1240  * physical memory, keeping a track of the failed attempts.
1241  *
1242  * Passing resuming=true  will try to set all previously unset breakpoints.
1243  * Passing resuming=false will remove all inserted ones.
1244  */
1245 static void whpx_apply_breakpoints(
1246     struct whpx_breakpoint_collection *breakpoints,
1247     CPUState *cpu,
1248     bool resuming)
1249 {
1250     int i, rc;
1251     if (!breakpoints) {
1252         return;
1253     }
1254 
1255     for (i = 0; i < breakpoints->used; i++) {
1256         /* Decide what to do right now based on the last known state. */
1257         WhpxBreakpointState state = breakpoints->data[i].state;
1258         switch (state) {
1259         case WHPX_BP_CLEARED:
1260             if (resuming) {
1261                 state = WHPX_BP_SET_PENDING;
1262             }
1263             break;
1264         case WHPX_BP_SET_PENDING:
1265             if (!resuming) {
1266                 state = WHPX_BP_CLEARED;
1267             }
1268             break;
1269         case WHPX_BP_SET:
1270             if (!resuming) {
1271                 state = WHPX_BP_CLEAR_PENDING;
1272             }
1273             break;
1274         case WHPX_BP_CLEAR_PENDING:
1275             if (resuming) {
1276                 state = WHPX_BP_SET;
1277             }
1278             break;
1279         }
1280 
1281         if (state == WHPX_BP_SET_PENDING) {
1282             /* Remember the original instruction. */
1283             rc = cpu_memory_rw_debug(cpu,
1284                 breakpoints->data[i].address,
1285                 &breakpoints->data[i].original_instruction,
1286                 1,
1287                 false);
1288 
1289             if (!rc) {
1290                 /* Write the breakpoint instruction. */
1291                 rc = cpu_memory_rw_debug(cpu,
1292                     breakpoints->data[i].address,
1293                     (void *)&whpx_breakpoint_instruction,
1294                     1,
1295                     true);
1296             }
1297 
1298             if (!rc) {
1299                 state = WHPX_BP_SET;
1300             }
1301 
1302         }
1303 
1304         if (state == WHPX_BP_CLEAR_PENDING) {
1305             /* Restore the original instruction. */
1306             rc = cpu_memory_rw_debug(cpu,
1307                 breakpoints->data[i].address,
1308                 &breakpoints->data[i].original_instruction,
1309                 1,
1310                 true);
1311 
1312             if (!rc) {
1313                 state = WHPX_BP_CLEARED;
1314             }
1315         }
1316 
1317         breakpoints->data[i].state = state;
1318     }
1319 }
1320 
1321 /*
1322  * This function is called when the a VCPU is about to start and no other
1323  * VCPUs have been started so far. Since the VCPU start order could be
1324  * arbitrary, it doesn't have to be VCPU#0.
1325  *
1326  * It is used to commit the breakpoints into memory, and configure WHPX
1327  * to intercept debug exceptions.
1328  *
1329  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1330  * more VCPUs are already running, so this is the best place to do it.
1331  */
1332 static int whpx_first_vcpu_starting(CPUState *cpu)
1333 {
1334     struct whpx_state *whpx = &whpx_global;
1335     HRESULT hr;
1336 
1337     g_assert(qemu_mutex_iothread_locked());
1338 
1339     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1340             (whpx->breakpoints.breakpoints &&
1341              whpx->breakpoints.breakpoints->used)) {
1342         CPUBreakpoint *bp;
1343         int i = 0;
1344         bool update_pending = false;
1345 
1346         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1347             if (i >= whpx->breakpoints.original_address_count ||
1348                 bp->pc != whpx->breakpoints.original_addresses[i]) {
1349                 update_pending = true;
1350             }
1351 
1352             i++;
1353         }
1354 
1355         if (i != whpx->breakpoints.original_address_count) {
1356             update_pending = true;
1357         }
1358 
1359         if (update_pending) {
1360             /*
1361              * The CPU breakpoints have changed since the last call to
1362              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1363              * now be recomputed.
1364              */
1365             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1366         }
1367 
1368         /* Actually insert the breakpoints into the memory. */
1369         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1370     }
1371 
1372     uint64_t exception_mask;
1373     if (whpx->step_pending ||
1374         (whpx->breakpoints.breakpoints &&
1375          whpx->breakpoints.breakpoints->used)) {
1376         /*
1377          * We are either attempting to single-step one or more CPUs, or
1378          * have one or more breakpoints enabled. Both require intercepting
1379          * the WHvX64ExceptionTypeBreakpointTrap exception.
1380          */
1381 
1382         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1383     } else {
1384         /* Let the guest handle all exceptions. */
1385         exception_mask = 0;
1386     }
1387 
1388     hr = whpx_set_exception_exit_bitmap(exception_mask);
1389     if (!SUCCEEDED(hr)) {
1390         error_report("WHPX: Failed to update exception exit mask,"
1391                      "hr=%08lx.", hr);
1392         return 1;
1393     }
1394 
1395     return 0;
1396 }
1397 
1398 /*
1399  * This function is called when the last VCPU has finished running.
1400  * It is used to remove any previously set breakpoints from memory.
1401  */
1402 static int whpx_last_vcpu_stopping(CPUState *cpu)
1403 {
1404     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1405     return 0;
1406 }
1407 
1408 /* Returns the address of the next instruction that is about to be executed. */
1409 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1410 {
1411     if (cpu->vcpu_dirty) {
1412         /* The CPU registers have been modified by other parts of QEMU. */
1413         CPUArchState *env = (CPUArchState *)(cpu->env_ptr);
1414         return env->eip;
1415     } else if (exit_context_valid) {
1416         /*
1417          * The CPU registers have not been modified by neither other parts
1418          * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1419          * This is the most common case.
1420          */
1421         struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1422         return vcpu->exit_ctx.VpContext.Rip;
1423     } else {
1424         /*
1425          * The CPU registers have been modified by a call to
1426          * WHvSetVirtualProcessorRegisters() and must be re-queried from
1427          * the target.
1428          */
1429         WHV_REGISTER_VALUE reg_value;
1430         WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1431         HRESULT hr;
1432         struct whpx_state *whpx = &whpx_global;
1433 
1434         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1435             whpx->partition,
1436             cpu->cpu_index,
1437             &reg_name,
1438             1,
1439             &reg_value);
1440 
1441         if (FAILED(hr)) {
1442             error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1443             return 0;
1444         }
1445 
1446         return reg_value.Reg64;
1447     }
1448 }
1449 
1450 static int whpx_handle_halt(CPUState *cpu)
1451 {
1452     CPUX86State *env = cpu->env_ptr;
1453     int ret = 0;
1454 
1455     qemu_mutex_lock_iothread();
1456     if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1457           (env->eflags & IF_MASK)) &&
1458         !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1459         cpu->exception_index = EXCP_HLT;
1460         cpu->halted = true;
1461         ret = 1;
1462     }
1463     qemu_mutex_unlock_iothread();
1464 
1465     return ret;
1466 }
1467 
/*
 * Prepares the VCPU state that has to be in place before the virtual
 * processor is run.
 *
 * With the iothread lock held, this looks at the pending interrupt requests
 * of the CPU and at the APIC TPR, and queues the resulting register updates
 * (a pending interruption or pending event, CR8, and the deliverability
 * notification) in a local list of at most three registers. It may also set
 * cpu->exit_request. The queued registers, if any, are then written with a
 * single WHvSetVirtualProcessorRegisters() call; a failure of that call is
 * only logged.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    CPUX86State *env = cpu->env_ptr;
    X86CPU *x86_cpu = X86_CPU(cpu);
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    /*
     * At most three registers get queued below: one of
     * PendingInterruption/PendingEvent (the two branches are exclusive),
     * Cr8 and DeliverabilityNotifications.
     */
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    qemu_mutex_lock_iothread();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        /* An SMI request is only acknowledged here; nothing is injected. */
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        /*
         * APIC handled by the platform: the interrupt obtained from
         * cpu_get_pic_interrupt() is queued as an ExtInt pending event
         * instead of a pending interruption.
         */
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
     }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    qemu_mutex_unlock_iothread();
    /* The flag is consumed by this pass whether or not it was used above. */
    vcpu->ready_for_pic_interrupt = false;

    /* Commit everything that was queued above in a single call. */
    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }

    return;
}
1591 
1592 static void whpx_vcpu_post_run(CPUState *cpu)
1593 {
1594     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1595     CPUX86State *env = cpu->env_ptr;
1596     X86CPU *x86_cpu = X86_CPU(cpu);
1597 
1598     env->eflags = vcpu->exit_ctx.VpContext.Rflags;
1599 
1600     uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
1601     if (vcpu->tpr != tpr) {
1602         vcpu->tpr = tpr;
1603         qemu_mutex_lock_iothread();
1604         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
1605         qemu_mutex_unlock_iothread();
1606     }
1607 
1608     vcpu->interruption_pending =
1609         vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
1610 
1611     vcpu->interruptable =
1612         !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
1613 
1614     return;
1615 }
1616 
1617 static void whpx_vcpu_process_async_events(CPUState *cpu)
1618 {
1619     CPUX86State *env = cpu->env_ptr;
1620     X86CPU *x86_cpu = X86_CPU(cpu);
1621     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1622 
1623     if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1624         !(env->hflags & HF_SMM_MASK)) {
1625         whpx_cpu_synchronize_state(cpu);
1626         do_cpu_init(x86_cpu);
1627         vcpu->interruptable = true;
1628     }
1629 
1630     if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
1631         cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
1632         apic_poll_irq(x86_cpu->apic_state);
1633     }
1634 
1635     if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1636          (env->eflags & IF_MASK)) ||
1637         (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1638         cpu->halted = false;
1639     }
1640 
1641     if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
1642         whpx_cpu_synchronize_state(cpu);
1643         do_cpu_sipi(x86_cpu);
1644     }
1645 
1646     if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1647         cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
1648         whpx_cpu_synchronize_state(cpu);
1649         apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
1650                                       env->tpr_access_type);
1651     }
1652 
1653     return;
1654 }
1655 
1656 static int whpx_vcpu_run(CPUState *cpu)
1657 {
1658     HRESULT hr;
1659     struct whpx_state *whpx = &whpx_global;
1660     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1661     struct whpx_breakpoint *stepped_over_bp = NULL;
1662     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1663     int ret;
1664 
1665     g_assert(qemu_mutex_iothread_locked());
1666 
1667     if (whpx->running_cpus++ == 0) {
1668         /* Insert breakpoints into memory, update exception exit bitmap. */
1669         ret = whpx_first_vcpu_starting(cpu);
1670         if (ret != 0) {
1671             return ret;
1672         }
1673     }
1674 
1675     if (whpx->breakpoints.breakpoints &&
1676         whpx->breakpoints.breakpoints->used > 0)
1677     {
1678         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1679         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1680         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1681             stepped_over_bp = NULL;
1682         }
1683 
1684         if (stepped_over_bp) {
1685             /*
1686              * We are trying to run the instruction overwritten by an active
1687              * breakpoint. We will temporarily disable the breakpoint, suspend
1688              * other CPUs, and step over the instruction.
1689              */
1690             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1691         }
1692     }
1693 
1694     if (exclusive_step_mode == WHPX_STEP_NONE) {
1695         whpx_vcpu_process_async_events(cpu);
1696         if (cpu->halted && !whpx_apic_in_platform()) {
1697             cpu->exception_index = EXCP_HLT;
1698             qatomic_set(&cpu->exit_request, false);
1699             return 0;
1700         }
1701     }
1702 
1703     qemu_mutex_unlock_iothread();
1704 
1705     if (exclusive_step_mode != WHPX_STEP_NONE) {
1706         start_exclusive();
1707         g_assert(cpu == current_cpu);
1708         g_assert(!cpu->running);
1709         cpu->running = true;
1710 
1711         hr = whpx_set_exception_exit_bitmap(
1712             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1713         if (!SUCCEEDED(hr)) {
1714             error_report("WHPX: Failed to update exception exit mask, "
1715                          "hr=%08lx.", hr);
1716             return 1;
1717         }
1718 
1719         if (stepped_over_bp) {
1720             /* Temporarily disable the triggered breakpoint. */
1721             cpu_memory_rw_debug(cpu,
1722                 stepped_over_bp->address,
1723                 &stepped_over_bp->original_instruction,
1724                 1,
1725                 true);
1726         }
1727     } else {
1728         cpu_exec_start(cpu);
1729     }
1730 
1731     do {
1732         if (cpu->vcpu_dirty) {
1733             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1734             cpu->vcpu_dirty = false;
1735         }
1736 
1737         if (exclusive_step_mode == WHPX_STEP_NONE) {
1738             whpx_vcpu_pre_run(cpu);
1739 
1740             if (qatomic_read(&cpu->exit_request)) {
1741                 whpx_vcpu_kick(cpu);
1742             }
1743         }
1744 
1745         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1746             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1747         }
1748 
1749         hr = whp_dispatch.WHvRunVirtualProcessor(
1750             whpx->partition, cpu->cpu_index,
1751             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1752 
1753         if (FAILED(hr)) {
1754             error_report("WHPX: Failed to exec a virtual processor,"
1755                          " hr=%08lx", hr);
1756             ret = -1;
1757             break;
1758         }
1759 
1760         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1761             whpx_vcpu_configure_single_stepping(cpu,
1762                 false,
1763                 &vcpu->exit_ctx.VpContext.Rflags);
1764         }
1765 
1766         whpx_vcpu_post_run(cpu);
1767 
1768         switch (vcpu->exit_ctx.ExitReason) {
1769         case WHvRunVpExitReasonMemoryAccess:
1770             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1771             break;
1772 
1773         case WHvRunVpExitReasonX64IoPortAccess:
1774             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1775             break;
1776 
1777         case WHvRunVpExitReasonX64InterruptWindow:
1778             vcpu->ready_for_pic_interrupt = 1;
1779             vcpu->window_registered = 0;
1780             ret = 0;
1781             break;
1782 
1783         case WHvRunVpExitReasonX64ApicEoi:
1784             assert(whpx_apic_in_platform());
1785             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1786             break;
1787 
1788         case WHvRunVpExitReasonX64Halt:
1789             /*
1790              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1791              * longer used.
1792              */
1793             ret = whpx_handle_halt(cpu);
1794             break;
1795 
1796         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1797             WHV_INTERRUPT_CONTROL ipi = {0};
1798             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1799             uint32_t delivery_mode =
1800                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1801             int dest_shorthand =
1802                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1803             bool broadcast = false;
1804             bool include_self = false;
1805             uint32_t i;
1806 
1807             /* We only registered for INIT and SIPI exits. */
1808             if ((delivery_mode != APIC_DM_INIT) &&
1809                 (delivery_mode != APIC_DM_SIPI)) {
1810                 error_report(
1811                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1812                 break;
1813             }
1814 
1815             if (delivery_mode == APIC_DM_INIT) {
1816                 ipi.Type = WHvX64InterruptTypeInit;
1817             } else {
1818                 ipi.Type = WHvX64InterruptTypeSipi;
1819             }
1820 
1821             ipi.DestinationMode =
1822                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1823                     WHvX64InterruptDestinationModeLogical :
1824                     WHvX64InterruptDestinationModePhysical;
1825 
1826             ipi.TriggerMode =
1827                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1828                     WHvX64InterruptTriggerModeLevel :
1829                     WHvX64InterruptTriggerModeEdge;
1830 
1831             ipi.Vector = icr & APIC_VECTOR_MASK;
1832             switch (dest_shorthand) {
1833             /* no shorthand. Bits 56-63 contain the destination. */
1834             case 0:
1835                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1836                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1837                         &ipi, sizeof(ipi));
1838                 if (FAILED(hr)) {
1839                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1840                         hr);
1841                 }
1842 
1843                 break;
1844 
1845             /* self */
1846             case 1:
1847                 include_self = true;
1848                 break;
1849 
1850             /* broadcast, including self */
1851             case 2:
1852                 broadcast = true;
1853                 include_self = true;
1854                 break;
1855 
1856             /* broadcast, excluding self */
1857             case 3:
1858                 broadcast = true;
1859                 break;
1860             }
1861 
1862             if (!broadcast && !include_self) {
1863                 break;
1864             }
1865 
1866             for (i = 0; i <= max_vcpu_index; i++) {
1867                 if (i == cpu->cpu_index && !include_self) {
1868                     continue;
1869                 }
1870 
1871                 /*
1872                  * Assuming that APIC Ids are identity mapped since
1873                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1874                  * are not handled yet and the hypervisor doesn't allow the
1875                  * guest to modify the APIC ID.
1876                  */
1877                 ipi.Destination = i;
1878                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1879                         &ipi, sizeof(ipi));
1880                 if (FAILED(hr)) {
1881                     error_report(
1882                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1883                         i, hr);
1884                 }
1885             }
1886 
1887             break;
1888         }
1889 
1890         case WHvRunVpExitReasonCanceled:
1891             if (exclusive_step_mode != WHPX_STEP_NONE) {
1892                 /*
1893                  * We are trying to step over a single instruction, and
1894                  * likely got a request to stop from another thread.
1895                  * Delay it until we are done stepping
1896                  * over.
1897                  */
1898                 ret = 0;
1899             } else {
1900                 cpu->exception_index = EXCP_INTERRUPT;
1901                 ret = 1;
1902             }
1903             break;
1904         case WHvRunVpExitReasonX64MsrAccess: {
1905             WHV_REGISTER_VALUE reg_values[3] = {0};
1906             WHV_REGISTER_NAME reg_names[3];
1907             UINT32 reg_count;
1908 
1909             reg_names[0] = WHvX64RegisterRip;
1910             reg_names[1] = WHvX64RegisterRax;
1911             reg_names[2] = WHvX64RegisterRdx;
1912 
1913             reg_values[0].Reg64 =
1914                 vcpu->exit_ctx.VpContext.Rip +
1915                 vcpu->exit_ctx.VpContext.InstructionLength;
1916 
1917             /*
1918              * For all unsupported MSR access we:
1919              *     ignore writes
1920              *     return 0 on read.
1921              */
1922             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1923                         1 : 3;
1924 
1925             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1926                 whpx->partition,
1927                 cpu->cpu_index,
1928                 reg_names, reg_count,
1929                 reg_values);
1930 
1931             if (FAILED(hr)) {
1932                 error_report("WHPX: Failed to set MsrAccess state "
1933                              " registers, hr=%08lx", hr);
1934             }
1935             ret = 0;
1936             break;
1937         }
1938         case WHvRunVpExitReasonX64Cpuid: {
1939             WHV_REGISTER_VALUE reg_values[5];
1940             WHV_REGISTER_NAME reg_names[5];
1941             UINT32 reg_count = 5;
1942             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1943             X86CPU *x86_cpu = X86_CPU(cpu);
1944             CPUX86State *env = &x86_cpu->env;
1945 
1946             memset(reg_values, 0, sizeof(reg_values));
1947 
1948             rip = vcpu->exit_ctx.VpContext.Rip +
1949                   vcpu->exit_ctx.VpContext.InstructionLength;
1950             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1951 
1952             /*
1953              * Ideally, these should be supplied to the hypervisor during VCPU
1954              * initialization and it should be able to satisfy this request.
1955              * But, currently, WHPX doesn't support setting CPUID values in the
1956              * hypervisor once the partition has been setup, which is too late
1957              * since VCPUs are realized later. For now, use the values from
1958              * QEMU to satisfy these requests, until WHPX adds support for
1959              * being able to set these values in the hypervisor at runtime.
1960              */
1961             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1962                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1963             switch (cpuid_fn) {
1964             case 0x40000000:
1965                 /* Expose the vmware cpu frequency cpuid leaf */
1966                 rax = 0x40000010;
1967                 rbx = rcx = rdx = 0;
1968                 break;
1969 
1970             case 0x40000010:
1971                 rax = env->tsc_khz;
1972                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1973                 rcx = rdx = 0;
1974                 break;
1975 
1976             case 0x80000001:
1977                 /* Remove any support of OSVW */
1978                 rcx &= ~CPUID_EXT3_OSVW;
1979                 break;
1980             }
1981 
1982             reg_names[0] = WHvX64RegisterRip;
1983             reg_names[1] = WHvX64RegisterRax;
1984             reg_names[2] = WHvX64RegisterRcx;
1985             reg_names[3] = WHvX64RegisterRdx;
1986             reg_names[4] = WHvX64RegisterRbx;
1987 
1988             reg_values[0].Reg64 = rip;
1989             reg_values[1].Reg64 = rax;
1990             reg_values[2].Reg64 = rcx;
1991             reg_values[3].Reg64 = rdx;
1992             reg_values[4].Reg64 = rbx;
1993 
1994             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1995                 whpx->partition, cpu->cpu_index,
1996                 reg_names,
1997                 reg_count,
1998                 reg_values);
1999 
2000             if (FAILED(hr)) {
2001                 error_report("WHPX: Failed to set CpuidAccess state registers,"
2002                              " hr=%08lx", hr);
2003             }
2004             ret = 0;
2005             break;
2006         }
2007         case WHvRunVpExitReasonException:
2008             whpx_get_registers(cpu);
2009 
2010             if ((vcpu->exit_ctx.VpException.ExceptionType ==
2011                  WHvX64ExceptionTypeDebugTrapOrFault) &&
2012                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
2013                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
2014                  whpx_breakpoint_instruction)) {
2015                 /* Stopped at a software breakpoint. */
2016                 cpu->exception_index = EXCP_DEBUG;
2017             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
2018                         WHvX64ExceptionTypeDebugTrapOrFault) &&
2019                        !cpu->singlestep_enabled) {
2020                 /*
2021                  * Just finished stepping over a breakpoint, but the
2022                  * gdb does not expect us to do single-stepping.
2023                  * Don't do anything special.
2024                  */
2025                 cpu->exception_index = EXCP_INTERRUPT;
2026             } else {
2027                 /* Another exception or debug event. Report it to GDB. */
2028                 cpu->exception_index = EXCP_DEBUG;
2029             }
2030 
2031             ret = 1;
2032             break;
2033         case WHvRunVpExitReasonNone:
2034         case WHvRunVpExitReasonUnrecoverableException:
2035         case WHvRunVpExitReasonInvalidVpRegisterValue:
2036         case WHvRunVpExitReasonUnsupportedFeature:
2037         default:
2038             error_report("WHPX: Unexpected VP exit code %d",
2039                          vcpu->exit_ctx.ExitReason);
2040             whpx_get_registers(cpu);
2041             qemu_mutex_lock_iothread();
2042             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2043             qemu_mutex_unlock_iothread();
2044             break;
2045         }
2046 
2047     } while (!ret);
2048 
2049     if (stepped_over_bp) {
2050         /* Restore the breakpoint we stepped over */
2051         cpu_memory_rw_debug(cpu,
2052             stepped_over_bp->address,
2053             (void *)&whpx_breakpoint_instruction,
2054             1,
2055             true);
2056     }
2057 
2058     if (exclusive_step_mode != WHPX_STEP_NONE) {
2059         g_assert(cpu_in_exclusive_context(cpu));
2060         cpu->running = false;
2061         end_exclusive();
2062 
2063         exclusive_step_mode = WHPX_STEP_NONE;
2064     } else {
2065         cpu_exec_end(cpu);
2066     }
2067 
2068     qemu_mutex_lock_iothread();
2069     current_cpu = cpu;
2070 
2071     if (--whpx->running_cpus == 0) {
2072         whpx_last_vcpu_stopping(cpu);
2073     }
2074 
2075     qatomic_set(&cpu->exit_request, false);
2076 
2077     return ret < 0;
2078 }
2079 
2080 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2081 {
2082     if (!cpu->vcpu_dirty) {
2083         whpx_get_registers(cpu);
2084         cpu->vcpu_dirty = true;
2085     }
2086 }
2087 
/*
 * run_on_cpu() worker for whpx_cpu_synchronize_post_reset().
 *
 * Writes the registers with the WHPX_SET_RESET_STATE level and clears the
 * dirty flag afterwards.
 */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->vcpu_dirty = false;
}
2094 
/*
 * run_on_cpu() worker for whpx_cpu_synchronize_post_init().
 *
 * Writes the registers with the WHPX_SET_FULL_STATE level and clears the
 * dirty flag afterwards.
 */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->vcpu_dirty = false;
}
2101 
/*
 * run_on_cpu() worker for whpx_cpu_synchronize_pre_loadvm().
 *
 * Only flags the vCPU state as dirty; whpx_vcpu_run() writes the registers
 * back when it sees the flag set.
 */
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
2107 
2108 /*
2109  * CPU support.
2110  */
2111 
2112 void whpx_cpu_synchronize_state(CPUState *cpu)
2113 {
2114     if (!cpu->vcpu_dirty) {
2115         run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
2116     }
2117 }
2118 
/*
 * Dispatch do_whpx_cpu_synchronize_post_reset() to @cpu through
 * run_on_cpu().
 */
void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
2123 
/*
 * Dispatch do_whpx_cpu_synchronize_post_init() to @cpu through
 * run_on_cpu().
 */
void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
2128 
/*
 * Dispatch do_whpx_cpu_synchronize_pre_loadvm() to @cpu through
 * run_on_cpu().
 */
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
2133 
/*
 * Store @step_pending in the global WHPX state (whpx_global.step_pending).
 * NOTE(review): the consumer of this flag is outside this part of the
 * file - presumably the single-stepping logic of the vCPU run loop.
 */
void whpx_cpu_synchronize_pre_resume(bool step_pending)
{
    whpx_global.step_pending = step_pending;
}
2138 
2139 /*
2140  * Vcpu support.
2141  */
2142 
/*
 * Migration blocker created and registered by whpx_init_vcpu() the first
 * time it runs; NULL until then.
 */
static Error *whpx_migration_blocker;
2144 
2145 static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
2146 {
2147     CPUX86State *env = opaque;
2148 
2149     if (running) {
2150         env->tsc_valid = false;
2151     }
2152 }
2153 
2154 int whpx_init_vcpu(CPUState *cpu)
2155 {
2156     HRESULT hr;
2157     struct whpx_state *whpx = &whpx_global;
2158     struct whpx_vcpu *vcpu = NULL;
2159     Error *local_error = NULL;
2160     CPUX86State *env = cpu->env_ptr;
2161     X86CPU *x86_cpu = X86_CPU(cpu);
2162     UINT64 freq = 0;
2163     int ret;
2164 
2165     /* Add migration blockers for all unsupported features of the
2166      * Windows Hypervisor Platform
2167      */
2168     if (whpx_migration_blocker == NULL) {
2169         error_setg(&whpx_migration_blocker,
2170                "State blocked due to non-migratable CPUID feature support,"
2171                "dirty memory tracking support, and XSAVE/XRSTOR support");
2172 
2173         if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) {
2174             error_report_err(local_error);
2175             error_free(whpx_migration_blocker);
2176             ret = -EINVAL;
2177             goto error;
2178         }
2179     }
2180 
2181     vcpu = g_new0(struct whpx_vcpu, 1);
2182 
2183     if (!vcpu) {
2184         error_report("WHPX: Failed to allocte VCPU context.");
2185         ret = -ENOMEM;
2186         goto error;
2187     }
2188 
2189     hr = whp_dispatch.WHvEmulatorCreateEmulator(
2190         &whpx_emu_callbacks,
2191         &vcpu->emulator);
2192     if (FAILED(hr)) {
2193         error_report("WHPX: Failed to setup instruction completion support,"
2194                      " hr=%08lx", hr);
2195         ret = -EINVAL;
2196         goto error;
2197     }
2198 
2199     hr = whp_dispatch.WHvCreateVirtualProcessor(
2200         whpx->partition, cpu->cpu_index, 0);
2201     if (FAILED(hr)) {
2202         error_report("WHPX: Failed to create a virtual processor,"
2203                      " hr=%08lx", hr);
2204         whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2205         ret = -EINVAL;
2206         goto error;
2207     }
2208 
2209     /*
2210      * vcpu's TSC frequency is either specified by user, or use the value
2211      * provided by Hyper-V if the former is not present. In the latter case, we
2212      * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
2213      * frequency can be migrated later via this field.
2214      */
2215     if (!env->tsc_khz) {
2216         hr = whp_dispatch.WHvGetCapability(
2217             WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
2218                 NULL);
2219         if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2220             if (FAILED(hr)) {
2221                 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
2222             } else {
2223                 env->tsc_khz = freq / 1000; /* Hz to KHz */
2224             }
2225         }
2226     }
2227 
2228     env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
2229     hr = whp_dispatch.WHvGetCapability(
2230         WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
2231     if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2232         if (FAILED(hr)) {
2233             printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
2234         } else {
2235             env->apic_bus_freq = freq;
2236         }
2237     }
2238 
2239     /*
2240      * If the vmware cpuid frequency leaf option is set, and we have a valid
2241      * tsc value, trap the corresponding cpuid's.
2242      */
2243     if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
2244         UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
2245 
2246         hr = whp_dispatch.WHvSetPartitionProperty(
2247                 whpx->partition,
2248                 WHvPartitionPropertyCodeCpuidExitList,
2249                 cpuidExitList,
2250                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2251 
2252         if (FAILED(hr)) {
2253             error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2254                         hr);
2255             ret = -EINVAL;
2256             goto error;
2257         }
2258     }
2259 
2260     vcpu->interruptable = true;
2261     cpu->vcpu_dirty = true;
2262     cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu;
2263     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
2264     qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr);
2265 
2266     return 0;
2267 
2268 error:
2269     g_free(vcpu);
2270 
2271     return ret;
2272 }
2273 
2274 int whpx_vcpu_exec(CPUState *cpu)
2275 {
2276     int ret;
2277     int fatal;
2278 
2279     for (;;) {
2280         if (cpu->exception_index >= EXCP_INTERRUPT) {
2281             ret = cpu->exception_index;
2282             cpu->exception_index = -1;
2283             break;
2284         }
2285 
2286         fatal = whpx_vcpu_run(cpu);
2287 
2288         if (fatal) {
2289             error_report("WHPX: Failed to exec a virtual processor");
2290             abort();
2291         }
2292     }
2293 
2294     return ret;
2295 }
2296 
2297 void whpx_destroy_vcpu(CPUState *cpu)
2298 {
2299     struct whpx_state *whpx = &whpx_global;
2300     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
2301 
2302     whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
2303     whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2304     g_free(cpu->hax_vcpu);
2305     return;
2306 }
2307 
2308 void whpx_vcpu_kick(CPUState *cpu)
2309 {
2310     struct whpx_state *whpx = &whpx_global;
2311     whp_dispatch.WHvCancelRunVirtualProcessor(
2312         whpx->partition, cpu->cpu_index, 0);
2313 }
2314 
2315 /*
2316  * Memory support.
2317  */
2318 
2319 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
2320                                 void *host_va, int add, int rom,
2321                                 const char *name)
2322 {
2323     struct whpx_state *whpx = &whpx_global;
2324     HRESULT hr;
2325 
2326     /*
2327     if (add) {
2328         printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2329                (void*)start_pa, (void*)size, host_va,
2330                (rom ? "ROM" : "RAM"), name);
2331     } else {
2332         printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
2333                (void*)start_pa, (void*)size, host_va, name);
2334     }
2335     */
2336 
2337     if (add) {
2338         hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
2339                                          host_va,
2340                                          start_pa,
2341                                          size,
2342                                          (WHvMapGpaRangeFlagRead |
2343                                           WHvMapGpaRangeFlagExecute |
2344                                           (rom ? 0 : WHvMapGpaRangeFlagWrite)));
2345     } else {
2346         hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
2347                                            start_pa,
2348                                            size);
2349     }
2350 
2351     if (FAILED(hr)) {
2352         error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2353                      " Host:%p, hr=%08lx",
2354                      (add ? "MAP" : "UNMAP"), name,
2355                      (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
2356     }
2357 }
2358 
2359 static void whpx_process_section(MemoryRegionSection *section, int add)
2360 {
2361     MemoryRegion *mr = section->mr;
2362     hwaddr start_pa = section->offset_within_address_space;
2363     ram_addr_t size = int128_get64(section->size);
2364     unsigned int delta;
2365     uint64_t host_va;
2366 
2367     if (!memory_region_is_ram(mr)) {
2368         return;
2369     }
2370 
2371     delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
2372     delta &= ~qemu_real_host_page_mask();
2373     if (delta > size) {
2374         return;
2375     }
2376     start_pa += delta;
2377     size -= delta;
2378     size &= qemu_real_host_page_mask();
2379     if (!size || (start_pa & ~qemu_real_host_page_mask())) {
2380         return;
2381     }
2382 
2383     host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
2384             + section->offset_within_region + delta;
2385 
2386     whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
2387                         memory_region_is_rom(mr), mr->name);
2388 }
2389 
2390 static void whpx_region_add(MemoryListener *listener,
2391                            MemoryRegionSection *section)
2392 {
2393     memory_region_ref(section->mr);
2394     whpx_process_section(section, 1);
2395 }
2396 
2397 static void whpx_region_del(MemoryListener *listener,
2398                            MemoryRegionSection *section)
2399 {
2400     whpx_process_section(section, 0);
2401     memory_region_unref(section->mr);
2402 }
2403 
/* MemoryListener .begin callback: intentionally does nothing. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}
2407 
/* MemoryListener .commit callback: intentionally does nothing. */
static void whpx_transaction_commit(MemoryListener *listener)
{
}
2411 
2412 static void whpx_log_sync(MemoryListener *listener,
2413                          MemoryRegionSection *section)
2414 {
2415     MemoryRegion *mr = section->mr;
2416 
2417     if (!memory_region_is_ram(mr)) {
2418         return;
2419     }
2420 
2421     memory_region_set_dirty(mr, 0, int128_get64(section->size));
2422 }
2423 
2424 static MemoryListener whpx_memory_listener = {
2425     .name = "whpx",
2426     .begin = whpx_transaction_begin,
2427     .commit = whpx_transaction_commit,
2428     .region_add = whpx_region_add,
2429     .region_del = whpx_region_del,
2430     .log_sync = whpx_log_sync,
2431     .priority = 10,
2432 };
2433 
/* Register the WHPX memory listener on the system memory address space. */
static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}
2438 
2439 /*
2440  * Load the functions from the given library, using the given handle. If a
2441  * handle is provided, it is used, otherwise the library is opened. The
2442  * handle will be updated on return with the opened one.
2443  */
2444 static bool load_whp_dispatch_fns(HMODULE *handle,
2445     WHPFunctionList function_list)
2446 {
2447     HMODULE hLib = *handle;
2448 
2449     #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
2450     #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
2451     #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
2452         whp_dispatch.function_name = \
2453             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2454 
2455     #define WHP_LOAD_FIELD(return_type, function_name, signature) \
2456         whp_dispatch.function_name = \
2457             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2458         if (!whp_dispatch.function_name) { \
2459             error_report("Could not load function %s", #function_name); \
2460             goto error; \
2461         } \
2462 
2463     #define WHP_LOAD_LIB(lib_name, handle_lib) \
2464     if (!handle_lib) { \
2465         handle_lib = LoadLibrary(lib_name); \
2466         if (!handle_lib) { \
2467             error_report("Could not load library %s.", lib_name); \
2468             goto error; \
2469         } \
2470     } \
2471 
2472     switch (function_list) {
2473     case WINHV_PLATFORM_FNS_DEFAULT:
2474         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2475         LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
2476         break;
2477 
2478     case WINHV_EMULATION_FNS_DEFAULT:
2479         WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
2480         LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
2481         break;
2482 
2483     case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
2484         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2485         LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
2486         break;
2487     }
2488 
2489     *handle = hLib;
2490     return true;
2491 
2492 error:
2493     if (hLib) {
2494         FreeLibrary(hLib);
2495     }
2496 
2497     return false;
2498 }
2499 
2500 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2501                                    const char *name, void *opaque,
2502                                    Error **errp)
2503 {
2504     struct whpx_state *whpx = &whpx_global;
2505     OnOffSplit mode;
2506 
2507     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2508         return;
2509     }
2510 
2511     switch (mode) {
2512     case ON_OFF_SPLIT_ON:
2513         whpx->kernel_irqchip_allowed = true;
2514         whpx->kernel_irqchip_required = true;
2515         break;
2516 
2517     case ON_OFF_SPLIT_OFF:
2518         whpx->kernel_irqchip_allowed = false;
2519         whpx->kernel_irqchip_required = false;
2520         break;
2521 
2522     case ON_OFF_SPLIT_SPLIT:
2523         error_setg(errp, "WHPX: split irqchip currently not supported");
2524         error_append_hint(errp,
2525             "Try without kernel-irqchip or with kernel-irqchip=on|off");
2526         break;
2527 
2528     default:
2529         /*
2530          * The value was checked in visit_type_OnOffSplit() above. If
2531          * we get here, then something is wrong in QEMU.
2532          */
2533         abort();
2534     }
2535 }
2536 
2537 /*
2538  * Partition support
2539  */
2540 
2541 static int whpx_accel_init(MachineState *ms)
2542 {
2543     struct whpx_state *whpx;
2544     int ret;
2545     HRESULT hr;
2546     WHV_CAPABILITY whpx_cap;
2547     UINT32 whpx_cap_size;
2548     WHV_PARTITION_PROPERTY prop;
2549     UINT32 cpuidExitList[] = {1, 0x80000001};
2550     WHV_CAPABILITY_FEATURES features = {0};
2551 
2552     whpx = &whpx_global;
2553 
2554     if (!init_whp_dispatch()) {
2555         ret = -ENOSYS;
2556         goto error;
2557     }
2558 
2559     whpx->mem_quota = ms->ram_size;
2560 
2561     hr = whp_dispatch.WHvGetCapability(
2562         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2563         sizeof(whpx_cap), &whpx_cap_size);
2564     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2565         error_report("WHPX: No accelerator found, hr=%08lx", hr);
2566         ret = -ENOSPC;
2567         goto error;
2568     }
2569 
2570     hr = whp_dispatch.WHvGetCapability(
2571         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2572     if (FAILED(hr)) {
2573         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2574         ret = -EINVAL;
2575         goto error;
2576     }
2577 
2578     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2579     if (FAILED(hr)) {
2580         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2581         ret = -EINVAL;
2582         goto error;
2583     }
2584 
2585     /*
2586      * Query the XSAVE capability of the partition. Any error here is not
2587      * considered fatal.
2588      */
2589     hr = whp_dispatch.WHvGetPartitionProperty(
2590         whpx->partition,
2591         WHvPartitionPropertyCodeProcessorXsaveFeatures,
2592         &whpx_xsave_cap,
2593         sizeof(whpx_xsave_cap),
2594         &whpx_cap_size);
2595 
2596     /*
2597      * Windows version which don't support this property will return with the
2598      * specific error code.
2599      */
2600     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2601         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2602     }
2603 
2604     if (!whpx_has_xsave()) {
2605         printf("WHPX: Partition is not XSAVE capable\n");
2606     }
2607 
2608     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2609     prop.ProcessorCount = ms->smp.cpus;
2610     hr = whp_dispatch.WHvSetPartitionProperty(
2611         whpx->partition,
2612         WHvPartitionPropertyCodeProcessorCount,
2613         &prop,
2614         sizeof(WHV_PARTITION_PROPERTY));
2615 
2616     if (FAILED(hr)) {
2617         error_report("WHPX: Failed to set partition core count to %d,"
2618                      " hr=%08lx", ms->smp.cores, hr);
2619         ret = -EINVAL;
2620         goto error;
2621     }
2622 
2623     /*
2624      * Error out if WHP doesn't support apic emulation and user is requiring
2625      * it.
2626      */
2627     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2628             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2629         error_report("WHPX: kernel irqchip requested, but unavailable. "
2630             "Try without kernel-irqchip or with kernel-irqchip=off");
2631         ret = -EINVAL;
2632         goto error;
2633     }
2634 
2635     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2636         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2637         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2638             WHvX64LocalApicEmulationModeXApic;
2639         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2640         hr = whp_dispatch.WHvSetPartitionProperty(
2641             whpx->partition,
2642             WHvPartitionPropertyCodeLocalApicEmulationMode,
2643             &mode,
2644             sizeof(mode));
2645         if (FAILED(hr)) {
2646             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2647             if (whpx->kernel_irqchip_required) {
2648                 error_report("WHPX: kernel irqchip requested, but unavailable");
2649                 ret = -EINVAL;
2650                 goto error;
2651             }
2652         } else {
2653             whpx->apic_in_platform = true;
2654         }
2655     }
2656 
2657     /* Register for MSR and CPUID exits */
2658     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2659     prop.ExtendedVmExits.X64MsrExit = 1;
2660     prop.ExtendedVmExits.X64CpuidExit = 1;
2661     prop.ExtendedVmExits.ExceptionExit = 1;
2662     if (whpx_apic_in_platform()) {
2663         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2664     }
2665 
2666     hr = whp_dispatch.WHvSetPartitionProperty(
2667             whpx->partition,
2668             WHvPartitionPropertyCodeExtendedVmExits,
2669             &prop,
2670             sizeof(WHV_PARTITION_PROPERTY));
2671     if (FAILED(hr)) {
2672         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2673         ret = -EINVAL;
2674         goto error;
2675     }
2676 
2677     hr = whp_dispatch.WHvSetPartitionProperty(
2678         whpx->partition,
2679         WHvPartitionPropertyCodeCpuidExitList,
2680         cpuidExitList,
2681         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2682 
2683     if (FAILED(hr)) {
2684         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2685                      hr);
2686         ret = -EINVAL;
2687         goto error;
2688     }
2689 
2690     /*
2691      * We do not want to intercept any exceptions from the guest,
2692      * until we actually start debugging with gdb.
2693      */
2694     whpx->exception_exit_bitmap = -1;
2695     hr = whpx_set_exception_exit_bitmap(0);
2696 
2697     if (FAILED(hr)) {
2698         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2699         ret = -EINVAL;
2700         goto error;
2701     }
2702 
2703     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2704     if (FAILED(hr)) {
2705         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2706         ret = -EINVAL;
2707         goto error;
2708     }
2709 
2710     whpx_memory_init();
2711 
2712     printf("Windows Hypervisor Platform accelerator is operational\n");
2713     return 0;
2714 
2715 error:
2716 
2717     if (NULL != whpx->partition) {
2718         whp_dispatch.WHvDeletePartition(whpx->partition);
2719         whpx->partition = NULL;
2720     }
2721 
2722     return ret;
2723 }
2724 
/*
 * Report the current value of the whpx_allowed flag.
 *
 * The same flag is handed to the accelerator class through ac->allowed in
 * whpx_accel_class_init().
 */
int whpx_enabled(void)
{
    return whpx_allowed;
}
2729 
2730 bool whpx_apic_in_platform(void) {
2731     return whpx_global.apic_in_platform;
2732 }
2733 
2734 static void whpx_accel_class_init(ObjectClass *oc, void *data)
2735 {
2736     AccelClass *ac = ACCEL_CLASS(oc);
2737     ac->name = "WHPX";
2738     ac->init_machine = whpx_accel_init;
2739     ac->allowed = &whpx_allowed;
2740 
2741     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
2742         NULL, whpx_set_kernel_irqchip,
2743         NULL, NULL);
2744     object_class_property_set_description(oc, "kernel-irqchip",
2745         "Configure WHPX in-kernel irqchip");
2746 }
2747 
2748 static void whpx_accel_instance_init(Object *obj)
2749 {
2750     struct whpx_state *whpx = &whpx_global;
2751 
2752     memset(whpx, 0, sizeof(struct whpx_state));
2753     /* Turn on kernel-irqchip, by default */
2754     whpx->kernel_irqchip_allowed = true;
2755 }
2756 
2757 static const TypeInfo whpx_accel_type = {
2758     .name = ACCEL_CLASS_NAME("whpx"),
2759     .parent = TYPE_ACCEL,
2760     .instance_init = whpx_accel_instance_init,
2761     .class_init = whpx_accel_class_init,
2762 };
2763 
/*
 * Register the WHPX accelerator type description (whpx_accel_type).
 * This function is the one passed to type_init() at the end of the file.
 */
static void whpx_type_init(void)
{
    type_register_static(&whpx_accel_type);
}
2768 
2769 bool init_whp_dispatch(void)
2770 {
2771     if (whp_dispatch_initialized) {
2772         return true;
2773     }
2774 
2775     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2776         goto error;
2777     }
2778 
2779     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2780         goto error;
2781     }
2782 
2783     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2784         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2785     whp_dispatch_initialized = true;
2786 
2787     return true;
2788 error:
2789     if (hWinHvPlatform) {
2790         FreeLibrary(hWinHvPlatform);
2791     }
2792 
2793     if (hWinHvEmulation) {
2794         FreeLibrary(hWinHvEmulation);
2795     }
2796 
2797     return false;
2798 }
2799 
/* Hook whpx_type_init() into type initialization via the type_init() macro. */
type_init(whpx_type_init);
2801