xref: /openbmc/qemu/target/i386/whpx/whpx-all.c (revision 350785d41d8bb0b799dd16ea04a7232dc8d6093a)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "system/address-spaces.h"
14 #include "system/ioport.h"
15 #include "gdbstub/helpers.h"
16 #include "qemu/accel.h"
17 #include "system/whpx.h"
18 #include "system/cpus.h"
19 #include "system/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/intc/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include "host-cpu.h"
30 #include "accel/accel-cpu-target.h"
31 #include <winerror.h>
32 
33 #include "whpx-internal.h"
34 #include "whpx-accel-ops.h"
35 
36 #include <winhvplatform.h>
37 #include <winhvemulation.h>
38 
39 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
40 
/*
 * Registers exchanged with the hypervisor in whpx_set_registers() /
 * whpx_get_registers().  The ordering of this table is load-bearing:
 * both functions walk it with a running index and assert that each slot
 * matches the expected WHV_REGISTER_NAME, so any addition, removal or
 * reordering here must be mirrored there.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};
148 
/* Value buffer sized to whpx_register_names[]: one slot per register name. */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
152 
153 /*
154  * The current implementation of instruction stepping sets the TF flag
155  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
156  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
157  *
158  * This approach has a few limitations:
159  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
160  *        along with the other flags, possibly restoring it later. It would
161  *        result in another INT1 when the flags are restored, triggering
162  *        a stop in gdb that could be cleared by doing another step.
163  *
164  *        Stepping over a POPF/LAHF instruction will let it overwrite the
165  *        TF flags, ending the stepping mode.
166  *
167  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
168  *        or anything that could result in a page fault) will save the flags
169  *        to the stack, clear the TF flag, and let the guest execute the
170  *        handler. Normally, the guest will restore the original flags,
171  *        that will continue single-stepping.
172  *
173  *     3. Debuggers running on the guest may wish to set TF to do instruction
174  *        stepping. INT1 events generated by it would be intercepted by us,
175  *        as long as the gdb is connected to QEMU.
176  *
177  * In practice this means that:
178  *     1. Stepping through flags-modifying instructions may cause gdb to
179  *        continue or stop in unexpected places. This will be fully recoverable
180  *        and will not crash the target.
181  *
182  *     2. Stepping over an instruction that triggers an exception will step
183  *        over the exception handler, not into it.
184  *
185  *     3. Debugging the guest via gdb, while running debugger on the guest
186  *        at the same time may lead to unexpected effects. Removing all
187  *        breakpoints set via QEMU will prevent any further interference
188  *        with the guest-level debuggers.
189  *
190  * The limitations can be addressed as shown below:
191  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
192  *        stepping through them. The exact semantics of the instructions is
193  *        defined in the "Combined Volume Set of Intel 64 and IA-32
194  *        Architectures Software Developer's Manuals", however it involves a
195  *        fair amount of corner cases due to compatibility with real mode,
196  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
197  *
198  *     2. We could step into the guest's exception handlers using the following
199  *        sequence:
200  *          a. Temporarily enable catching of all exception types via
201  *             whpx_set_exception_exit_bitmap().
202  *          b. Once an exception is intercepted, read the IDT/GDT and locate
203  *             the original handler.
204  *          c. Patch the original handler, injecting an INT3 at the beginning.
205  *          d. Update the exception exit bitmap to only catch the
206  *             WHvX64ExceptionTypeBreakpointTrap exception.
207  *          e. Let the affected CPU run in the exclusive mode.
208  *          f. Restore the original handler and the exception exit bitmap.
209  *        Note that handling all corner cases related to IDT/GDT is harder
210  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
211  *        rough idea.
212  *
213  *     3. In order to properly support guest-level debugging in parallel with
214  *        the QEMU-level debugging, we would need to be able to pass some INT1
215  *        events to the guest. This could be done via the following methods:
216  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
217  *             it seems to only work for interrupts and not software
218  *             exceptions.
219  *          b. Locating and patching the original handler by parsing IDT/GDT.
220  *             This involves relatively complex logic outlined in the previous
221  *             paragraph.
222  *          c. Emulating the exception invocation (i.e. manually updating RIP,
223  *             RFLAGS, and pushing the old values to stack). This is even more
224  *             complicated than the previous option, since it involves checking
225  *             CPL, gate attributes, and doing various adjustments depending
226  *             on the current CPU mode, whether the CPL is changing, etc.
227  */
/* Single-step mode selector for the TF-flag based stepping described above. */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
233 
/* Per-vCPU WHPX accelerator state, reached through CPUState::accel. */
struct AccelCPUState {
    WHV_EMULATOR_HANDLE emulator;   /* instruction emulator instance */
    bool window_registered;
    /* NOTE(review): presumably "guest can accept an interrupt now";
     * confirm against the setters elsewhere in this file. */
    bool interruptable;
    bool ready_for_pic_interrupt;
    uint64_t tpr;          /* cached TPR in CR8 encoding (whpx_apic_tpr_to_cr8) */
    uint64_t apic_base;    /* cached APIC base MSR value */
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
246 
/* Whether WHPX is in use; set during accelerator init (not visible here). */
bool whpx_allowed;
/* One-time-load guard for resolving the WinHv DLL entry points. */
static bool whp_dispatch_initialized;
/* Module handles for the WinHv platform / emulation DLLs. */
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
/* XSAVE capabilities reported by the hypervisor; see whpx_has_xsave(). */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

/* Global partition state and dynamically resolved WinHv entry points. */
struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;
255 
256 static bool whpx_has_xsave(void)
257 {
258     return whpx_xsave_cap.XsaveSupport;
259 }
260 
261 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
262                                              int r86)
263 {
264     WHV_X64_SEGMENT_REGISTER hs;
265     unsigned flags = qs->flags;
266 
267     hs.Base = qs->base;
268     hs.Limit = qs->limit;
269     hs.Selector = qs->selector;
270 
271     if (v86) {
272         hs.Attributes = 0;
273         hs.SegmentType = 3;
274         hs.Present = 1;
275         hs.DescriptorPrivilegeLevel = 3;
276         hs.NonSystemSegment = 1;
277 
278     } else {
279         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
280 
281         if (r86) {
282             /* hs.Base &= 0xfffff; */
283         }
284     }
285 
286     return hs;
287 }
288 
289 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
290 {
291     SegmentCache qs;
292 
293     qs.base = hs->Base;
294     qs.limit = hs->Limit;
295     qs.selector = hs->Selector;
296 
297     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
298 
299     return qs;
300 }
301 
302 /* X64 Extended Control Registers */
303 static void whpx_set_xcrs(CPUState *cpu)
304 {
305     HRESULT hr;
306     struct whpx_state *whpx = &whpx_global;
307     WHV_REGISTER_VALUE xcr0;
308     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
309 
310     if (!whpx_has_xsave()) {
311         return;
312     }
313 
314     /* Only xcr0 is supported by the hypervisor currently */
315     xcr0.Reg64 = cpu_env(cpu)->xcr0;
316     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
317         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
318     if (FAILED(hr)) {
319         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
320     }
321 }
322 
323 static int whpx_set_tsc(CPUState *cpu)
324 {
325     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
326     WHV_REGISTER_VALUE tsc_val;
327     HRESULT hr;
328     struct whpx_state *whpx = &whpx_global;
329 
330     /*
331      * Suspend the partition prior to setting the TSC to reduce the variance
332      * in TSC across vCPUs. When the first vCPU runs post suspend, the
333      * partition is automatically resumed.
334      */
335     if (whp_dispatch.WHvSuspendPartitionTime) {
336 
337         /*
338          * Unable to suspend partition while setting TSC is not a fatal
339          * error. It just increases the likelihood of TSC variance between
340          * vCPUs and some guest OS are able to handle that just fine.
341          */
342         hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
343         if (FAILED(hr)) {
344             warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
345         }
346     }
347 
348     tsc_val.Reg64 = cpu_env(cpu)->tsc;
349     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
350         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
351     if (FAILED(hr)) {
352         error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
353         return -1;
354     }
355 
356     return 0;
357 }
358 
359 /*
360  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
361  * however, they use a slightly different encoding. Specifically:
362  *
363  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
364  *
365  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
366  * and IA-32 Architectures Software Developer's Manual.
367  *
368  * The functions below translate the value of CR8 to TPR and vice versa.
369  */
370 
/* CR8[3:0] = APIC.TPR[7:4]; the low TPR nibble is discarded. */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr / 16;
}
375 
/* APIC.TPR[7:4] = CR8[3:0]; inverse of whpx_apic_tpr_to_cr8(). */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 * 16;
}
380 
/*
 * Push QEMU's cached x86 state (GPRs, RIP/RFLAGS, segments, descriptor
 * tables, control registers, FPU/XMM state and MSRs) into the hypervisor
 * for this vCPU in a single WHvSetVirtualProcessorRegisters() call.
 *
 * The marshalling order must match whpx_register_names[]; the asserts
 * below keep the running index 'idx' in lockstep with that table.
 * XCR0 is written through a separate hypervisor call (whpx_set_xcrs).
 *
 * 'level' selects the state-sync depth: MSRs with guest-visible side
 * effects (currently the TSC) are only written when
 * level >= WHPX_SET_RESET_STATE.
 */
static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    /* Must run on the vCPU's own thread unless the vCPU is stopped. */
    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    /* v86/real mode change how segment attributes are encoded below. */
    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches  */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    /* CR8 carries the TPR in CR8 encoding (see whpx_apic_tpr_to_cr8). */
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers; only the 64-bit MMX view is transferred. */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    /* Fold the FP stack top (fpstt) back into status-word bits 13:11. */
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    /* QEMU's fptags are inverted relative to the hardware tag bits. */
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* We must have filled exactly one value per register name. */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}
554 
555 static int whpx_get_tsc(CPUState *cpu)
556 {
557     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
558     WHV_REGISTER_VALUE tsc_val;
559     HRESULT hr;
560     struct whpx_state *whpx = &whpx_global;
561 
562     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
563         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
564     if (FAILED(hr)) {
565         error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
566         return -1;
567     }
568 
569     cpu_env(cpu)->tsc = tsc_val.Reg64;
570     return 0;
571 }
572 
573 /* X64 Extended Control Registers */
574 static void whpx_get_xcrs(CPUState *cpu)
575 {
576     HRESULT hr;
577     struct whpx_state *whpx = &whpx_global;
578     WHV_REGISTER_VALUE xcr0;
579     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
580 
581     if (!whpx_has_xsave()) {
582         return;
583     }
584 
585     /* Only xcr0 is supported by the hypervisor currently */
586     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
587         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
588     if (FAILED(hr)) {
589         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
590         return;
591     }
592 
593     cpu_env(cpu)->xcr0 = xcr0.Reg64;
594 }
595 
/*
 * Pull the virtual processor's x86 state from the hypervisor into QEMU's
 * CPUX86State, mirroring whpx_set_registers() in reverse.  The entire
 * register set is fetched with one WHvGetVirtualProcessorRegisters()
 * call; the asserts keep the running index 'idx' in lockstep with
 * whpx_register_names[].
 *
 * NOTE(review): if the bulk register fetch fails, the error is reported
 * but the function continues unpacking 'vcxt' (then uninitialized) —
 * matches long-standing behavior; confirm before changing.
 */
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    /* Must run on the vCPU's own thread unless the vCPU is stopped. */
    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /* Refresh the cached TSC at most once per stop while running. */
    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches  */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    /* Propagate a TPR change from CR8 back into the emulated APIC. */
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers; only the 64-bit MMX view is transferred. */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    /* Split the FP stack top (bits 13:11) out of the status word. */
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    /* QEMU's fptags are inverted relative to the hardware tag bits. */
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    /* Propagate an APIC base change back into the emulated APIC. */
    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* We must have consumed exactly one value per register name. */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    /* Recompute hidden flags derived from the state loaded above. */
    x86_update_hflags(env);
}
774 
/*
 * Emulator callback: forward an intercepted port I/O access to QEMU's
 * I/O address space. 'Data' is read or written in place according to
 * 'Direction'. Always reports success to the emulator.
 */
static HRESULT CALLBACK whpx_emu_ioport_callback(
    void *ctx,
    WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
    MemTxAttrs attrs = { 0 };
    address_space_rw(&address_space_io, IoAccess->Port, attrs,
                     &IoAccess->Data, IoAccess->AccessSize,
                     IoAccess->Direction);
    return S_OK;
}
785 
/*
 * Emulator callback: satisfy an intercepted MMIO access against QEMU's
 * guest physical memory/device model. Always reports success.
 */
static HRESULT CALLBACK whpx_emu_mmio_callback(
    void *ctx,
    WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
    cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
                           ma->Direction);
    return S_OK;
}
794 
795 static HRESULT CALLBACK whpx_emu_getreg_callback(
796     void *ctx,
797     const WHV_REGISTER_NAME *RegisterNames,
798     UINT32 RegisterCount,
799     WHV_REGISTER_VALUE *RegisterValues)
800 {
801     HRESULT hr;
802     struct whpx_state *whpx = &whpx_global;
803     CPUState *cpu = (CPUState *)ctx;
804 
805     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
806         whpx->partition, cpu->cpu_index,
807         RegisterNames, RegisterCount,
808         RegisterValues);
809     if (FAILED(hr)) {
810         error_report("WHPX: Failed to get virtual processor registers,"
811                      " hr=%08lx", hr);
812     }
813 
814     return hr;
815 }
816 
817 static HRESULT CALLBACK whpx_emu_setreg_callback(
818     void *ctx,
819     const WHV_REGISTER_NAME *RegisterNames,
820     UINT32 RegisterCount,
821     const WHV_REGISTER_VALUE *RegisterValues)
822 {
823     HRESULT hr;
824     struct whpx_state *whpx = &whpx_global;
825     CPUState *cpu = (CPUState *)ctx;
826 
827     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
828         whpx->partition, cpu->cpu_index,
829         RegisterNames, RegisterCount,
830         RegisterValues);
831     if (FAILED(hr)) {
832         error_report("WHPX: Failed to set virtual processor registers,"
833                      " hr=%08lx", hr);
834     }
835 
836     /*
837      * The emulator just successfully wrote the register state. We clear the
838      * dirty state so we avoid the double write on resume of the VP.
839      */
840     cpu->vcpu_dirty = false;
841 
842     return hr;
843 }
844 
845 static HRESULT CALLBACK whpx_emu_translate_callback(
846     void *ctx,
847     WHV_GUEST_VIRTUAL_ADDRESS Gva,
848     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
849     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
850     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
851 {
852     HRESULT hr;
853     struct whpx_state *whpx = &whpx_global;
854     CPUState *cpu = (CPUState *)ctx;
855     WHV_TRANSLATE_GVA_RESULT res;
856 
857     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
858                                       Gva, TranslateFlags, &res, Gpa);
859     if (FAILED(hr)) {
860         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
861     } else {
862         *TranslationResult = res.ResultCode;
863     }
864 
865     return hr;
866 }
867 
/*
 * Callback table handed to the WHV instruction emulator (presumably via
 * WHvEmulatorCreateEmulator — creation site not visible in this chunk),
 * routing its port-I/O, MMIO, register and GVA-translation needs to QEMU.
 */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
876 
877 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
878 {
879     HRESULT hr;
880     AccelCPUState *vcpu = cpu->accel;
881     WHV_EMULATOR_STATUS emu_status;
882 
883     hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
884         vcpu->emulator, cpu,
885         &vcpu->exit_ctx.VpContext, ctx,
886         &emu_status);
887     if (FAILED(hr)) {
888         error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
889         return -1;
890     }
891 
892     if (!emu_status.EmulationSuccessful) {
893         error_report("WHPX: Failed to emulate MMIO access with"
894                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
895         return -1;
896     }
897 
898     return 0;
899 }
900 
901 static int whpx_handle_portio(CPUState *cpu,
902                               WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
903 {
904     HRESULT hr;
905     AccelCPUState *vcpu = cpu->accel;
906     WHV_EMULATOR_STATUS emu_status;
907 
908     hr = whp_dispatch.WHvEmulatorTryIoEmulation(
909         vcpu->emulator, cpu,
910         &vcpu->exit_ctx.VpContext, ctx,
911         &emu_status);
912     if (FAILED(hr)) {
913         error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
914         return -1;
915     }
916 
917     if (!emu_status.EmulationSuccessful) {
918         error_report("WHPX: Failed to emulate PortIO access with"
919                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
920         return -1;
921     }
922 
923     return 0;
924 }
925 
926 /*
927  * Controls whether we should intercept various exceptions on the guest,
928  * namely breakpoint/single-step events.
929  *
930  * The 'exceptions' argument accepts a bitmask, e.g:
931  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
932  */
933 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
934 {
935     struct whpx_state *whpx = &whpx_global;
936     WHV_PARTITION_PROPERTY prop = { 0, };
937     HRESULT hr;
938 
939     if (exceptions == whpx->exception_exit_bitmap) {
940         return S_OK;
941     }
942 
943     prop.ExceptionExitBitmap = exceptions;
944 
945     hr = whp_dispatch.WHvSetPartitionProperty(
946         whpx->partition,
947         WHvPartitionPropertyCodeExceptionExitBitmap,
948         &prop,
949         sizeof(WHV_PARTITION_PROPERTY));
950 
951     if (SUCCEEDED(hr)) {
952         whpx->exception_exit_bitmap = exceptions;
953     }
954 
955     return hr;
956 }
957 
958 
959 /*
960  * This function is called before/after stepping over a single instruction.
961  * It will update the CPU registers to arm/disarm the instruction stepping
962  * accordingly.
963  */
964 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
965     bool set,
966     uint64_t *exit_context_rflags)
967 {
968     WHV_REGISTER_NAME reg_name;
969     WHV_REGISTER_VALUE reg_value;
970     HRESULT hr;
971     struct whpx_state *whpx = &whpx_global;
972 
973     /*
974      * If we are trying to step over a single instruction, we need to set the
975      * TF bit in rflags. Otherwise, clear it.
976      */
977     reg_name = WHvX64RegisterRflags;
978     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
979         whpx->partition,
980         cpu->cpu_index,
981         &reg_name,
982         1,
983         &reg_value);
984 
985     if (FAILED(hr)) {
986         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
987         return hr;
988     }
989 
990     if (exit_context_rflags) {
991         assert(*exit_context_rflags == reg_value.Reg64);
992     }
993 
994     if (set) {
995         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
996         reg_value.Reg64 |= TF_MASK;
997     } else {
998         reg_value.Reg64 &= ~TF_MASK;
999     }
1000 
1001     if (exit_context_rflags) {
1002         *exit_context_rflags = reg_value.Reg64;
1003     }
1004 
1005     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1006         whpx->partition,
1007         cpu->cpu_index,
1008         &reg_name,
1009         1,
1010         &reg_value);
1011 
1012     if (FAILED(hr)) {
1013         error_report("WHPX: Failed to set rflags,"
1014             " hr=%08lx",
1015             hr);
1016         return hr;
1017     }
1018 
1019     reg_name = WHvRegisterInterruptState;
1020     reg_value.Reg64 = 0;
1021 
1022     /* Suspend delivery of hardware interrupts during single-stepping. */
1023     reg_value.InterruptState.InterruptShadow = set != 0;
1024 
1025     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1026     whpx->partition,
1027         cpu->cpu_index,
1028         &reg_name,
1029         1,
1030         &reg_value);
1031 
1032     if (FAILED(hr)) {
1033         error_report("WHPX: Failed to set InterruptState,"
1034             " hr=%08lx",
1035             hr);
1036         return hr;
1037     }
1038 
1039     if (!set) {
1040         /*
1041          * We have just finished stepping over a single instruction,
1042          * and intercepted the INT1 generated by it.
1043          * We need to now hide the INT1 from the guest,
1044          * as it would not be expecting it.
1045          */
1046 
1047         reg_name = WHvX64RegisterPendingDebugException;
1048         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1049         whpx->partition,
1050             cpu->cpu_index,
1051             &reg_name,
1052             1,
1053             &reg_value);
1054 
1055         if (FAILED(hr)) {
1056             error_report("WHPX: Failed to get pending debug exceptions,"
1057                          "hr=%08lx", hr);
1058             return hr;
1059         }
1060 
1061         if (reg_value.PendingDebugException.SingleStep) {
1062             reg_value.PendingDebugException.SingleStep = 0;
1063 
1064             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1065                 whpx->partition,
1066                 cpu->cpu_index,
1067                 &reg_name,
1068                 1,
1069                 &reg_value);
1070 
1071             if (FAILED(hr)) {
1072                 error_report("WHPX: Failed to clear pending debug exceptions,"
1073                              "hr=%08lx", hr);
1074              return hr;
1075             }
1076         }
1077 
1078     }
1079 
1080     return S_OK;
1081 }
1082 
1083 /* Tries to find a breakpoint at the specified address. */
1084 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1085 {
1086     struct whpx_state *whpx = &whpx_global;
1087     int i;
1088 
1089     if (whpx->breakpoints.breakpoints) {
1090         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1091             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1092                 return &whpx->breakpoints.breakpoints->data[i];
1093             }
1094         }
1095     }
1096 
1097     return NULL;
1098 }
1099 
1100 /*
1101  * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
1102  * debugging user-mode applications. Since the WHPX API does not offer
1103  * an easy way to pass the intercepted exception back to the guest, we
1104  * resort to using INT1 instead, and let the guest always handle INT3.
1105  */
1106 static const uint8_t whpx_breakpoint_instruction = 0xF1;
1107 
1108 /*
1109  * The WHPX QEMU backend implements breakpoints by writing the INT1
1110  * instruction into memory (ignoring the DRx registers). This raises a few
1111  * issues that need to be carefully handled:
1112  *
1113  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1114  *    at the same location, and later remove them in arbitrary order.
1115  *    This should not cause memory corruption, and should only remove the
1116  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1117  *
1118  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1119  *    physical location. Hence, physically adding/removing a breakpoint can
1120  *    theoretically fail at any time. We need to keep track of it.
1121  *
1122  * The function below rebuilds a list of low-level breakpoints (one per
1123  * address, tracking the original instruction and any errors) from the list of
1124  * high-level breakpoints (set via cpu_breakpoint_insert()).
1125  *
1126  * In order to optimize performance, this function stores the list of
1127  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1128  * low-level ones, so that it won't be re-invoked until these breakpoints
1129  * change.
1130  *
1131  * Note that this function decides which breakpoints should be inserted into,
1132  * memory, but doesn't actually do it. The memory accessing is done in
1133  * whpx_apply_breakpoints().
1134  */
static void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *bp;
    int cpu_bp_index = 0;

    /*
     * Snapshot the current CPU breakpoint addresses; the snapshot is
     * compared against cpu->breakpoints later (see whpx_first_vcpu_starting)
     * to detect whether this function needs to run again.
     */
    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);

    breakpoints->original_address_count = cpu_breakpoint_count;

    /*
     * Worst case: every old low-level breakpoint survives and every CPU
     * breakpoint is at a new address.
     */
    int max_breakpoints = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    struct whpx_breakpoint_collection *new_breakpoints =
        g_malloc0(sizeof(struct whpx_breakpoint_collection)
                  + max_breakpoints * sizeof(struct whpx_breakpoint));

    new_breakpoints->allocated = max_breakpoints;
    new_breakpoints->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        int i;
        for (i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                new_breakpoints->data[new_breakpoints->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        int i;
        bool found = false;

        /* This will be used to detect changed CPU breakpoints later. */
        breakpoints->original_addresses[cpu_bp_index++] = bp->pc;

        for (i = 0; i < new_breakpoints->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */

            if (new_breakpoints->data[i].address == bp->pc) {
                /* There was already a breakpoint at this address. */
                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    new_breakpoints->data[i].state = WHPX_BP_SET;
                } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
                    new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
                }

                found = true;
                break;
            }
        }

        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
            new_breakpoints->data[new_breakpoints->used].state =
                WHPX_BP_SET_PENDING;
            new_breakpoints->used++;
        }
    }

    /*
     * Free the previous breakpoint list. This can be optimized by keeping
     * it as shadow buffer for the next computation instead of freeing
     * it immediately.
     */
    g_free(breakpoints->breakpoints);

    breakpoints->breakpoints = new_breakpoints;
}
1221 
1222 /*
1223  * Physically inserts/removes the breakpoints by reading and writing the
 * physical memory, keeping track of the failed attempts.
1225  *
1226  * Passing resuming=true  will try to set all previously unset breakpoints.
1227  * Passing resuming=false will remove all inserted ones.
1228  */
static void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int i, rc;
    if (!breakpoints) {
        return;
    }

    for (i = 0; i < breakpoints->used; i++) {
        /* Decide what to do right now based on the last known state. */
        WhpxBreakpointState state = breakpoints->data[i].state;
        switch (state) {
        case WHPX_BP_CLEARED:
            if (resuming) {
                state = WHPX_BP_SET_PENDING;
            }
            break;
        case WHPX_BP_SET_PENDING:
            if (!resuming) {
                /* Never physically set; nothing to undo. */
                state = WHPX_BP_CLEARED;
            }
            break;
        case WHPX_BP_SET:
            if (!resuming) {
                state = WHPX_BP_CLEAR_PENDING;
            }
            break;
        case WHPX_BP_CLEAR_PENDING:
            if (resuming) {
                state = WHPX_BP_SET;
            }
            break;
        }

        if (state == WHPX_BP_SET_PENDING) {
            /* Remember the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                false);

            if (!rc) {
                /* Write the breakpoint instruction. */
                rc = cpu_memory_rw_debug(cpu,
                    breakpoints->data[i].address,
                    (void *)&whpx_breakpoint_instruction,
                    1,
                    true);
            }

            /* On failure the state stays *_PENDING and is retried later. */
            if (!rc) {
                state = WHPX_BP_SET;
            }

        }

        if (state == WHPX_BP_CLEAR_PENDING) {
            /* Restore the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                true);

            if (!rc) {
                state = WHPX_BP_CLEARED;
            }
        }

        /* Record the state actually reached, successful or not. */
        breakpoints->data[i].state = state;
    }
}
1304 
1305 /*
 * This function is called when a VCPU is about to start and no other
1307  * VCPUs have been started so far. Since the VCPU start order could be
1308  * arbitrary, it doesn't have to be VCPU#0.
1309  *
1310  * It is used to commit the breakpoints into memory, and configure WHPX
1311  * to intercept debug exceptions.
1312  *
1313  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1314  * more VCPUs are already running, so this is the best place to do it.
1315  */
1316 static int whpx_first_vcpu_starting(CPUState *cpu)
1317 {
1318     struct whpx_state *whpx = &whpx_global;
1319     HRESULT hr;
1320 
1321     g_assert(bql_locked());
1322 
1323     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1324             (whpx->breakpoints.breakpoints &&
1325              whpx->breakpoints.breakpoints->used)) {
1326         CPUBreakpoint *bp;
1327         int i = 0;
1328         bool update_pending = false;
1329 
1330         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1331             if (i >= whpx->breakpoints.original_address_count ||
1332                 bp->pc != whpx->breakpoints.original_addresses[i]) {
1333                 update_pending = true;
1334             }
1335 
1336             i++;
1337         }
1338 
1339         if (i != whpx->breakpoints.original_address_count) {
1340             update_pending = true;
1341         }
1342 
1343         if (update_pending) {
1344             /*
1345              * The CPU breakpoints have changed since the last call to
1346              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1347              * now be recomputed.
1348              */
1349             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1350         }
1351 
1352         /* Actually insert the breakpoints into the memory. */
1353         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1354     }
1355 
1356     uint64_t exception_mask;
1357     if (whpx->step_pending ||
1358         (whpx->breakpoints.breakpoints &&
1359          whpx->breakpoints.breakpoints->used)) {
1360         /*
1361          * We are either attempting to single-step one or more CPUs, or
1362          * have one or more breakpoints enabled. Both require intercepting
1363          * the WHvX64ExceptionTypeBreakpointTrap exception.
1364          */
1365 
1366         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1367     } else {
1368         /* Let the guest handle all exceptions. */
1369         exception_mask = 0;
1370     }
1371 
1372     hr = whpx_set_exception_exit_bitmap(exception_mask);
1373     if (!SUCCEEDED(hr)) {
1374         error_report("WHPX: Failed to update exception exit mask,"
1375                      "hr=%08lx.", hr);
1376         return 1;
1377     }
1378 
1379     return 0;
1380 }
1381 
1382 /*
1383  * This function is called when the last VCPU has finished running.
1384  * It is used to remove any previously set breakpoints from memory.
1385  */
1386 static int whpx_last_vcpu_stopping(CPUState *cpu)
1387 {
1388     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1389     return 0;
1390 }
1391 
1392 /* Returns the address of the next instruction that is about to be executed. */
1393 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1394 {
1395     if (cpu->vcpu_dirty) {
1396         /* The CPU registers have been modified by other parts of QEMU. */
1397         return cpu_env(cpu)->eip;
1398     } else if (exit_context_valid) {
1399         /*
1400          * The CPU registers have not been modified by neither other parts
1401          * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1402          * This is the most common case.
1403          */
1404         AccelCPUState *vcpu = cpu->accel;
1405         return vcpu->exit_ctx.VpContext.Rip;
1406     } else {
1407         /*
1408          * The CPU registers have been modified by a call to
1409          * WHvSetVirtualProcessorRegisters() and must be re-queried from
1410          * the target.
1411          */
1412         WHV_REGISTER_VALUE reg_value;
1413         WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1414         HRESULT hr;
1415         struct whpx_state *whpx = &whpx_global;
1416 
1417         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1418             whpx->partition,
1419             cpu->cpu_index,
1420             &reg_name,
1421             1,
1422             &reg_value);
1423 
1424         if (FAILED(hr)) {
1425             error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1426             return 0;
1427         }
1428 
1429         return reg_value.Reg64;
1430     }
1431 }
1432 
1433 static int whpx_handle_halt(CPUState *cpu)
1434 {
1435     int ret = 0;
1436 
1437     bql_lock();
1438     if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1439           (cpu_env(cpu)->eflags & IF_MASK)) &&
1440         !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1441         cpu->exception_index = EXCP_HLT;
1442         cpu->halted = true;
1443         ret = 1;
1444     }
1445     bql_unlock();
1446 
1447     return ret;
1448 }
1449 
/*
 * Prepare the VCPU for the next WHvRunVirtualProcessor() call: inject
 * pending NMIs/hard interrupts, sync the TPR into CR8, and register for
 * an interrupt-window notification when one is needed. Collected register
 * writes are committed in a single WHvSetVirtualProcessorRegisters() call.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    bql_lock();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        /* NOTE(review): SMI requests are acknowledged but never injected. */
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        /* APIC-in-platform mode: deliver PIC interrupts as pending events. */
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
     }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    /* Commit all collected register updates in one hypercall. */
    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }
}
1571 
1572 static void whpx_vcpu_post_run(CPUState *cpu)
1573 {
1574     AccelCPUState *vcpu = cpu->accel;
1575     X86CPU *x86_cpu = X86_CPU(cpu);
1576     CPUX86State *env = &x86_cpu->env;
1577 
1578     env->eflags = vcpu->exit_ctx.VpContext.Rflags;
1579 
1580     uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
1581     if (vcpu->tpr != tpr) {
1582         vcpu->tpr = tpr;
1583         bql_lock();
1584         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
1585         bql_unlock();
1586     }
1587 
1588     vcpu->interruption_pending =
1589         vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
1590 
1591     vcpu->interruptable =
1592         !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
1593 }
1594 
/*
 * Handle events queued for this VCPU while it was not running: INIT,
 * SIPI, APIC poll, TPR access reports, and un-halting when an interrupt
 * becomes deliverable. State is synchronized from WHPX before handlers
 * that read or modify the full register set.
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;

    /* INIT is blocked while in SMM. */
    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    /* Wake a halted CPU if an interrupt can now be delivered. */
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }
}
1631 
1632 static int whpx_vcpu_run(CPUState *cpu)
1633 {
1634     HRESULT hr;
1635     struct whpx_state *whpx = &whpx_global;
1636     AccelCPUState *vcpu = cpu->accel;
1637     struct whpx_breakpoint *stepped_over_bp = NULL;
1638     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1639     int ret;
1640 
1641     g_assert(bql_locked());
1642 
1643     if (whpx->running_cpus++ == 0) {
1644         /* Insert breakpoints into memory, update exception exit bitmap. */
1645         ret = whpx_first_vcpu_starting(cpu);
1646         if (ret != 0) {
1647             return ret;
1648         }
1649     }
1650 
1651     if (whpx->breakpoints.breakpoints &&
1652         whpx->breakpoints.breakpoints->used > 0)
1653     {
1654         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1655         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1656         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1657             stepped_over_bp = NULL;
1658         }
1659 
1660         if (stepped_over_bp) {
1661             /*
1662              * We are trying to run the instruction overwritten by an active
1663              * breakpoint. We will temporarily disable the breakpoint, suspend
1664              * other CPUs, and step over the instruction.
1665              */
1666             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1667         }
1668     }
1669 
1670     if (exclusive_step_mode == WHPX_STEP_NONE) {
1671         whpx_vcpu_process_async_events(cpu);
1672         if (cpu->halted && !whpx_apic_in_platform()) {
1673             cpu->exception_index = EXCP_HLT;
1674             qatomic_set(&cpu->exit_request, false);
1675             return 0;
1676         }
1677     }
1678 
1679     bql_unlock();
1680 
1681     if (exclusive_step_mode != WHPX_STEP_NONE) {
1682         start_exclusive();
1683         g_assert(cpu == current_cpu);
1684         g_assert(!cpu->running);
1685         cpu->running = true;
1686 
1687         hr = whpx_set_exception_exit_bitmap(
1688             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1689         if (!SUCCEEDED(hr)) {
1690             error_report("WHPX: Failed to update exception exit mask, "
1691                          "hr=%08lx.", hr);
1692             return 1;
1693         }
1694 
1695         if (stepped_over_bp) {
1696             /* Temporarily disable the triggered breakpoint. */
1697             cpu_memory_rw_debug(cpu,
1698                 stepped_over_bp->address,
1699                 &stepped_over_bp->original_instruction,
1700                 1,
1701                 true);
1702         }
1703     } else {
1704         cpu_exec_start(cpu);
1705     }
1706 
1707     do {
1708         if (cpu->vcpu_dirty) {
1709             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1710             cpu->vcpu_dirty = false;
1711         }
1712 
1713         if (exclusive_step_mode == WHPX_STEP_NONE) {
1714             whpx_vcpu_pre_run(cpu);
1715 
1716             if (qatomic_read(&cpu->exit_request)) {
1717                 whpx_vcpu_kick(cpu);
1718             }
1719         }
1720 
1721         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1722             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1723         }
1724 
1725         hr = whp_dispatch.WHvRunVirtualProcessor(
1726             whpx->partition, cpu->cpu_index,
1727             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1728 
1729         if (FAILED(hr)) {
1730             error_report("WHPX: Failed to exec a virtual processor,"
1731                          " hr=%08lx", hr);
1732             ret = -1;
1733             break;
1734         }
1735 
1736         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1737             whpx_vcpu_configure_single_stepping(cpu,
1738                 false,
1739                 &vcpu->exit_ctx.VpContext.Rflags);
1740         }
1741 
1742         whpx_vcpu_post_run(cpu);
1743 
1744         switch (vcpu->exit_ctx.ExitReason) {
1745         case WHvRunVpExitReasonMemoryAccess:
1746             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1747             break;
1748 
1749         case WHvRunVpExitReasonX64IoPortAccess:
1750             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1751             break;
1752 
1753         case WHvRunVpExitReasonX64InterruptWindow:
1754             vcpu->ready_for_pic_interrupt = 1;
1755             vcpu->window_registered = 0;
1756             ret = 0;
1757             break;
1758 
1759         case WHvRunVpExitReasonX64ApicEoi:
1760             assert(whpx_apic_in_platform());
1761             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1762             break;
1763 
1764         case WHvRunVpExitReasonX64Halt:
1765             /*
1766              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1767              * longer used.
1768              */
1769             ret = whpx_handle_halt(cpu);
1770             break;
1771 
1772         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1773             WHV_INTERRUPT_CONTROL ipi = {0};
1774             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1775             uint32_t delivery_mode =
1776                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1777             int dest_shorthand =
1778                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1779             bool broadcast = false;
1780             bool include_self = false;
1781             uint32_t i;
1782 
1783             /* We only registered for INIT and SIPI exits. */
1784             if ((delivery_mode != APIC_DM_INIT) &&
1785                 (delivery_mode != APIC_DM_SIPI)) {
1786                 error_report(
1787                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1788                 break;
1789             }
1790 
1791             if (delivery_mode == APIC_DM_INIT) {
1792                 ipi.Type = WHvX64InterruptTypeInit;
1793             } else {
1794                 ipi.Type = WHvX64InterruptTypeSipi;
1795             }
1796 
1797             ipi.DestinationMode =
1798                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1799                     WHvX64InterruptDestinationModeLogical :
1800                     WHvX64InterruptDestinationModePhysical;
1801 
1802             ipi.TriggerMode =
1803                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1804                     WHvX64InterruptTriggerModeLevel :
1805                     WHvX64InterruptTriggerModeEdge;
1806 
1807             ipi.Vector = icr & APIC_VECTOR_MASK;
1808             switch (dest_shorthand) {
1809             /* no shorthand. Bits 56-63 contain the destination. */
1810             case 0:
1811                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1812                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1813                         &ipi, sizeof(ipi));
1814                 if (FAILED(hr)) {
1815                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1816                         hr);
1817                 }
1818 
1819                 break;
1820 
1821             /* self */
1822             case 1:
1823                 include_self = true;
1824                 break;
1825 
1826             /* broadcast, including self */
1827             case 2:
1828                 broadcast = true;
1829                 include_self = true;
1830                 break;
1831 
1832             /* broadcast, excluding self */
1833             case 3:
1834                 broadcast = true;
1835                 break;
1836             }
1837 
1838             if (!broadcast && !include_self) {
1839                 break;
1840             }
1841 
1842             for (i = 0; i <= max_vcpu_index; i++) {
1843                 if (i == cpu->cpu_index && !include_self) {
1844                     continue;
1845                 }
1846 
1847                 /*
1848                  * Assuming that APIC Ids are identity mapped since
1849                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1850                  * are not handled yet and the hypervisor doesn't allow the
1851                  * guest to modify the APIC ID.
1852                  */
1853                 ipi.Destination = i;
1854                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1855                         &ipi, sizeof(ipi));
1856                 if (FAILED(hr)) {
1857                     error_report(
1858                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1859                         i, hr);
1860                 }
1861             }
1862 
1863             break;
1864         }
1865 
1866         case WHvRunVpExitReasonCanceled:
1867             if (exclusive_step_mode != WHPX_STEP_NONE) {
1868                 /*
1869                  * We are trying to step over a single instruction, and
1870                  * likely got a request to stop from another thread.
1871                  * Delay it until we are done stepping
1872                  * over.
1873                  */
1874                 ret = 0;
1875             } else {
1876                 cpu->exception_index = EXCP_INTERRUPT;
1877                 ret = 1;
1878             }
1879             break;
1880         case WHvRunVpExitReasonX64MsrAccess: {
1881             WHV_REGISTER_VALUE reg_values[3] = {0};
1882             WHV_REGISTER_NAME reg_names[3];
1883             UINT32 reg_count;
1884 
1885             reg_names[0] = WHvX64RegisterRip;
1886             reg_names[1] = WHvX64RegisterRax;
1887             reg_names[2] = WHvX64RegisterRdx;
1888 
1889             reg_values[0].Reg64 =
1890                 vcpu->exit_ctx.VpContext.Rip +
1891                 vcpu->exit_ctx.VpContext.InstructionLength;
1892 
1893             /*
1894              * For all unsupported MSR access we:
1895              *     ignore writes
1896              *     return 0 on read.
1897              */
1898             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1899                         1 : 3;
1900 
1901             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1902                 whpx->partition,
1903                 cpu->cpu_index,
1904                 reg_names, reg_count,
1905                 reg_values);
1906 
1907             if (FAILED(hr)) {
1908                 error_report("WHPX: Failed to set MsrAccess state "
1909                              " registers, hr=%08lx", hr);
1910             }
1911             ret = 0;
1912             break;
1913         }
1914         case WHvRunVpExitReasonX64Cpuid: {
1915             WHV_REGISTER_VALUE reg_values[5];
1916             WHV_REGISTER_NAME reg_names[5];
1917             UINT32 reg_count = 5;
1918             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1919             X86CPU *x86_cpu = X86_CPU(cpu);
1920             CPUX86State *env = &x86_cpu->env;
1921 
1922             memset(reg_values, 0, sizeof(reg_values));
1923 
1924             rip = vcpu->exit_ctx.VpContext.Rip +
1925                   vcpu->exit_ctx.VpContext.InstructionLength;
1926             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1927 
1928             /*
1929              * Ideally, these should be supplied to the hypervisor during VCPU
1930              * initialization and it should be able to satisfy this request.
1931              * But, currently, WHPX doesn't support setting CPUID values in the
1932              * hypervisor once the partition has been setup, which is too late
1933              * since VCPUs are realized later. For now, use the values from
1934              * QEMU to satisfy these requests, until WHPX adds support for
1935              * being able to set these values in the hypervisor at runtime.
1936              */
1937             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1938                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1939             switch (cpuid_fn) {
1940             case 0x40000000:
1941                 /* Expose the vmware cpu frequency cpuid leaf */
1942                 rax = 0x40000010;
1943                 rbx = rcx = rdx = 0;
1944                 break;
1945 
1946             case 0x40000010:
1947                 rax = env->tsc_khz;
1948                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1949                 rcx = rdx = 0;
1950                 break;
1951 
1952             case 0x80000001:
1953                 /* Remove any support of OSVW */
1954                 rcx &= ~CPUID_EXT3_OSVW;
1955                 break;
1956             }
1957 
1958             reg_names[0] = WHvX64RegisterRip;
1959             reg_names[1] = WHvX64RegisterRax;
1960             reg_names[2] = WHvX64RegisterRcx;
1961             reg_names[3] = WHvX64RegisterRdx;
1962             reg_names[4] = WHvX64RegisterRbx;
1963 
1964             reg_values[0].Reg64 = rip;
1965             reg_values[1].Reg64 = rax;
1966             reg_values[2].Reg64 = rcx;
1967             reg_values[3].Reg64 = rdx;
1968             reg_values[4].Reg64 = rbx;
1969 
1970             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1971                 whpx->partition, cpu->cpu_index,
1972                 reg_names,
1973                 reg_count,
1974                 reg_values);
1975 
1976             if (FAILED(hr)) {
1977                 error_report("WHPX: Failed to set CpuidAccess state registers,"
1978                              " hr=%08lx", hr);
1979             }
1980             ret = 0;
1981             break;
1982         }
1983         case WHvRunVpExitReasonException:
1984             whpx_get_registers(cpu);
1985 
1986             if ((vcpu->exit_ctx.VpException.ExceptionType ==
1987                  WHvX64ExceptionTypeDebugTrapOrFault) &&
1988                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
1989                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
1990                  whpx_breakpoint_instruction)) {
1991                 /* Stopped at a software breakpoint. */
1992                 cpu->exception_index = EXCP_DEBUG;
1993             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
1994                         WHvX64ExceptionTypeDebugTrapOrFault) &&
1995                        !cpu->singlestep_enabled) {
1996                 /*
1997                  * Just finished stepping over a breakpoint, but the
1998                  * gdb does not expect us to do single-stepping.
1999                  * Don't do anything special.
2000                  */
2001                 cpu->exception_index = EXCP_INTERRUPT;
2002             } else {
2003                 /* Another exception or debug event. Report it to GDB. */
2004                 cpu->exception_index = EXCP_DEBUG;
2005             }
2006 
2007             ret = 1;
2008             break;
2009         case WHvRunVpExitReasonNone:
2010         case WHvRunVpExitReasonUnrecoverableException:
2011         case WHvRunVpExitReasonInvalidVpRegisterValue:
2012         case WHvRunVpExitReasonUnsupportedFeature:
2013         default:
2014             error_report("WHPX: Unexpected VP exit code %d",
2015                          vcpu->exit_ctx.ExitReason);
2016             whpx_get_registers(cpu);
2017             bql_lock();
2018             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2019             bql_unlock();
2020             break;
2021         }
2022 
2023     } while (!ret);
2024 
2025     if (stepped_over_bp) {
2026         /* Restore the breakpoint we stepped over */
2027         cpu_memory_rw_debug(cpu,
2028             stepped_over_bp->address,
2029             (void *)&whpx_breakpoint_instruction,
2030             1,
2031             true);
2032     }
2033 
2034     if (exclusive_step_mode != WHPX_STEP_NONE) {
2035         g_assert(cpu_in_exclusive_context(cpu));
2036         cpu->running = false;
2037         end_exclusive();
2038 
2039         exclusive_step_mode = WHPX_STEP_NONE;
2040     } else {
2041         cpu_exec_end(cpu);
2042     }
2043 
2044     bql_lock();
2045     current_cpu = cpu;
2046 
2047     if (--whpx->running_cpus == 0) {
2048         whpx_last_vcpu_stopping(cpu);
2049     }
2050 
2051     qatomic_set(&cpu->exit_request, false);
2052 
2053     return ret < 0;
2054 }
2055 
2056 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2057 {
2058     if (!cpu->vcpu_dirty) {
2059         whpx_get_registers(cpu);
2060         cpu->vcpu_dirty = true;
2061     }
2062 }
2063 
/*
 * Executed on the vCPU thread after a system reset: push QEMU's reset
 * register state into the hypervisor and mark the cache clean.
 */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->vcpu_dirty = false;
}
2070 
/*
 * Executed on the vCPU thread after machine init (or loadvm): push the
 * full register state into the hypervisor and mark the cache clean.
 */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->vcpu_dirty = false;
}
2077 
/*
 * Executed on the vCPU thread before loading a VM snapshot: mark the
 * cached state dirty so the incoming state is pushed to the hypervisor
 * later instead of being overwritten by a read-back.
 */
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
2083 
2084 /*
2085  * CPU support.
2086  */
2087 
2088 void whpx_cpu_synchronize_state(CPUState *cpu)
2089 {
2090     if (!cpu->vcpu_dirty) {
2091         run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
2092     }
2093 }
2094 
/* Push reset-time register state to the hypervisor (on the vCPU thread). */
void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
2099 
/* Push the full register state to the hypervisor (on the vCPU thread). */
void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
2104 
/* Mark cached state dirty ahead of loading a snapshot (on the vCPU thread). */
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
2109 
/* Latch whether a single-step is pending before the VM resumes running. */
static void whpx_pre_resume_vm(AccelState *as, bool step_pending)
{
    whpx_global.step_pending = step_pending;
}
2114 
2115 /*
2116  * Vcpu support.
2117  */
2118 
/* Set once by the first whpx_init_vcpu() call; blocks migration under WHPX. */
static Error *whpx_migration_blocker;
2120 
/*
 * VM run-state change handler: mark the cached TSC value stale whenever
 * the VM (re)starts running, so it is re-read on next access.
 */
static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (running) {
        env->tsc_valid = false;
    }
}
2129 
2130 int whpx_init_vcpu(CPUState *cpu)
2131 {
2132     HRESULT hr;
2133     struct whpx_state *whpx = &whpx_global;
2134     AccelCPUState *vcpu = NULL;
2135     Error *local_error = NULL;
2136     X86CPU *x86_cpu = X86_CPU(cpu);
2137     CPUX86State *env = &x86_cpu->env;
2138     UINT64 freq = 0;
2139     int ret;
2140 
2141     /* Add migration blockers for all unsupported features of the
2142      * Windows Hypervisor Platform
2143      */
2144     if (whpx_migration_blocker == NULL) {
2145         error_setg(&whpx_migration_blocker,
2146                "State blocked due to non-migratable CPUID feature support,"
2147                "dirty memory tracking support, and XSAVE/XRSTOR support");
2148 
2149         if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
2150             error_report_err(local_error);
2151             ret = -EINVAL;
2152             goto error;
2153         }
2154     }
2155 
2156     vcpu = g_new0(AccelCPUState, 1);
2157 
2158     hr = whp_dispatch.WHvEmulatorCreateEmulator(
2159         &whpx_emu_callbacks,
2160         &vcpu->emulator);
2161     if (FAILED(hr)) {
2162         error_report("WHPX: Failed to setup instruction completion support,"
2163                      " hr=%08lx", hr);
2164         ret = -EINVAL;
2165         goto error;
2166     }
2167 
2168     hr = whp_dispatch.WHvCreateVirtualProcessor(
2169         whpx->partition, cpu->cpu_index, 0);
2170     if (FAILED(hr)) {
2171         error_report("WHPX: Failed to create a virtual processor,"
2172                      " hr=%08lx", hr);
2173         whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2174         ret = -EINVAL;
2175         goto error;
2176     }
2177 
2178     /*
2179      * vcpu's TSC frequency is either specified by user, or use the value
2180      * provided by Hyper-V if the former is not present. In the latter case, we
2181      * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
2182      * frequency can be migrated later via this field.
2183      */
2184     if (!env->tsc_khz) {
2185         hr = whp_dispatch.WHvGetCapability(
2186             WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
2187                 NULL);
2188         if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2189             if (FAILED(hr)) {
2190                 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
2191             } else {
2192                 env->tsc_khz = freq / 1000; /* Hz to KHz */
2193             }
2194         }
2195     }
2196 
2197     env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
2198     hr = whp_dispatch.WHvGetCapability(
2199         WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
2200     if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2201         if (FAILED(hr)) {
2202             printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
2203         } else {
2204             env->apic_bus_freq = freq;
2205         }
2206     }
2207 
2208     /*
2209      * If the vmware cpuid frequency leaf option is set, and we have a valid
2210      * tsc value, trap the corresponding cpuid's.
2211      */
2212     if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
2213         UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
2214 
2215         hr = whp_dispatch.WHvSetPartitionProperty(
2216                 whpx->partition,
2217                 WHvPartitionPropertyCodeCpuidExitList,
2218                 cpuidExitList,
2219                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2220 
2221         if (FAILED(hr)) {
2222             error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2223                         hr);
2224             ret = -EINVAL;
2225             goto error;
2226         }
2227     }
2228 
2229     vcpu->interruptable = true;
2230     cpu->vcpu_dirty = true;
2231     cpu->accel = vcpu;
2232     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
2233     qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);
2234 
2235     return 0;
2236 
2237 error:
2238     g_free(vcpu);
2239 
2240     return ret;
2241 }
2242 
2243 int whpx_vcpu_exec(CPUState *cpu)
2244 {
2245     int ret;
2246     int fatal;
2247 
2248     for (;;) {
2249         if (cpu->exception_index >= EXCP_INTERRUPT) {
2250             ret = cpu->exception_index;
2251             cpu->exception_index = -1;
2252             break;
2253         }
2254 
2255         fatal = whpx_vcpu_run(cpu);
2256 
2257         if (fatal) {
2258             error_report("WHPX: Failed to exec a virtual processor");
2259             abort();
2260         }
2261     }
2262 
2263     return ret;
2264 }
2265 
2266 void whpx_destroy_vcpu(CPUState *cpu)
2267 {
2268     struct whpx_state *whpx = &whpx_global;
2269     AccelCPUState *vcpu = cpu->accel;
2270 
2271     whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
2272     whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2273     g_free(cpu->accel);
2274 }
2275 
/*
 * Force the vCPU out of WHvRunVirtualProcessor so its run loop can
 * service a pending request.
 */
void whpx_vcpu_kick(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    whp_dispatch.WHvCancelRunVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
}
2282 
2283 /*
2284  * Memory support.
2285  */
2286 
2287 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
2288                                 void *host_va, int add, int rom,
2289                                 const char *name)
2290 {
2291     struct whpx_state *whpx = &whpx_global;
2292     HRESULT hr;
2293 
2294     /*
2295     if (add) {
2296         printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2297                (void*)start_pa, (void*)size, host_va,
2298                (rom ? "ROM" : "RAM"), name);
2299     } else {
2300         printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
2301                (void*)start_pa, (void*)size, host_va, name);
2302     }
2303     */
2304 
2305     if (add) {
2306         hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
2307                                          host_va,
2308                                          start_pa,
2309                                          size,
2310                                          (WHvMapGpaRangeFlagRead |
2311                                           WHvMapGpaRangeFlagExecute |
2312                                           (rom ? 0 : WHvMapGpaRangeFlagWrite)));
2313     } else {
2314         hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
2315                                            start_pa,
2316                                            size);
2317     }
2318 
2319     if (FAILED(hr)) {
2320         error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2321                      " Host:%p, hr=%08lx",
2322                      (add ? "MAP" : "UNMAP"), name,
2323                      (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
2324     }
2325 }
2326 
2327 static void whpx_process_section(MemoryRegionSection *section, int add)
2328 {
2329     MemoryRegion *mr = section->mr;
2330     hwaddr start_pa = section->offset_within_address_space;
2331     ram_addr_t size = int128_get64(section->size);
2332     unsigned int delta;
2333     uint64_t host_va;
2334 
2335     if (!memory_region_is_ram(mr)) {
2336         return;
2337     }
2338 
2339     delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
2340     delta &= ~qemu_real_host_page_mask();
2341     if (delta > size) {
2342         return;
2343     }
2344     start_pa += delta;
2345     size -= delta;
2346     size &= qemu_real_host_page_mask();
2347     if (!size || (start_pa & ~qemu_real_host_page_mask())) {
2348         return;
2349     }
2350 
2351     host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
2352             + section->offset_within_region + delta;
2353 
2354     whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
2355                         memory_region_is_rom(mr), mr->name);
2356 }
2357 
/* MemoryListener hook: take a ref on the region, then map it in. */
static void whpx_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}
2364 
/* MemoryListener hook: unmap the region, then drop the ref taken on add. */
static void whpx_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}
2371 
/* No transaction batching: mappings are applied eagerly in region_add/del. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}
2375 
/* Nothing to commit; see whpx_transaction_begin(). */
static void whpx_transaction_commit(MemoryListener *listener)
{
}
2379 
2380 static void whpx_log_sync(MemoryListener *listener,
2381                          MemoryRegionSection *section)
2382 {
2383     MemoryRegion *mr = section->mr;
2384 
2385     if (!memory_region_is_ram(mr)) {
2386         return;
2387     }
2388 
2389     memory_region_set_dirty(mr, 0, int128_get64(section->size));
2390 }
2391 
/* Listener that mirrors guest RAM layout into the WHPX partition. */
static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
};
2401 
/* Attach the WHPX memory listener to the system memory address space. */
static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}
2406 
2407 /*
2408  * Load the functions from the given library, using the given handle. If a
2409  * handle is provided, it is used, otherwise the library is opened. The
2410  * handle will be updated on return with the opened one.
2411  */
static bool load_whp_dispatch_fns(HMODULE *handle,
    WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

    #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
    #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
    /* Resolve one symbol into whp_dispatch; missing symbols are tolerated. */
    #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \

    /* Resolve one symbol into whp_dispatch; a missing symbol is fatal. */
    #define WHP_LOAD_FIELD(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \
        if (!whp_dispatch.function_name) { \
            error_report("Could not load function %s", #function_name); \
            goto error; \
        } \

    /* Open the DLL unless the caller already holds a handle to it. */
    #define WHP_LOAD_LIB(lib_name, handle_lib) \
    if (!handle_lib) { \
        handle_lib = LoadLibrary(lib_name); \
        if (!handle_lib) { \
            error_report("Could not load library %s.", lib_name); \
            goto error; \
        } \
    } \

    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    if (hLib) {
        FreeLibrary(hLib);
    }

    return false;
}
2467 
2468 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2469                                    const char *name, void *opaque,
2470                                    Error **errp)
2471 {
2472     struct whpx_state *whpx = &whpx_global;
2473     OnOffSplit mode;
2474 
2475     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2476         return;
2477     }
2478 
2479     switch (mode) {
2480     case ON_OFF_SPLIT_ON:
2481         whpx->kernel_irqchip_allowed = true;
2482         whpx->kernel_irqchip_required = true;
2483         break;
2484 
2485     case ON_OFF_SPLIT_OFF:
2486         whpx->kernel_irqchip_allowed = false;
2487         whpx->kernel_irqchip_required = false;
2488         break;
2489 
2490     case ON_OFF_SPLIT_SPLIT:
2491         error_setg(errp, "WHPX: split irqchip currently not supported");
2492         error_append_hint(errp,
2493             "Try without kernel-irqchip or with kernel-irqchip=on|off");
2494         break;
2495 
2496     default:
2497         /*
2498          * The value was checked in visit_type_OnOffSplit() above. If
2499          * we get here, then something is wrong in QEMU.
2500          */
2501         abort();
2502     }
2503 }
2504 
/* Accel-CPU instance init hook: apply host CPU defaults to the vCPU model. */
static void whpx_cpu_instance_init(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);

    host_cpu_instance_init(cpu);
}
2511 
/* Class init for the WHPX accel-CPU type: install the instance-init hook. */
static void whpx_cpu_accel_class_init(ObjectClass *oc, const void *data)
{
    AccelCPUClass *acc = ACCEL_CPU_CLASS(oc);

    acc->cpu_instance_init = whpx_cpu_instance_init;
}
2518 
/* Abstract QOM type registering WHPX-specific CPU initialization. */
static const TypeInfo whpx_cpu_accel_type = {
    .name = ACCEL_CPU_NAME("whpx"),

    .parent = TYPE_ACCEL_CPU,
    .class_init = whpx_cpu_accel_class_init,
    .abstract = true,
};
2526 
2527 /*
2528  * Partition support
2529  */
2530 
/*
 * Accelerator init: create and configure the WHPX partition. The steps
 * below are order-sensitive — all partition properties must be set
 * before WHvSetupPartition() finalizes the partition.
 * Returns 0 on success, negative errno on failure; on failure the
 * partition (if created) is deleted.
 */
static int whpx_accel_init(AccelState *as, MachineState *ms)
{
    struct whpx_state *whpx;
    int ret;
    HRESULT hr;
    WHV_CAPABILITY whpx_cap;
    UINT32 whpx_cap_size;
    WHV_PARTITION_PROPERTY prop;
    UINT32 cpuidExitList[] = {1, 0x80000001};
    WHV_CAPABILITY_FEATURES features = {0};

    whpx = &whpx_global;

    /* Resolve the WinHvPlatform/WinHvEmulation DLL entry points. */
    if (!init_whp_dispatch()) {
        ret = -ENOSYS;
        goto error;
    }

    whpx->mem_quota = ms->ram_size;

    /* Bail out early if the Windows hypervisor is not available. */
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeHypervisorPresent, &whpx_cap,
        sizeof(whpx_cap), &whpx_cap_size);
    if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
        error_report("WHPX: No accelerator found, hr=%08lx", hr);
        ret = -ENOSPC;
        goto error;
    }

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Query the XSAVE capability of the partition. Any error here is not
     * considered fatal.
     */
    hr = whp_dispatch.WHvGetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorXsaveFeatures,
        &whpx_xsave_cap,
        sizeof(whpx_xsave_cap),
        &whpx_cap_size);

    /*
     * Windows version which don't support this property will return with the
     * specific error code.
     */
    if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
        error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
    }

    if (!whpx_has_xsave()) {
        printf("WHPX: Partition is not XSAVE capable\n");
    }

    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ProcessorCount = ms->smp.cpus;
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorCount,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition processor count to %u,"
                     " hr=%08lx", prop.ProcessorCount, hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Error out if WHP doesn't support apic emulation and user is requiring
     * it.
     */
    if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
            !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
        error_report("WHPX: kernel irqchip requested, but unavailable. "
            "Try without kernel-irqchip or with kernel-irqchip=off");
        ret = -EINVAL;
        goto error;
    }

    /* Enable in-hypervisor APIC emulation when allowed and supported. */
    if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
        whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
        WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
            WHvX64LocalApicEmulationModeXApic;
        printf("WHPX: setting APIC emulation mode in the hypervisor\n");
        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeLocalApicEmulationMode,
            &mode,
            sizeof(mode));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
            if (whpx->kernel_irqchip_required) {
                error_report("WHPX: kernel irqchip requested, but unavailable");
                ret = -EINVAL;
                goto error;
            }
        } else {
            whpx->apic_in_platform = true;
        }
    }

    /* Register for MSR and CPUID exits */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ExtendedVmExits.X64MsrExit = 1;
    prop.ExtendedVmExits.X64CpuidExit = 1;
    prop.ExtendedVmExits.ExceptionExit = 1;
    if (whpx_apic_in_platform()) {
        prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeExtendedVmExits,
            &prop,
            sizeof(WHV_PARTITION_PROPERTY));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeCpuidExitList,
        cpuidExitList,
        RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                     hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * We do not want to intercept any exceptions from the guest,
     * until we actually start debugging with gdb.
     */
    whpx->exception_exit_bitmap = -1;
    hr = whpx_set_exception_exit_bitmap(0);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /* Finalize the partition; properties are fixed after this point. */
    hr = whp_dispatch.WHvSetupPartition(whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    whpx_memory_init();

    printf("Windows Hypervisor Platform accelerator is operational\n");
    return 0;

error:

    if (NULL != whpx->partition) {
        whp_dispatch.WHvDeletePartition(whpx->partition);
        whpx->partition = NULL;
    }

    return ret;
}
2714 
2715 bool whpx_apic_in_platform(void) {
2716     return whpx_global.apic_in_platform;
2717 }
2718 
2719 static void whpx_accel_class_init(ObjectClass *oc, const void *data)
2720 {
2721     AccelClass *ac = ACCEL_CLASS(oc);
2722     ac->name = "WHPX";
2723     ac->init_machine = whpx_accel_init;
2724     ac->pre_resume_vm = whpx_pre_resume_vm;
2725     ac->allowed = &whpx_allowed;
2726 
2727     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
2728         NULL, whpx_set_kernel_irqchip,
2729         NULL, NULL);
2730     object_class_property_set_description(oc, "kernel-irqchip",
2731         "Configure WHPX in-kernel irqchip");
2732 }
2733 
2734 static void whpx_accel_instance_init(Object *obj)
2735 {
2736     struct whpx_state *whpx = &whpx_global;
2737 
2738     memset(whpx, 0, sizeof(struct whpx_state));
2739     /* Turn on kernel-irqchip, by default */
2740     whpx->kernel_irqchip_allowed = true;
2741 }
2742 
2743 static const TypeInfo whpx_accel_type = {
2744     .name = ACCEL_CLASS_NAME("whpx"),
2745     .parent = TYPE_ACCEL,
2746     .instance_init = whpx_accel_instance_init,
2747     .class_init = whpx_accel_class_init,
2748 };
2749 
2750 static void whpx_type_init(void)
2751 {
2752     type_register_static(&whpx_accel_type);
2753     type_register_static(&whpx_cpu_accel_type);
2754 }
2755 
2756 bool init_whp_dispatch(void)
2757 {
2758     if (whp_dispatch_initialized) {
2759         return true;
2760     }
2761 
2762     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2763         goto error;
2764     }
2765 
2766     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2767         goto error;
2768     }
2769 
2770     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2771         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2772     whp_dispatch_initialized = true;
2773 
2774     return true;
2775 error:
2776     if (hWinHvPlatform) {
2777         FreeLibrary(hWinHvPlatform);
2778     }
2779 
2780     if (hWinHvEmulation) {
2781         FreeLibrary(hWinHvEmulation);
2782     }
2783 
2784     return false;
2785 }
2786 
/* Register the WHPX QOM types with QEMU at module-load time. */
type_init(whpx_type_init);
2788