xref: /openbmc/qemu/target/i386/whpx/whpx-all.c (revision 06152b89db64bc5ccec1e54576706ba891654df9)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "exec/address-spaces.h"
14 #include "exec/ioport.h"
15 #include "gdbstub/helpers.h"
16 #include "qemu/accel.h"
17 #include "sysemu/whpx.h"
18 #include "sysemu/cpus.h"
19 #include "sysemu/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/intc/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include <winerror.h>
30 
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
33 
34 #include <winhvplatform.h>
35 #include <winhvemulation.h>
36 
37 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
38 
39 static const WHV_REGISTER_NAME whpx_register_names[] = {
40 
41     /* X64 General purpose registers */
42     WHvX64RegisterRax,
43     WHvX64RegisterRcx,
44     WHvX64RegisterRdx,
45     WHvX64RegisterRbx,
46     WHvX64RegisterRsp,
47     WHvX64RegisterRbp,
48     WHvX64RegisterRsi,
49     WHvX64RegisterRdi,
50     WHvX64RegisterR8,
51     WHvX64RegisterR9,
52     WHvX64RegisterR10,
53     WHvX64RegisterR11,
54     WHvX64RegisterR12,
55     WHvX64RegisterR13,
56     WHvX64RegisterR14,
57     WHvX64RegisterR15,
58     WHvX64RegisterRip,
59     WHvX64RegisterRflags,
60 
61     /* X64 Segment registers */
62     WHvX64RegisterEs,
63     WHvX64RegisterCs,
64     WHvX64RegisterSs,
65     WHvX64RegisterDs,
66     WHvX64RegisterFs,
67     WHvX64RegisterGs,
68     WHvX64RegisterLdtr,
69     WHvX64RegisterTr,
70 
71     /* X64 Table registers */
72     WHvX64RegisterIdtr,
73     WHvX64RegisterGdtr,
74 
75     /* X64 Control Registers */
76     WHvX64RegisterCr0,
77     WHvX64RegisterCr2,
78     WHvX64RegisterCr3,
79     WHvX64RegisterCr4,
80     WHvX64RegisterCr8,
81 
82     /* X64 Debug Registers */
83     /*
84      * WHvX64RegisterDr0,
85      * WHvX64RegisterDr1,
86      * WHvX64RegisterDr2,
87      * WHvX64RegisterDr3,
88      * WHvX64RegisterDr6,
89      * WHvX64RegisterDr7,
90      */
91 
92     /* X64 Floating Point and Vector Registers */
93     WHvX64RegisterXmm0,
94     WHvX64RegisterXmm1,
95     WHvX64RegisterXmm2,
96     WHvX64RegisterXmm3,
97     WHvX64RegisterXmm4,
98     WHvX64RegisterXmm5,
99     WHvX64RegisterXmm6,
100     WHvX64RegisterXmm7,
101     WHvX64RegisterXmm8,
102     WHvX64RegisterXmm9,
103     WHvX64RegisterXmm10,
104     WHvX64RegisterXmm11,
105     WHvX64RegisterXmm12,
106     WHvX64RegisterXmm13,
107     WHvX64RegisterXmm14,
108     WHvX64RegisterXmm15,
109     WHvX64RegisterFpMmx0,
110     WHvX64RegisterFpMmx1,
111     WHvX64RegisterFpMmx2,
112     WHvX64RegisterFpMmx3,
113     WHvX64RegisterFpMmx4,
114     WHvX64RegisterFpMmx5,
115     WHvX64RegisterFpMmx6,
116     WHvX64RegisterFpMmx7,
117     WHvX64RegisterFpControlStatus,
118     WHvX64RegisterXmmControlStatus,
119 
120     /* X64 MSRs */
121     WHvX64RegisterEfer,
122 #ifdef TARGET_X86_64
123     WHvX64RegisterKernelGsBase,
124 #endif
125     WHvX64RegisterApicBase,
126     /* WHvX64RegisterPat, */
127     WHvX64RegisterSysenterCs,
128     WHvX64RegisterSysenterEip,
129     WHvX64RegisterSysenterEsp,
130     WHvX64RegisterStar,
131 #ifdef TARGET_X86_64
132     WHvX64RegisterLstar,
133     WHvX64RegisterCstar,
134     WHvX64RegisterSfmask,
135 #endif
136 
137     /* Interrupt / Event Registers */
138     /*
139      * WHvRegisterPendingInterruption,
140      * WHvRegisterInterruptState,
141      * WHvRegisterPendingEvent0,
142      * WHvRegisterPendingEvent1
143      * WHvX64RegisterDeliverabilityNotifications,
144      */
145 };
146 
147 struct whpx_register_set {
148     WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
149 };
150 
151 /*
152  * The current implementation of instruction stepping sets the TF flag
153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
155  *
156  * This approach has a few limitations:
157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158  *        along with the other flags, possibly restoring it later. It would
159  *        result in another INT1 when the flags are restored, triggering
160  *        a stop in gdb that could be cleared by doing another step.
161  *
162  *        Stepping over a POPF/LAHF instruction will let it overwrite the
163  *        TF flags, ending the stepping mode.
164  *
165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166  *        or anything that could result in a page fault) will save the flags
167  *        to the stack, clear the TF flag, and let the guest execute the
168  *        handler. Normally, the guest will restore the original flags,
169  *        that will continue single-stepping.
170  *
171  *     3. Debuggers running on the guest may wish to set TF to do instruction
172  *        stepping. INT1 events generated by it would be intercepted by us,
173  *        as long as the gdb is connected to QEMU.
174  *
175  * In practice this means that:
176  *     1. Stepping through flags-modifying instructions may cause gdb to
177  *        continue or stop in unexpected places. This will be fully recoverable
178  *        and will not crash the target.
179  *
180  *     2. Stepping over an instruction that triggers an exception will step
181  *        over the exception handler, not into it.
182  *
183  *     3. Debugging the guest via gdb, while running debugger on the guest
184  *        at the same time may lead to unexpected effects. Removing all
185  *        breakpoints set via QEMU will prevent any further interference
186  *        with the guest-level debuggers.
187  *
188  * The limitations can be addressed as shown below:
189  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
190  *        stepping through them. The exact semantics of the instructions is
191  *        defined in the "Combined Volume Set of Intel 64 and IA-32
192  *        Architectures Software Developer's Manuals", however it involves a
193  *        fair amount of corner cases due to compatibility with real mode,
194  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
195  *
196  *     2. We could step into the guest's exception handlers using the following
197  *        sequence:
198  *          a. Temporarily enable catching of all exception types via
199  *             whpx_set_exception_exit_bitmap().
200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
201  *             the original handler.
202  *          c. Patch the original handler, injecting an INT3 at the beginning.
203  *          d. Update the exception exit bitmap to only catch the
204  *             WHvX64ExceptionTypeBreakpointTrap exception.
205  *          e. Let the affected CPU run in the exclusive mode.
206  *          f. Restore the original handler and the exception exit bitmap.
207  *        Note that handling all corner cases related to IDT/GDT is harder
208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
209  *        rough idea.
210  *
211  *     3. In order to properly support guest-level debugging in parallel with
212  *        the QEMU-level debugging, we would need to be able to pass some INT1
213  *        events to the guest. This could be done via the following methods:
214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215  *             it seems to only work for interrupts and not software
216  *             exceptions.
217  *          b. Locating and patching the original handler by parsing IDT/GDT.
218  *             This involves relatively complex logic outlined in the previous
219  *             paragraph.
220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
221  *             RFLAGS, and pushing the old values to stack). This is even more
222  *             complicated than the previous option, since it involves checking
223  *             CPL, gate attributes, and doing various adjustments depending
224  *             on the current CPU mode, whether the CPL is changing, etc.
225  */
226 typedef enum WhpxStepMode {
227     WHPX_STEP_NONE = 0,
228     /* Halt other VCPUs */
229     WHPX_STEP_EXCLUSIVE,
230 } WhpxStepMode;
231 
232 struct AccelCPUState {
233     WHV_EMULATOR_HANDLE emulator;
234     bool window_registered;
235     bool interruptable;
236     bool ready_for_pic_interrupt;
237     uint64_t tpr;
238     uint64_t apic_base;
239     bool interruption_pending;
240 
241     /* Must be the last field as it may have a tail */
242     WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
243 };
244 
245 static bool whpx_allowed;
246 static bool whp_dispatch_initialized;
247 static HMODULE hWinHvPlatform, hWinHvEmulation;
248 static uint32_t max_vcpu_index;
249 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;
250 
251 struct whpx_state whpx_global;
252 struct WHPDispatch whp_dispatch;
253 
254 static bool whpx_has_xsave(void)
255 {
256     return whpx_xsave_cap.XsaveSupport;
257 }
258 
259 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
260                                              int r86)
261 {
262     WHV_X64_SEGMENT_REGISTER hs;
263     unsigned flags = qs->flags;
264 
265     hs.Base = qs->base;
266     hs.Limit = qs->limit;
267     hs.Selector = qs->selector;
268 
269     if (v86) {
270         hs.Attributes = 0;
271         hs.SegmentType = 3;
272         hs.Present = 1;
273         hs.DescriptorPrivilegeLevel = 3;
274         hs.NonSystemSegment = 1;
275 
276     } else {
277         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
278 
279         if (r86) {
280             /* hs.Base &= 0xfffff; */
281         }
282     }
283 
284     return hs;
285 }
286 
287 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
288 {
289     SegmentCache qs;
290 
291     qs.base = hs->Base;
292     qs.limit = hs->Limit;
293     qs.selector = hs->Selector;
294 
295     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
296 
297     return qs;
298 }
299 
300 /* X64 Extended Control Registers */
301 static void whpx_set_xcrs(CPUState *cpu)
302 {
303     CPUX86State *env = cpu_env(cpu);
304     HRESULT hr;
305     struct whpx_state *whpx = &whpx_global;
306     WHV_REGISTER_VALUE xcr0;
307     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
308 
309     if (!whpx_has_xsave()) {
310         return;
311     }
312 
313     /* Only xcr0 is supported by the hypervisor currently */
314     xcr0.Reg64 = env->xcr0;
315     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
316         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
317     if (FAILED(hr)) {
318         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
319     }
320 }
321 
322 static int whpx_set_tsc(CPUState *cpu)
323 {
324     CPUX86State *env = cpu_env(cpu);
325     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
326     WHV_REGISTER_VALUE tsc_val;
327     HRESULT hr;
328     struct whpx_state *whpx = &whpx_global;
329 
330     /*
331      * Suspend the partition prior to setting the TSC to reduce the variance
332      * in TSC across vCPUs. When the first vCPU runs post suspend, the
333      * partition is automatically resumed.
334      */
335     if (whp_dispatch.WHvSuspendPartitionTime) {
336 
337         /*
338          * Unable to suspend partition while setting TSC is not a fatal
339          * error. It just increases the likelihood of TSC variance between
340          * vCPUs and some guest OS are able to handle that just fine.
341          */
342         hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
343         if (FAILED(hr)) {
344             warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
345         }
346     }
347 
348     tsc_val.Reg64 = env->tsc;
349     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
350         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
351     if (FAILED(hr)) {
352         error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
353         return -1;
354     }
355 
356     return 0;
357 }
358 
359 /*
360  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
361  * however, they use a slightly different encoding. Specifically:
362  *
363  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
364  *
365  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
366  * and IA-32 Architectures Software Developer's Manual.
367  *
368  * The functions below translate the value of CR8 to TPR and vice versa.
369  */
370 
371 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
372 {
373     return tpr >> 4;
374 }
375 
376 static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
377 {
378     return cr8 << 4;
379 }
380 
381 static void whpx_set_registers(CPUState *cpu, int level)
382 {
383     struct whpx_state *whpx = &whpx_global;
384     AccelCPUState *vcpu = cpu->accel;
385     X86CPU *x86_cpu = X86_CPU(cpu);
386     CPUX86State *env = &x86_cpu->env;
387     struct whpx_register_set vcxt;
388     HRESULT hr;
389     int idx;
390     int idx_next;
391     int i;
392     int v86, r86;
393 
394     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
395 
396     /*
397      * Following MSRs have side effects on the guest or are too heavy for
398      * runtime. Limit them to full state update.
399      */
400     if (level >= WHPX_SET_RESET_STATE) {
401         whpx_set_tsc(cpu);
402     }
403 
404     memset(&vcxt, 0, sizeof(struct whpx_register_set));
405 
406     v86 = (env->eflags & VM_MASK);
407     r86 = !(env->cr[0] & CR0_PE_MASK);
408 
409     vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
410     vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);
411 
412     idx = 0;
413 
414     /* Indexes for first 16 registers match between HV and QEMU definitions */
415     idx_next = 16;
416     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
417         vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
418     }
419     idx = idx_next;
420 
421     /* Same goes for RIP and RFLAGS */
422     assert(whpx_register_names[idx] == WHvX64RegisterRip);
423     vcxt.values[idx++].Reg64 = env->eip;
424 
425     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
426     vcxt.values[idx++].Reg64 = env->eflags;
427 
428     /* Translate 6+4 segment registers. HV and QEMU order matches  */
429     assert(idx == WHvX64RegisterEs);
430     for (i = 0; i < 6; i += 1, idx += 1) {
431         vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
432     }
433 
434     assert(idx == WHvX64RegisterLdtr);
435     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
436 
437     assert(idx == WHvX64RegisterTr);
438     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
439 
440     assert(idx == WHvX64RegisterIdtr);
441     vcxt.values[idx].Table.Base = env->idt.base;
442     vcxt.values[idx].Table.Limit = env->idt.limit;
443     idx += 1;
444 
445     assert(idx == WHvX64RegisterGdtr);
446     vcxt.values[idx].Table.Base = env->gdt.base;
447     vcxt.values[idx].Table.Limit = env->gdt.limit;
448     idx += 1;
449 
450     /* CR0, 2, 3, 4, 8 */
451     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
452     vcxt.values[idx++].Reg64 = env->cr[0];
453     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
454     vcxt.values[idx++].Reg64 = env->cr[2];
455     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
456     vcxt.values[idx++].Reg64 = env->cr[3];
457     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
458     vcxt.values[idx++].Reg64 = env->cr[4];
459     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
460     vcxt.values[idx++].Reg64 = vcpu->tpr;
461 
462     /* 8 Debug Registers - Skipped */
463 
464     /*
465      * Extended control registers needs to be handled separately depending
466      * on whether xsave is supported/enabled or not.
467      */
468     whpx_set_xcrs(cpu);
469 
470     /* 16 XMM registers */
471     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
472     idx_next = idx + 16;
473     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
474         vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
475         vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
476     }
477     idx = idx_next;
478 
479     /* 8 FP registers */
480     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
481     for (i = 0; i < 8; i += 1, idx += 1) {
482         vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
483         /* vcxt.values[idx].Fp.AsUINT128.High64 =
484                env->fpregs[i].mmx.MMX_Q(1);
485         */
486     }
487 
488     /* FP control status register */
489     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
490     vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
491     vcxt.values[idx].FpControlStatus.FpStatus =
492         (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
493     vcxt.values[idx].FpControlStatus.FpTag = 0;
494     for (i = 0; i < 8; ++i) {
495         vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
496     }
497     vcxt.values[idx].FpControlStatus.Reserved = 0;
498     vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
499     vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
500     idx += 1;
501 
502     /* XMM control status register */
503     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
504     vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
505     vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
506     vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
507     idx += 1;
508 
509     /* MSRs */
510     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
511     vcxt.values[idx++].Reg64 = env->efer;
512 #ifdef TARGET_X86_64
513     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
514     vcxt.values[idx++].Reg64 = env->kernelgsbase;
515 #endif
516 
517     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
518     vcxt.values[idx++].Reg64 = vcpu->apic_base;
519 
520     /* WHvX64RegisterPat - Skipped */
521 
522     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
523     vcxt.values[idx++].Reg64 = env->sysenter_cs;
524     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
525     vcxt.values[idx++].Reg64 = env->sysenter_eip;
526     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
527     vcxt.values[idx++].Reg64 = env->sysenter_esp;
528     assert(whpx_register_names[idx] == WHvX64RegisterStar);
529     vcxt.values[idx++].Reg64 = env->star;
530 #ifdef TARGET_X86_64
531     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
532     vcxt.values[idx++].Reg64 = env->lstar;
533     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
534     vcxt.values[idx++].Reg64 = env->cstar;
535     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
536     vcxt.values[idx++].Reg64 = env->fmask;
537 #endif
538 
539     /* Interrupt / Event Registers - Skipped */
540 
541     assert(idx == RTL_NUMBER_OF(whpx_register_names));
542 
543     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
544         whpx->partition, cpu->cpu_index,
545         whpx_register_names,
546         RTL_NUMBER_OF(whpx_register_names),
547         &vcxt.values[0]);
548 
549     if (FAILED(hr)) {
550         error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
551                      hr);
552     }
553 
554     return;
555 }
556 
557 static int whpx_get_tsc(CPUState *cpu)
558 {
559     CPUX86State *env = cpu_env(cpu);
560     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
561     WHV_REGISTER_VALUE tsc_val;
562     HRESULT hr;
563     struct whpx_state *whpx = &whpx_global;
564 
565     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
566         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
567     if (FAILED(hr)) {
568         error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
569         return -1;
570     }
571 
572     env->tsc = tsc_val.Reg64;
573     return 0;
574 }
575 
576 /* X64 Extended Control Registers */
577 static void whpx_get_xcrs(CPUState *cpu)
578 {
579     CPUX86State *env = cpu_env(cpu);
580     HRESULT hr;
581     struct whpx_state *whpx = &whpx_global;
582     WHV_REGISTER_VALUE xcr0;
583     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
584 
585     if (!whpx_has_xsave()) {
586         return;
587     }
588 
589     /* Only xcr0 is supported by the hypervisor currently */
590     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
591         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
592     if (FAILED(hr)) {
593         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
594         return;
595     }
596 
597     env->xcr0 = xcr0.Reg64;
598 }
599 
600 static void whpx_get_registers(CPUState *cpu)
601 {
602     struct whpx_state *whpx = &whpx_global;
603     AccelCPUState *vcpu = cpu->accel;
604     X86CPU *x86_cpu = X86_CPU(cpu);
605     CPUX86State *env = &x86_cpu->env;
606     struct whpx_register_set vcxt;
607     uint64_t tpr, apic_base;
608     HRESULT hr;
609     int idx;
610     int idx_next;
611     int i;
612 
613     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
614 
615     if (!env->tsc_valid) {
616         whpx_get_tsc(cpu);
617         env->tsc_valid = !runstate_is_running();
618     }
619 
620     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
621         whpx->partition, cpu->cpu_index,
622         whpx_register_names,
623         RTL_NUMBER_OF(whpx_register_names),
624         &vcxt.values[0]);
625     if (FAILED(hr)) {
626         error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
627                      hr);
628     }
629 
630     if (whpx_apic_in_platform()) {
631         /*
632          * Fetch the TPR value from the emulated APIC. It may get overwritten
633          * below with the value from CR8 returned by
634          * WHvGetVirtualProcessorRegisters().
635          */
636         whpx_apic_get(x86_cpu->apic_state);
637         vcpu->tpr = whpx_apic_tpr_to_cr8(
638             cpu_get_apic_tpr(x86_cpu->apic_state));
639     }
640 
641     idx = 0;
642 
643     /* Indexes for first 16 registers match between HV and QEMU definitions */
644     idx_next = 16;
645     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
646         env->regs[idx] = vcxt.values[idx].Reg64;
647     }
648     idx = idx_next;
649 
650     /* Same goes for RIP and RFLAGS */
651     assert(whpx_register_names[idx] == WHvX64RegisterRip);
652     env->eip = vcxt.values[idx++].Reg64;
653     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
654     env->eflags = vcxt.values[idx++].Reg64;
655 
656     /* Translate 6+4 segment registers. HV and QEMU order matches  */
657     assert(idx == WHvX64RegisterEs);
658     for (i = 0; i < 6; i += 1, idx += 1) {
659         env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
660     }
661 
662     assert(idx == WHvX64RegisterLdtr);
663     env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
664     assert(idx == WHvX64RegisterTr);
665     env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
666     assert(idx == WHvX64RegisterIdtr);
667     env->idt.base = vcxt.values[idx].Table.Base;
668     env->idt.limit = vcxt.values[idx].Table.Limit;
669     idx += 1;
670     assert(idx == WHvX64RegisterGdtr);
671     env->gdt.base = vcxt.values[idx].Table.Base;
672     env->gdt.limit = vcxt.values[idx].Table.Limit;
673     idx += 1;
674 
675     /* CR0, 2, 3, 4, 8 */
676     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
677     env->cr[0] = vcxt.values[idx++].Reg64;
678     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
679     env->cr[2] = vcxt.values[idx++].Reg64;
680     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
681     env->cr[3] = vcxt.values[idx++].Reg64;
682     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
683     env->cr[4] = vcxt.values[idx++].Reg64;
684     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
685     tpr = vcxt.values[idx++].Reg64;
686     if (tpr != vcpu->tpr) {
687         vcpu->tpr = tpr;
688         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
689     }
690 
691     /* 8 Debug Registers - Skipped */
692 
693     /*
694      * Extended control registers needs to be handled separately depending
695      * on whether xsave is supported/enabled or not.
696      */
697     whpx_get_xcrs(cpu);
698 
699     /* 16 XMM registers */
700     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
701     idx_next = idx + 16;
702     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
703         env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
704         env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
705     }
706     idx = idx_next;
707 
708     /* 8 FP registers */
709     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
710     for (i = 0; i < 8; i += 1, idx += 1) {
711         env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
712         /* env->fpregs[i].mmx.MMX_Q(1) =
713                vcxt.values[idx].Fp.AsUINT128.High64;
714         */
715     }
716 
717     /* FP control status register */
718     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
719     env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
720     env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
721     env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
722     for (i = 0; i < 8; ++i) {
723         env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
724     }
725     env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
726     env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
727     idx += 1;
728 
729     /* XMM control status register */
730     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
731     env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
732     idx += 1;
733 
734     /* MSRs */
735     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
736     env->efer = vcxt.values[idx++].Reg64;
737 #ifdef TARGET_X86_64
738     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
739     env->kernelgsbase = vcxt.values[idx++].Reg64;
740 #endif
741 
742     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
743     apic_base = vcxt.values[idx++].Reg64;
744     if (apic_base != vcpu->apic_base) {
745         vcpu->apic_base = apic_base;
746         cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
747     }
748 
749     /* WHvX64RegisterPat - Skipped */
750 
751     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
752     env->sysenter_cs = vcxt.values[idx++].Reg64;
753     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
754     env->sysenter_eip = vcxt.values[idx++].Reg64;
755     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
756     env->sysenter_esp = vcxt.values[idx++].Reg64;
757     assert(whpx_register_names[idx] == WHvX64RegisterStar);
758     env->star = vcxt.values[idx++].Reg64;
759 #ifdef TARGET_X86_64
760     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
761     env->lstar = vcxt.values[idx++].Reg64;
762     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
763     env->cstar = vcxt.values[idx++].Reg64;
764     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
765     env->fmask = vcxt.values[idx++].Reg64;
766 #endif
767 
768     /* Interrupt / Event Registers - Skipped */
769 
770     assert(idx == RTL_NUMBER_OF(whpx_register_names));
771 
772     if (whpx_apic_in_platform()) {
773         whpx_apic_get(x86_cpu->apic_state);
774     }
775 
776     x86_update_hflags(env);
777 
778     return;
779 }
780 
781 static HRESULT CALLBACK whpx_emu_ioport_callback(
782     void *ctx,
783     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
784 {
785     MemTxAttrs attrs = { 0 };
786     address_space_rw(&address_space_io, IoAccess->Port, attrs,
787                      &IoAccess->Data, IoAccess->AccessSize,
788                      IoAccess->Direction);
789     return S_OK;
790 }
791 
792 static HRESULT CALLBACK whpx_emu_mmio_callback(
793     void *ctx,
794     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
795 {
796     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
797                            ma->Direction);
798     return S_OK;
799 }
800 
801 static HRESULT CALLBACK whpx_emu_getreg_callback(
802     void *ctx,
803     const WHV_REGISTER_NAME *RegisterNames,
804     UINT32 RegisterCount,
805     WHV_REGISTER_VALUE *RegisterValues)
806 {
807     HRESULT hr;
808     struct whpx_state *whpx = &whpx_global;
809     CPUState *cpu = (CPUState *)ctx;
810 
811     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
812         whpx->partition, cpu->cpu_index,
813         RegisterNames, RegisterCount,
814         RegisterValues);
815     if (FAILED(hr)) {
816         error_report("WHPX: Failed to get virtual processor registers,"
817                      " hr=%08lx", hr);
818     }
819 
820     return hr;
821 }
822 
823 static HRESULT CALLBACK whpx_emu_setreg_callback(
824     void *ctx,
825     const WHV_REGISTER_NAME *RegisterNames,
826     UINT32 RegisterCount,
827     const WHV_REGISTER_VALUE *RegisterValues)
828 {
829     HRESULT hr;
830     struct whpx_state *whpx = &whpx_global;
831     CPUState *cpu = (CPUState *)ctx;
832 
833     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
834         whpx->partition, cpu->cpu_index,
835         RegisterNames, RegisterCount,
836         RegisterValues);
837     if (FAILED(hr)) {
838         error_report("WHPX: Failed to set virtual processor registers,"
839                      " hr=%08lx", hr);
840     }
841 
842     /*
843      * The emulator just successfully wrote the register state. We clear the
844      * dirty state so we avoid the double write on resume of the VP.
845      */
846     cpu->vcpu_dirty = false;
847 
848     return hr;
849 }
850 
851 static HRESULT CALLBACK whpx_emu_translate_callback(
852     void *ctx,
853     WHV_GUEST_VIRTUAL_ADDRESS Gva,
854     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
855     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
856     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
857 {
858     HRESULT hr;
859     struct whpx_state *whpx = &whpx_global;
860     CPUState *cpu = (CPUState *)ctx;
861     WHV_TRANSLATE_GVA_RESULT res;
862 
863     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
864                                       Gva, TranslateFlags, &res, Gpa);
865     if (FAILED(hr)) {
866         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
867     } else {
868         *TranslationResult = res.ResultCode;
869     }
870 
871     return hr;
872 }
873 
874 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
875     .Size = sizeof(WHV_EMULATOR_CALLBACKS),
876     .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
877     .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
878     .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
879     .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
880     .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
881 };
882 
883 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
884 {
885     HRESULT hr;
886     AccelCPUState *vcpu = cpu->accel;
887     WHV_EMULATOR_STATUS emu_status;
888 
889     hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
890         vcpu->emulator, cpu,
891         &vcpu->exit_ctx.VpContext, ctx,
892         &emu_status);
893     if (FAILED(hr)) {
894         error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
895         return -1;
896     }
897 
898     if (!emu_status.EmulationSuccessful) {
899         error_report("WHPX: Failed to emulate MMIO access with"
900                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
901         return -1;
902     }
903 
904     return 0;
905 }
906 
907 static int whpx_handle_portio(CPUState *cpu,
908                               WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
909 {
910     HRESULT hr;
911     AccelCPUState *vcpu = cpu->accel;
912     WHV_EMULATOR_STATUS emu_status;
913 
914     hr = whp_dispatch.WHvEmulatorTryIoEmulation(
915         vcpu->emulator, cpu,
916         &vcpu->exit_ctx.VpContext, ctx,
917         &emu_status);
918     if (FAILED(hr)) {
919         error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
920         return -1;
921     }
922 
923     if (!emu_status.EmulationSuccessful) {
924         error_report("WHPX: Failed to emulate PortIO access with"
925                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
926         return -1;
927     }
928 
929     return 0;
930 }
931 
932 /*
933  * Controls whether we should intercept various exceptions on the guest,
934  * namely breakpoint/single-step events.
935  *
936  * The 'exceptions' argument accepts a bitmask, e.g:
937  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
938  */
939 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
940 {
941     struct whpx_state *whpx = &whpx_global;
942     WHV_PARTITION_PROPERTY prop = { 0, };
943     HRESULT hr;
944 
945     if (exceptions == whpx->exception_exit_bitmap) {
946         return S_OK;
947     }
948 
949     prop.ExceptionExitBitmap = exceptions;
950 
951     hr = whp_dispatch.WHvSetPartitionProperty(
952         whpx->partition,
953         WHvPartitionPropertyCodeExceptionExitBitmap,
954         &prop,
955         sizeof(WHV_PARTITION_PROPERTY));
956 
957     if (SUCCEEDED(hr)) {
958         whpx->exception_exit_bitmap = exceptions;
959     }
960 
961     return hr;
962 }
963 
964 
965 /*
966  * This function is called before/after stepping over a single instruction.
967  * It will update the CPU registers to arm/disarm the instruction stepping
968  * accordingly.
969  */
970 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
971     bool set,
972     uint64_t *exit_context_rflags)
973 {
974     WHV_REGISTER_NAME reg_name;
975     WHV_REGISTER_VALUE reg_value;
976     HRESULT hr;
977     struct whpx_state *whpx = &whpx_global;
978 
979     /*
980      * If we are trying to step over a single instruction, we need to set the
981      * TF bit in rflags. Otherwise, clear it.
982      */
983     reg_name = WHvX64RegisterRflags;
984     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
985         whpx->partition,
986         cpu->cpu_index,
987         &reg_name,
988         1,
989         &reg_value);
990 
991     if (FAILED(hr)) {
992         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
993         return hr;
994     }
995 
996     if (exit_context_rflags) {
997         assert(*exit_context_rflags == reg_value.Reg64);
998     }
999 
1000     if (set) {
1001         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
1002         reg_value.Reg64 |= TF_MASK;
1003     } else {
1004         reg_value.Reg64 &= ~TF_MASK;
1005     }
1006 
1007     if (exit_context_rflags) {
1008         *exit_context_rflags = reg_value.Reg64;
1009     }
1010 
1011     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1012         whpx->partition,
1013         cpu->cpu_index,
1014         &reg_name,
1015         1,
1016         &reg_value);
1017 
1018     if (FAILED(hr)) {
1019         error_report("WHPX: Failed to set rflags,"
1020             " hr=%08lx",
1021             hr);
1022         return hr;
1023     }
1024 
1025     reg_name = WHvRegisterInterruptState;
1026     reg_value.Reg64 = 0;
1027 
1028     /* Suspend delivery of hardware interrupts during single-stepping. */
1029     reg_value.InterruptState.InterruptShadow = set != 0;
1030 
1031     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1032     whpx->partition,
1033         cpu->cpu_index,
1034         &reg_name,
1035         1,
1036         &reg_value);
1037 
1038     if (FAILED(hr)) {
1039         error_report("WHPX: Failed to set InterruptState,"
1040             " hr=%08lx",
1041             hr);
1042         return hr;
1043     }
1044 
1045     if (!set) {
1046         /*
1047          * We have just finished stepping over a single instruction,
1048          * and intercepted the INT1 generated by it.
1049          * We need to now hide the INT1 from the guest,
1050          * as it would not be expecting it.
1051          */
1052 
1053         reg_name = WHvX64RegisterPendingDebugException;
1054         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1055         whpx->partition,
1056             cpu->cpu_index,
1057             &reg_name,
1058             1,
1059             &reg_value);
1060 
1061         if (FAILED(hr)) {
1062             error_report("WHPX: Failed to get pending debug exceptions,"
1063                          "hr=%08lx", hr);
1064             return hr;
1065         }
1066 
1067         if (reg_value.PendingDebugException.SingleStep) {
1068             reg_value.PendingDebugException.SingleStep = 0;
1069 
1070             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1071                 whpx->partition,
1072                 cpu->cpu_index,
1073                 &reg_name,
1074                 1,
1075                 &reg_value);
1076 
1077             if (FAILED(hr)) {
1078                 error_report("WHPX: Failed to clear pending debug exceptions,"
1079                              "hr=%08lx", hr);
1080              return hr;
1081             }
1082         }
1083 
1084     }
1085 
1086     return S_OK;
1087 }
1088 
1089 /* Tries to find a breakpoint at the specified address. */
1090 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1091 {
1092     struct whpx_state *whpx = &whpx_global;
1093     int i;
1094 
1095     if (whpx->breakpoints.breakpoints) {
1096         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1097             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1098                 return &whpx->breakpoints.breakpoints->data[i];
1099             }
1100         }
1101     }
1102 
1103     return NULL;
1104 }
1105 
1106 /*
1107  * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
1108  * debugging user-mode applications. Since the WHPX API does not offer
1109  * an easy way to pass the intercepted exception back to the guest, we
1110  * resort to using INT1 instead, and let the guest always handle INT3.
1111  */
1112 static const uint8_t whpx_breakpoint_instruction = 0xF1;
1113 
1114 /*
1115  * The WHPX QEMU backend implements breakpoints by writing the INT1
1116  * instruction into memory (ignoring the DRx registers). This raises a few
1117  * issues that need to be carefully handled:
1118  *
1119  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1120  *    at the same location, and later remove them in arbitrary order.
1121  *    This should not cause memory corruption, and should only remove the
1122  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1123  *
1124  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1125  *    physical location. Hence, physically adding/removing a breakpoint can
1126  *    theoretically fail at any time. We need to keep track of it.
1127  *
1128  * The function below rebuilds a list of low-level breakpoints (one per
1129  * address, tracking the original instruction and any errors) from the list of
1130  * high-level breakpoints (set via cpu_breakpoint_insert()).
1131  *
1132  * In order to optimize performance, this function stores the list of
1133  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1134  * low-level ones, so that it won't be re-invoked until these breakpoints
1135  * change.
1136  *
1137  * Note that this function decides which breakpoints should be inserted into,
1138  * memory, but doesn't actually do it. The memory accessing is done in
1139  * whpx_apply_breakpoints().
1140  */
1141 static void whpx_translate_cpu_breakpoints(
1142     struct whpx_breakpoints *breakpoints,
1143     CPUState *cpu,
1144     int cpu_breakpoint_count)
1145 {
1146     CPUBreakpoint *bp;
1147     int cpu_bp_index = 0;
1148 
1149     breakpoints->original_addresses =
1150         g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
1151 
1152     breakpoints->original_address_count = cpu_breakpoint_count;
1153 
1154     int max_breakpoints = cpu_breakpoint_count +
1155         (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
1156 
1157     struct whpx_breakpoint_collection *new_breakpoints =
1158         g_malloc0(sizeof(struct whpx_breakpoint_collection)
1159                   + max_breakpoints * sizeof(struct whpx_breakpoint));
1160 
1161     new_breakpoints->allocated = max_breakpoints;
1162     new_breakpoints->used = 0;
1163 
1164     /*
1165      * 1. Preserve all old breakpoints that could not be automatically
1166      * cleared when the CPU got stopped.
1167      */
1168     if (breakpoints->breakpoints) {
1169         int i;
1170         for (i = 0; i < breakpoints->breakpoints->used; i++) {
1171             if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
1172                 new_breakpoints->data[new_breakpoints->used++] =
1173                     breakpoints->breakpoints->data[i];
1174             }
1175         }
1176     }
1177 
1178     /* 2. Map all CPU breakpoints to WHPX breakpoints */
1179     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1180         int i;
1181         bool found = false;
1182 
1183         /* This will be used to detect changed CPU breakpoints later. */
1184         breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
1185 
1186         for (i = 0; i < new_breakpoints->used; i++) {
1187             /*
1188              * WARNING: This loop has O(N^2) complexity, where N is the
1189              * number of breakpoints. It should not be a bottleneck in
1190              * real-world scenarios, since it only needs to run once after
1191              * the breakpoints have been modified.
1192              * If this ever becomes a concern, it can be optimized by storing
1193              * high-level breakpoint objects in a tree or hash map.
1194              */
1195 
1196             if (new_breakpoints->data[i].address == bp->pc) {
1197                 /* There was already a breakpoint at this address. */
1198                 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
1199                     new_breakpoints->data[i].state = WHPX_BP_SET;
1200                 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
1201                     new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
1202                 }
1203 
1204                 found = true;
1205                 break;
1206             }
1207         }
1208 
1209         if (!found && new_breakpoints->used < new_breakpoints->allocated) {
1210             /* No WHPX breakpoint at this address. Create one. */
1211             new_breakpoints->data[new_breakpoints->used].address = bp->pc;
1212             new_breakpoints->data[new_breakpoints->used].state =
1213                 WHPX_BP_SET_PENDING;
1214             new_breakpoints->used++;
1215         }
1216     }
1217 
1218     /*
1219      * Free the previous breakpoint list. This can be optimized by keeping
1220      * it as shadow buffer for the next computation instead of freeing
1221      * it immediately.
1222      */
1223     g_free(breakpoints->breakpoints);
1224 
1225     breakpoints->breakpoints = new_breakpoints;
1226 }
1227 
1228 /*
1229  * Physically inserts/removes the breakpoints by reading and writing the
1230  * physical memory, keeping a track of the failed attempts.
1231  *
1232  * Passing resuming=true  will try to set all previously unset breakpoints.
1233  * Passing resuming=false will remove all inserted ones.
1234  */
1235 static void whpx_apply_breakpoints(
1236     struct whpx_breakpoint_collection *breakpoints,
1237     CPUState *cpu,
1238     bool resuming)
1239 {
1240     int i, rc;
1241     if (!breakpoints) {
1242         return;
1243     }
1244 
1245     for (i = 0; i < breakpoints->used; i++) {
1246         /* Decide what to do right now based on the last known state. */
1247         WhpxBreakpointState state = breakpoints->data[i].state;
1248         switch (state) {
1249         case WHPX_BP_CLEARED:
1250             if (resuming) {
1251                 state = WHPX_BP_SET_PENDING;
1252             }
1253             break;
1254         case WHPX_BP_SET_PENDING:
1255             if (!resuming) {
1256                 state = WHPX_BP_CLEARED;
1257             }
1258             break;
1259         case WHPX_BP_SET:
1260             if (!resuming) {
1261                 state = WHPX_BP_CLEAR_PENDING;
1262             }
1263             break;
1264         case WHPX_BP_CLEAR_PENDING:
1265             if (resuming) {
1266                 state = WHPX_BP_SET;
1267             }
1268             break;
1269         }
1270 
1271         if (state == WHPX_BP_SET_PENDING) {
1272             /* Remember the original instruction. */
1273             rc = cpu_memory_rw_debug(cpu,
1274                 breakpoints->data[i].address,
1275                 &breakpoints->data[i].original_instruction,
1276                 1,
1277                 false);
1278 
1279             if (!rc) {
1280                 /* Write the breakpoint instruction. */
1281                 rc = cpu_memory_rw_debug(cpu,
1282                     breakpoints->data[i].address,
1283                     (void *)&whpx_breakpoint_instruction,
1284                     1,
1285                     true);
1286             }
1287 
1288             if (!rc) {
1289                 state = WHPX_BP_SET;
1290             }
1291 
1292         }
1293 
1294         if (state == WHPX_BP_CLEAR_PENDING) {
1295             /* Restore the original instruction. */
1296             rc = cpu_memory_rw_debug(cpu,
1297                 breakpoints->data[i].address,
1298                 &breakpoints->data[i].original_instruction,
1299                 1,
1300                 true);
1301 
1302             if (!rc) {
1303                 state = WHPX_BP_CLEARED;
1304             }
1305         }
1306 
1307         breakpoints->data[i].state = state;
1308     }
1309 }
1310 
1311 /*
1312  * This function is called when the a VCPU is about to start and no other
1313  * VCPUs have been started so far. Since the VCPU start order could be
1314  * arbitrary, it doesn't have to be VCPU#0.
1315  *
1316  * It is used to commit the breakpoints into memory, and configure WHPX
1317  * to intercept debug exceptions.
1318  *
1319  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1320  * more VCPUs are already running, so this is the best place to do it.
1321  */
1322 static int whpx_first_vcpu_starting(CPUState *cpu)
1323 {
1324     struct whpx_state *whpx = &whpx_global;
1325     HRESULT hr;
1326 
1327     g_assert(bql_locked());
1328 
1329     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1330             (whpx->breakpoints.breakpoints &&
1331              whpx->breakpoints.breakpoints->used)) {
1332         CPUBreakpoint *bp;
1333         int i = 0;
1334         bool update_pending = false;
1335 
1336         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1337             if (i >= whpx->breakpoints.original_address_count ||
1338                 bp->pc != whpx->breakpoints.original_addresses[i]) {
1339                 update_pending = true;
1340             }
1341 
1342             i++;
1343         }
1344 
1345         if (i != whpx->breakpoints.original_address_count) {
1346             update_pending = true;
1347         }
1348 
1349         if (update_pending) {
1350             /*
1351              * The CPU breakpoints have changed since the last call to
1352              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1353              * now be recomputed.
1354              */
1355             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1356         }
1357 
1358         /* Actually insert the breakpoints into the memory. */
1359         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1360     }
1361 
1362     uint64_t exception_mask;
1363     if (whpx->step_pending ||
1364         (whpx->breakpoints.breakpoints &&
1365          whpx->breakpoints.breakpoints->used)) {
1366         /*
1367          * We are either attempting to single-step one or more CPUs, or
1368          * have one or more breakpoints enabled. Both require intercepting
1369          * the WHvX64ExceptionTypeBreakpointTrap exception.
1370          */
1371 
1372         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1373     } else {
1374         /* Let the guest handle all exceptions. */
1375         exception_mask = 0;
1376     }
1377 
1378     hr = whpx_set_exception_exit_bitmap(exception_mask);
1379     if (!SUCCEEDED(hr)) {
1380         error_report("WHPX: Failed to update exception exit mask,"
1381                      "hr=%08lx.", hr);
1382         return 1;
1383     }
1384 
1385     return 0;
1386 }
1387 
1388 /*
1389  * This function is called when the last VCPU has finished running.
1390  * It is used to remove any previously set breakpoints from memory.
1391  */
1392 static int whpx_last_vcpu_stopping(CPUState *cpu)
1393 {
1394     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1395     return 0;
1396 }
1397 
1398 /* Returns the address of the next instruction that is about to be executed. */
1399 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1400 {
1401     if (cpu->vcpu_dirty) {
1402         /* The CPU registers have been modified by other parts of QEMU. */
1403         CPUArchState *env = cpu_env(cpu);
1404         return env->eip;
1405     } else if (exit_context_valid) {
1406         /*
1407          * The CPU registers have not been modified by neither other parts
1408          * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1409          * This is the most common case.
1410          */
1411         AccelCPUState *vcpu = cpu->accel;
1412         return vcpu->exit_ctx.VpContext.Rip;
1413     } else {
1414         /*
1415          * The CPU registers have been modified by a call to
1416          * WHvSetVirtualProcessorRegisters() and must be re-queried from
1417          * the target.
1418          */
1419         WHV_REGISTER_VALUE reg_value;
1420         WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1421         HRESULT hr;
1422         struct whpx_state *whpx = &whpx_global;
1423 
1424         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1425             whpx->partition,
1426             cpu->cpu_index,
1427             &reg_name,
1428             1,
1429             &reg_value);
1430 
1431         if (FAILED(hr)) {
1432             error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1433             return 0;
1434         }
1435 
1436         return reg_value.Reg64;
1437     }
1438 }
1439 
1440 static int whpx_handle_halt(CPUState *cpu)
1441 {
1442     CPUX86State *env = cpu_env(cpu);
1443     int ret = 0;
1444 
1445     bql_lock();
1446     if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1447           (env->eflags & IF_MASK)) &&
1448         !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1449         cpu->exception_index = EXCP_HLT;
1450         cpu->halted = true;
1451         ret = 1;
1452     }
1453     bql_unlock();
1454 
1455     return ret;
1456 }
1457 
1458 static void whpx_vcpu_pre_run(CPUState *cpu)
1459 {
1460     HRESULT hr;
1461     struct whpx_state *whpx = &whpx_global;
1462     AccelCPUState *vcpu = cpu->accel;
1463     X86CPU *x86_cpu = X86_CPU(cpu);
1464     CPUX86State *env = &x86_cpu->env;
1465     int irq;
1466     uint8_t tpr;
1467     WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
1468     UINT32 reg_count = 0;
1469     WHV_REGISTER_VALUE reg_values[3];
1470     WHV_REGISTER_NAME reg_names[3];
1471 
1472     memset(&new_int, 0, sizeof(new_int));
1473     memset(reg_values, 0, sizeof(reg_values));
1474 
1475     bql_lock();
1476 
1477     /* Inject NMI */
1478     if (!vcpu->interruption_pending &&
1479         cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
1480         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
1481             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
1482             vcpu->interruptable = false;
1483             new_int.InterruptionType = WHvX64PendingNmi;
1484             new_int.InterruptionPending = 1;
1485             new_int.InterruptionVector = 2;
1486         }
1487         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
1488             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
1489         }
1490     }
1491 
1492     /*
1493      * Force the VCPU out of its inner loop to process any INIT requests or
1494      * commit pending TPR access.
1495      */
1496     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
1497         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1498             !(env->hflags & HF_SMM_MASK)) {
1499             cpu->exit_request = 1;
1500         }
1501         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1502             cpu->exit_request = 1;
1503         }
1504     }
1505 
1506     /* Get pending hard interruption or replay one that was overwritten */
1507     if (!whpx_apic_in_platform()) {
1508         if (!vcpu->interruption_pending &&
1509             vcpu->interruptable && (env->eflags & IF_MASK)) {
1510             assert(!new_int.InterruptionPending);
1511             if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1512                 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1513                 irq = cpu_get_pic_interrupt(env);
1514                 if (irq >= 0) {
1515                     new_int.InterruptionType = WHvX64PendingInterrupt;
1516                     new_int.InterruptionPending = 1;
1517                     new_int.InterruptionVector = irq;
1518                 }
1519             }
1520         }
1521 
1522         /* Setup interrupt state if new one was prepared */
1523         if (new_int.InterruptionPending) {
1524             reg_values[reg_count].PendingInterruption = new_int;
1525             reg_names[reg_count] = WHvRegisterPendingInterruption;
1526             reg_count += 1;
1527         }
1528     } else if (vcpu->ready_for_pic_interrupt &&
1529                (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
1530         cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1531         irq = cpu_get_pic_interrupt(env);
1532         if (irq >= 0) {
1533             reg_names[reg_count] = WHvRegisterPendingEvent;
1534             reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
1535             {
1536                 .EventPending = 1,
1537                 .EventType = WHvX64PendingEventExtInt,
1538                 .Vector = irq,
1539             };
1540             reg_count += 1;
1541         }
1542      }
1543 
1544     /* Sync the TPR to the CR8 if was modified during the intercept */
1545     tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
1546     if (tpr != vcpu->tpr) {
1547         vcpu->tpr = tpr;
1548         reg_values[reg_count].Reg64 = tpr;
1549         cpu->exit_request = 1;
1550         reg_names[reg_count] = WHvX64RegisterCr8;
1551         reg_count += 1;
1552     }
1553 
1554     /* Update the state of the interrupt delivery notification */
1555     if (!vcpu->window_registered &&
1556         cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1557         reg_values[reg_count].DeliverabilityNotifications =
1558             (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
1559                 .InterruptNotification = 1
1560             };
1561         vcpu->window_registered = 1;
1562         reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
1563         reg_count += 1;
1564     }
1565 
1566     bql_unlock();
1567     vcpu->ready_for_pic_interrupt = false;
1568 
1569     if (reg_count) {
1570         hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1571             whpx->partition, cpu->cpu_index,
1572             reg_names, reg_count, reg_values);
1573         if (FAILED(hr)) {
1574             error_report("WHPX: Failed to set interrupt state registers,"
1575                          " hr=%08lx", hr);
1576         }
1577     }
1578 
1579     return;
1580 }
1581 
1582 static void whpx_vcpu_post_run(CPUState *cpu)
1583 {
1584     AccelCPUState *vcpu = cpu->accel;
1585     X86CPU *x86_cpu = X86_CPU(cpu);
1586     CPUX86State *env = &x86_cpu->env;
1587 
1588     env->eflags = vcpu->exit_ctx.VpContext.Rflags;
1589 
1590     uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
1591     if (vcpu->tpr != tpr) {
1592         vcpu->tpr = tpr;
1593         bql_lock();
1594         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
1595         bql_unlock();
1596     }
1597 
1598     vcpu->interruption_pending =
1599         vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
1600 
1601     vcpu->interruptable =
1602         !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
1603 
1604     return;
1605 }
1606 
1607 static void whpx_vcpu_process_async_events(CPUState *cpu)
1608 {
1609     X86CPU *x86_cpu = X86_CPU(cpu);
1610     CPUX86State *env = &x86_cpu->env;
1611     AccelCPUState *vcpu = cpu->accel;
1612 
1613     if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1614         !(env->hflags & HF_SMM_MASK)) {
1615         whpx_cpu_synchronize_state(cpu);
1616         do_cpu_init(x86_cpu);
1617         vcpu->interruptable = true;
1618     }
1619 
1620     if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
1621         cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
1622         apic_poll_irq(x86_cpu->apic_state);
1623     }
1624 
1625     if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1626          (env->eflags & IF_MASK)) ||
1627         (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1628         cpu->halted = false;
1629     }
1630 
1631     if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
1632         whpx_cpu_synchronize_state(cpu);
1633         do_cpu_sipi(x86_cpu);
1634     }
1635 
1636     if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1637         cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
1638         whpx_cpu_synchronize_state(cpu);
1639         apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
1640                                       env->tpr_access_type);
1641     }
1642 
1643     return;
1644 }
1645 
1646 static int whpx_vcpu_run(CPUState *cpu)
1647 {
1648     HRESULT hr;
1649     struct whpx_state *whpx = &whpx_global;
1650     AccelCPUState *vcpu = cpu->accel;
1651     struct whpx_breakpoint *stepped_over_bp = NULL;
1652     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1653     int ret;
1654 
1655     g_assert(bql_locked());
1656 
1657     if (whpx->running_cpus++ == 0) {
1658         /* Insert breakpoints into memory, update exception exit bitmap. */
1659         ret = whpx_first_vcpu_starting(cpu);
1660         if (ret != 0) {
1661             return ret;
1662         }
1663     }
1664 
1665     if (whpx->breakpoints.breakpoints &&
1666         whpx->breakpoints.breakpoints->used > 0)
1667     {
1668         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1669         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1670         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1671             stepped_over_bp = NULL;
1672         }
1673 
1674         if (stepped_over_bp) {
1675             /*
1676              * We are trying to run the instruction overwritten by an active
1677              * breakpoint. We will temporarily disable the breakpoint, suspend
1678              * other CPUs, and step over the instruction.
1679              */
1680             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1681         }
1682     }
1683 
1684     if (exclusive_step_mode == WHPX_STEP_NONE) {
1685         whpx_vcpu_process_async_events(cpu);
1686         if (cpu->halted && !whpx_apic_in_platform()) {
1687             cpu->exception_index = EXCP_HLT;
1688             qatomic_set(&cpu->exit_request, false);
1689             return 0;
1690         }
1691     }
1692 
1693     bql_unlock();
1694 
1695     if (exclusive_step_mode != WHPX_STEP_NONE) {
1696         start_exclusive();
1697         g_assert(cpu == current_cpu);
1698         g_assert(!cpu->running);
1699         cpu->running = true;
1700 
1701         hr = whpx_set_exception_exit_bitmap(
1702             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1703         if (!SUCCEEDED(hr)) {
1704             error_report("WHPX: Failed to update exception exit mask, "
1705                          "hr=%08lx.", hr);
1706             return 1;
1707         }
1708 
1709         if (stepped_over_bp) {
1710             /* Temporarily disable the triggered breakpoint. */
1711             cpu_memory_rw_debug(cpu,
1712                 stepped_over_bp->address,
1713                 &stepped_over_bp->original_instruction,
1714                 1,
1715                 true);
1716         }
1717     } else {
1718         cpu_exec_start(cpu);
1719     }
1720 
1721     do {
1722         if (cpu->vcpu_dirty) {
1723             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1724             cpu->vcpu_dirty = false;
1725         }
1726 
1727         if (exclusive_step_mode == WHPX_STEP_NONE) {
1728             whpx_vcpu_pre_run(cpu);
1729 
1730             if (qatomic_read(&cpu->exit_request)) {
1731                 whpx_vcpu_kick(cpu);
1732             }
1733         }
1734 
1735         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1736             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1737         }
1738 
1739         hr = whp_dispatch.WHvRunVirtualProcessor(
1740             whpx->partition, cpu->cpu_index,
1741             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1742 
1743         if (FAILED(hr)) {
1744             error_report("WHPX: Failed to exec a virtual processor,"
1745                          " hr=%08lx", hr);
1746             ret = -1;
1747             break;
1748         }
1749 
1750         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1751             whpx_vcpu_configure_single_stepping(cpu,
1752                 false,
1753                 &vcpu->exit_ctx.VpContext.Rflags);
1754         }
1755 
1756         whpx_vcpu_post_run(cpu);
1757 
1758         switch (vcpu->exit_ctx.ExitReason) {
1759         case WHvRunVpExitReasonMemoryAccess:
1760             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1761             break;
1762 
1763         case WHvRunVpExitReasonX64IoPortAccess:
1764             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1765             break;
1766 
1767         case WHvRunVpExitReasonX64InterruptWindow:
1768             vcpu->ready_for_pic_interrupt = 1;
1769             vcpu->window_registered = 0;
1770             ret = 0;
1771             break;
1772 
1773         case WHvRunVpExitReasonX64ApicEoi:
1774             assert(whpx_apic_in_platform());
1775             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1776             break;
1777 
1778         case WHvRunVpExitReasonX64Halt:
1779             /*
1780              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1781              * longer used.
1782              */
1783             ret = whpx_handle_halt(cpu);
1784             break;
1785 
1786         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1787             WHV_INTERRUPT_CONTROL ipi = {0};
1788             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1789             uint32_t delivery_mode =
1790                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1791             int dest_shorthand =
1792                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1793             bool broadcast = false;
1794             bool include_self = false;
1795             uint32_t i;
1796 
1797             /* We only registered for INIT and SIPI exits. */
1798             if ((delivery_mode != APIC_DM_INIT) &&
1799                 (delivery_mode != APIC_DM_SIPI)) {
1800                 error_report(
1801                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1802                 break;
1803             }
1804 
1805             if (delivery_mode == APIC_DM_INIT) {
1806                 ipi.Type = WHvX64InterruptTypeInit;
1807             } else {
1808                 ipi.Type = WHvX64InterruptTypeSipi;
1809             }
1810 
1811             ipi.DestinationMode =
1812                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1813                     WHvX64InterruptDestinationModeLogical :
1814                     WHvX64InterruptDestinationModePhysical;
1815 
1816             ipi.TriggerMode =
1817                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1818                     WHvX64InterruptTriggerModeLevel :
1819                     WHvX64InterruptTriggerModeEdge;
1820 
1821             ipi.Vector = icr & APIC_VECTOR_MASK;
1822             switch (dest_shorthand) {
1823             /* no shorthand. Bits 56-63 contain the destination. */
1824             case 0:
1825                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1826                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1827                         &ipi, sizeof(ipi));
1828                 if (FAILED(hr)) {
1829                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1830                         hr);
1831                 }
1832 
1833                 break;
1834 
1835             /* self */
1836             case 1:
1837                 include_self = true;
1838                 break;
1839 
1840             /* broadcast, including self */
1841             case 2:
1842                 broadcast = true;
1843                 include_self = true;
1844                 break;
1845 
1846             /* broadcast, excluding self */
1847             case 3:
1848                 broadcast = true;
1849                 break;
1850             }
1851 
1852             if (!broadcast && !include_self) {
1853                 break;
1854             }
1855 
1856             for (i = 0; i <= max_vcpu_index; i++) {
1857                 if (i == cpu->cpu_index && !include_self) {
1858                     continue;
1859                 }
1860 
1861                 /*
1862                  * Assuming that APIC Ids are identity mapped since
1863                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1864                  * are not handled yet and the hypervisor doesn't allow the
1865                  * guest to modify the APIC ID.
1866                  */
1867                 ipi.Destination = i;
1868                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1869                         &ipi, sizeof(ipi));
1870                 if (FAILED(hr)) {
1871                     error_report(
1872                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1873                         i, hr);
1874                 }
1875             }
1876 
1877             break;
1878         }
1879 
1880         case WHvRunVpExitReasonCanceled:
1881             if (exclusive_step_mode != WHPX_STEP_NONE) {
1882                 /*
1883                  * We are trying to step over a single instruction, and
1884                  * likely got a request to stop from another thread.
1885                  * Delay it until we are done stepping
1886                  * over.
1887                  */
1888                 ret = 0;
1889             } else {
1890                 cpu->exception_index = EXCP_INTERRUPT;
1891                 ret = 1;
1892             }
1893             break;
1894         case WHvRunVpExitReasonX64MsrAccess: {
1895             WHV_REGISTER_VALUE reg_values[3] = {0};
1896             WHV_REGISTER_NAME reg_names[3];
1897             UINT32 reg_count;
1898 
1899             reg_names[0] = WHvX64RegisterRip;
1900             reg_names[1] = WHvX64RegisterRax;
1901             reg_names[2] = WHvX64RegisterRdx;
1902 
1903             reg_values[0].Reg64 =
1904                 vcpu->exit_ctx.VpContext.Rip +
1905                 vcpu->exit_ctx.VpContext.InstructionLength;
1906 
1907             /*
1908              * For all unsupported MSR access we:
1909              *     ignore writes
1910              *     return 0 on read.
1911              */
1912             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1913                         1 : 3;
1914 
1915             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1916                 whpx->partition,
1917                 cpu->cpu_index,
1918                 reg_names, reg_count,
1919                 reg_values);
1920 
1921             if (FAILED(hr)) {
1922                 error_report("WHPX: Failed to set MsrAccess state "
1923                              " registers, hr=%08lx", hr);
1924             }
1925             ret = 0;
1926             break;
1927         }
1928         case WHvRunVpExitReasonX64Cpuid: {
1929             WHV_REGISTER_VALUE reg_values[5];
1930             WHV_REGISTER_NAME reg_names[5];
1931             UINT32 reg_count = 5;
1932             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1933             X86CPU *x86_cpu = X86_CPU(cpu);
1934             CPUX86State *env = &x86_cpu->env;
1935 
1936             memset(reg_values, 0, sizeof(reg_values));
1937 
1938             rip = vcpu->exit_ctx.VpContext.Rip +
1939                   vcpu->exit_ctx.VpContext.InstructionLength;
1940             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1941 
1942             /*
1943              * Ideally, these should be supplied to the hypervisor during VCPU
1944              * initialization and it should be able to satisfy this request.
1945              * But, currently, WHPX doesn't support setting CPUID values in the
1946              * hypervisor once the partition has been setup, which is too late
1947              * since VCPUs are realized later. For now, use the values from
1948              * QEMU to satisfy these requests, until WHPX adds support for
1949              * being able to set these values in the hypervisor at runtime.
1950              */
1951             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1952                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1953             switch (cpuid_fn) {
1954             case 0x40000000:
1955                 /* Expose the vmware cpu frequency cpuid leaf */
1956                 rax = 0x40000010;
1957                 rbx = rcx = rdx = 0;
1958                 break;
1959 
1960             case 0x40000010:
1961                 rax = env->tsc_khz;
1962                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1963                 rcx = rdx = 0;
1964                 break;
1965 
1966             case 0x80000001:
1967                 /* Remove any support of OSVW */
1968                 rcx &= ~CPUID_EXT3_OSVW;
1969                 break;
1970             }
1971 
1972             reg_names[0] = WHvX64RegisterRip;
1973             reg_names[1] = WHvX64RegisterRax;
1974             reg_names[2] = WHvX64RegisterRcx;
1975             reg_names[3] = WHvX64RegisterRdx;
1976             reg_names[4] = WHvX64RegisterRbx;
1977 
1978             reg_values[0].Reg64 = rip;
1979             reg_values[1].Reg64 = rax;
1980             reg_values[2].Reg64 = rcx;
1981             reg_values[3].Reg64 = rdx;
1982             reg_values[4].Reg64 = rbx;
1983 
1984             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1985                 whpx->partition, cpu->cpu_index,
1986                 reg_names,
1987                 reg_count,
1988                 reg_values);
1989 
1990             if (FAILED(hr)) {
1991                 error_report("WHPX: Failed to set CpuidAccess state registers,"
1992                              " hr=%08lx", hr);
1993             }
1994             ret = 0;
1995             break;
1996         }
1997         case WHvRunVpExitReasonException:
1998             whpx_get_registers(cpu);
1999 
2000             if ((vcpu->exit_ctx.VpException.ExceptionType ==
2001                  WHvX64ExceptionTypeDebugTrapOrFault) &&
2002                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
2003                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
2004                  whpx_breakpoint_instruction)) {
2005                 /* Stopped at a software breakpoint. */
2006                 cpu->exception_index = EXCP_DEBUG;
2007             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
2008                         WHvX64ExceptionTypeDebugTrapOrFault) &&
2009                        !cpu->singlestep_enabled) {
2010                 /*
2011                  * Just finished stepping over a breakpoint, but the
2012                  * gdb does not expect us to do single-stepping.
2013                  * Don't do anything special.
2014                  */
2015                 cpu->exception_index = EXCP_INTERRUPT;
2016             } else {
2017                 /* Another exception or debug event. Report it to GDB. */
2018                 cpu->exception_index = EXCP_DEBUG;
2019             }
2020 
2021             ret = 1;
2022             break;
2023         case WHvRunVpExitReasonNone:
2024         case WHvRunVpExitReasonUnrecoverableException:
2025         case WHvRunVpExitReasonInvalidVpRegisterValue:
2026         case WHvRunVpExitReasonUnsupportedFeature:
2027         default:
2028             error_report("WHPX: Unexpected VP exit code %d",
2029                          vcpu->exit_ctx.ExitReason);
2030             whpx_get_registers(cpu);
2031             bql_lock();
2032             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2033             bql_unlock();
2034             break;
2035         }
2036 
2037     } while (!ret);
2038 
2039     if (stepped_over_bp) {
2040         /* Restore the breakpoint we stepped over */
2041         cpu_memory_rw_debug(cpu,
2042             stepped_over_bp->address,
2043             (void *)&whpx_breakpoint_instruction,
2044             1,
2045             true);
2046     }
2047 
2048     if (exclusive_step_mode != WHPX_STEP_NONE) {
2049         g_assert(cpu_in_exclusive_context(cpu));
2050         cpu->running = false;
2051         end_exclusive();
2052 
2053         exclusive_step_mode = WHPX_STEP_NONE;
2054     } else {
2055         cpu_exec_end(cpu);
2056     }
2057 
2058     bql_lock();
2059     current_cpu = cpu;
2060 
2061     if (--whpx->running_cpus == 0) {
2062         whpx_last_vcpu_stopping(cpu);
2063     }
2064 
2065     qatomic_set(&cpu->exit_request, false);
2066 
2067     return ret < 0;
2068 }
2069 
2070 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2071 {
2072     if (!cpu->vcpu_dirty) {
2073         whpx_get_registers(cpu);
2074         cpu->vcpu_dirty = true;
2075     }
2076 }
2077 
2078 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
2079                                                run_on_cpu_data arg)
2080 {
2081     whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
2082     cpu->vcpu_dirty = false;
2083 }
2084 
2085 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
2086                                               run_on_cpu_data arg)
2087 {
2088     whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
2089     cpu->vcpu_dirty = false;
2090 }
2091 
2092 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
2093                                                run_on_cpu_data arg)
2094 {
2095     cpu->vcpu_dirty = true;
2096 }
2097 
2098 /*
2099  * CPU support.
2100  */
2101 
2102 void whpx_cpu_synchronize_state(CPUState *cpu)
2103 {
2104     if (!cpu->vcpu_dirty) {
2105         run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
2106     }
2107 }
2108 
2109 void whpx_cpu_synchronize_post_reset(CPUState *cpu)
2110 {
2111     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2112 }
2113 
2114 void whpx_cpu_synchronize_post_init(CPUState *cpu)
2115 {
2116     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2117 }
2118 
2119 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
2120 {
2121     run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2122 }
2123 
2124 void whpx_cpu_synchronize_pre_resume(bool step_pending)
2125 {
2126     whpx_global.step_pending = step_pending;
2127 }
2128 
2129 /*
2130  * Vcpu support.
2131  */
2132 
2133 static Error *whpx_migration_blocker;
2134 
2135 static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
2136 {
2137     CPUX86State *env = opaque;
2138 
2139     if (running) {
2140         env->tsc_valid = false;
2141     }
2142 }
2143 
2144 int whpx_init_vcpu(CPUState *cpu)
2145 {
2146     HRESULT hr;
2147     struct whpx_state *whpx = &whpx_global;
2148     AccelCPUState *vcpu = NULL;
2149     Error *local_error = NULL;
2150     X86CPU *x86_cpu = X86_CPU(cpu);
2151     CPUX86State *env = &x86_cpu->env;
2152     UINT64 freq = 0;
2153     int ret;
2154 
2155     /* Add migration blockers for all unsupported features of the
2156      * Windows Hypervisor Platform
2157      */
2158     if (whpx_migration_blocker == NULL) {
2159         error_setg(&whpx_migration_blocker,
2160                "State blocked due to non-migratable CPUID feature support,"
2161                "dirty memory tracking support, and XSAVE/XRSTOR support");
2162 
2163         if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
2164             error_report_err(local_error);
2165             ret = -EINVAL;
2166             goto error;
2167         }
2168     }
2169 
2170     vcpu = g_new0(AccelCPUState, 1);
2171 
2172     hr = whp_dispatch.WHvEmulatorCreateEmulator(
2173         &whpx_emu_callbacks,
2174         &vcpu->emulator);
2175     if (FAILED(hr)) {
2176         error_report("WHPX: Failed to setup instruction completion support,"
2177                      " hr=%08lx", hr);
2178         ret = -EINVAL;
2179         goto error;
2180     }
2181 
2182     hr = whp_dispatch.WHvCreateVirtualProcessor(
2183         whpx->partition, cpu->cpu_index, 0);
2184     if (FAILED(hr)) {
2185         error_report("WHPX: Failed to create a virtual processor,"
2186                      " hr=%08lx", hr);
2187         whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2188         ret = -EINVAL;
2189         goto error;
2190     }
2191 
2192     /*
2193      * vcpu's TSC frequency is either specified by user, or use the value
2194      * provided by Hyper-V if the former is not present. In the latter case, we
2195      * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
2196      * frequency can be migrated later via this field.
2197      */
2198     if (!env->tsc_khz) {
2199         hr = whp_dispatch.WHvGetCapability(
2200             WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
2201                 NULL);
2202         if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2203             if (FAILED(hr)) {
2204                 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
2205             } else {
2206                 env->tsc_khz = freq / 1000; /* Hz to KHz */
2207             }
2208         }
2209     }
2210 
2211     env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
2212     hr = whp_dispatch.WHvGetCapability(
2213         WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
2214     if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2215         if (FAILED(hr)) {
2216             printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
2217         } else {
2218             env->apic_bus_freq = freq;
2219         }
2220     }
2221 
2222     /*
2223      * If the vmware cpuid frequency leaf option is set, and we have a valid
2224      * tsc value, trap the corresponding cpuid's.
2225      */
2226     if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
2227         UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
2228 
2229         hr = whp_dispatch.WHvSetPartitionProperty(
2230                 whpx->partition,
2231                 WHvPartitionPropertyCodeCpuidExitList,
2232                 cpuidExitList,
2233                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2234 
2235         if (FAILED(hr)) {
2236             error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2237                         hr);
2238             ret = -EINVAL;
2239             goto error;
2240         }
2241     }
2242 
2243     vcpu->interruptable = true;
2244     cpu->vcpu_dirty = true;
2245     cpu->accel = vcpu;
2246     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
2247     qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);
2248 
2249     return 0;
2250 
2251 error:
2252     g_free(vcpu);
2253 
2254     return ret;
2255 }
2256 
2257 int whpx_vcpu_exec(CPUState *cpu)
2258 {
2259     int ret;
2260     int fatal;
2261 
2262     for (;;) {
2263         if (cpu->exception_index >= EXCP_INTERRUPT) {
2264             ret = cpu->exception_index;
2265             cpu->exception_index = -1;
2266             break;
2267         }
2268 
2269         fatal = whpx_vcpu_run(cpu);
2270 
2271         if (fatal) {
2272             error_report("WHPX: Failed to exec a virtual processor");
2273             abort();
2274         }
2275     }
2276 
2277     return ret;
2278 }
2279 
2280 void whpx_destroy_vcpu(CPUState *cpu)
2281 {
2282     struct whpx_state *whpx = &whpx_global;
2283     AccelCPUState *vcpu = cpu->accel;
2284 
2285     whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
2286     whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2287     g_free(cpu->accel);
2288     return;
2289 }
2290 
2291 void whpx_vcpu_kick(CPUState *cpu)
2292 {
2293     struct whpx_state *whpx = &whpx_global;
2294     whp_dispatch.WHvCancelRunVirtualProcessor(
2295         whpx->partition, cpu->cpu_index, 0);
2296 }
2297 
2298 /*
2299  * Memory support.
2300  */
2301 
2302 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
2303                                 void *host_va, int add, int rom,
2304                                 const char *name)
2305 {
2306     struct whpx_state *whpx = &whpx_global;
2307     HRESULT hr;
2308 
2309     /*
2310     if (add) {
2311         printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2312                (void*)start_pa, (void*)size, host_va,
2313                (rom ? "ROM" : "RAM"), name);
2314     } else {
2315         printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
2316                (void*)start_pa, (void*)size, host_va, name);
2317     }
2318     */
2319 
2320     if (add) {
2321         hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
2322                                          host_va,
2323                                          start_pa,
2324                                          size,
2325                                          (WHvMapGpaRangeFlagRead |
2326                                           WHvMapGpaRangeFlagExecute |
2327                                           (rom ? 0 : WHvMapGpaRangeFlagWrite)));
2328     } else {
2329         hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
2330                                            start_pa,
2331                                            size);
2332     }
2333 
2334     if (FAILED(hr)) {
2335         error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2336                      " Host:%p, hr=%08lx",
2337                      (add ? "MAP" : "UNMAP"), name,
2338                      (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
2339     }
2340 }
2341 
2342 static void whpx_process_section(MemoryRegionSection *section, int add)
2343 {
2344     MemoryRegion *mr = section->mr;
2345     hwaddr start_pa = section->offset_within_address_space;
2346     ram_addr_t size = int128_get64(section->size);
2347     unsigned int delta;
2348     uint64_t host_va;
2349 
2350     if (!memory_region_is_ram(mr)) {
2351         return;
2352     }
2353 
2354     delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
2355     delta &= ~qemu_real_host_page_mask();
2356     if (delta > size) {
2357         return;
2358     }
2359     start_pa += delta;
2360     size -= delta;
2361     size &= qemu_real_host_page_mask();
2362     if (!size || (start_pa & ~qemu_real_host_page_mask())) {
2363         return;
2364     }
2365 
2366     host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
2367             + section->offset_within_region + delta;
2368 
2369     whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
2370                         memory_region_is_rom(mr), mr->name);
2371 }
2372 
2373 static void whpx_region_add(MemoryListener *listener,
2374                            MemoryRegionSection *section)
2375 {
2376     memory_region_ref(section->mr);
2377     whpx_process_section(section, 1);
2378 }
2379 
2380 static void whpx_region_del(MemoryListener *listener,
2381                            MemoryRegionSection *section)
2382 {
2383     whpx_process_section(section, 0);
2384     memory_region_unref(section->mr);
2385 }
2386 
2387 static void whpx_transaction_begin(MemoryListener *listener)
2388 {
2389 }
2390 
2391 static void whpx_transaction_commit(MemoryListener *listener)
2392 {
2393 }
2394 
2395 static void whpx_log_sync(MemoryListener *listener,
2396                          MemoryRegionSection *section)
2397 {
2398     MemoryRegion *mr = section->mr;
2399 
2400     if (!memory_region_is_ram(mr)) {
2401         return;
2402     }
2403 
2404     memory_region_set_dirty(mr, 0, int128_get64(section->size));
2405 }
2406 
2407 static MemoryListener whpx_memory_listener = {
2408     .name = "whpx",
2409     .begin = whpx_transaction_begin,
2410     .commit = whpx_transaction_commit,
2411     .region_add = whpx_region_add,
2412     .region_del = whpx_region_del,
2413     .log_sync = whpx_log_sync,
2414     .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
2415 };
2416 
2417 static void whpx_memory_init(void)
2418 {
2419     memory_listener_register(&whpx_memory_listener, &address_space_memory);
2420 }
2421 
2422 /*
2423  * Load the functions from the given library, using the given handle. If a
2424  * handle is provided, it is used, otherwise the library is opened. The
2425  * handle will be updated on return with the opened one.
2426  */
2427 static bool load_whp_dispatch_fns(HMODULE *handle,
2428     WHPFunctionList function_list)
2429 {
2430     HMODULE hLib = *handle;
2431 
2432     #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
2433     #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
2434     #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
2435         whp_dispatch.function_name = \
2436             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2437 
2438     #define WHP_LOAD_FIELD(return_type, function_name, signature) \
2439         whp_dispatch.function_name = \
2440             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2441         if (!whp_dispatch.function_name) { \
2442             error_report("Could not load function %s", #function_name); \
2443             goto error; \
2444         } \
2445 
2446     #define WHP_LOAD_LIB(lib_name, handle_lib) \
2447     if (!handle_lib) { \
2448         handle_lib = LoadLibrary(lib_name); \
2449         if (!handle_lib) { \
2450             error_report("Could not load library %s.", lib_name); \
2451             goto error; \
2452         } \
2453     } \
2454 
2455     switch (function_list) {
2456     case WINHV_PLATFORM_FNS_DEFAULT:
2457         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2458         LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
2459         break;
2460 
2461     case WINHV_EMULATION_FNS_DEFAULT:
2462         WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
2463         LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
2464         break;
2465 
2466     case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
2467         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2468         LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
2469         break;
2470     }
2471 
2472     *handle = hLib;
2473     return true;
2474 
2475 error:
2476     if (hLib) {
2477         FreeLibrary(hLib);
2478     }
2479 
2480     return false;
2481 }
2482 
2483 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2484                                    const char *name, void *opaque,
2485                                    Error **errp)
2486 {
2487     struct whpx_state *whpx = &whpx_global;
2488     OnOffSplit mode;
2489 
2490     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2491         return;
2492     }
2493 
2494     switch (mode) {
2495     case ON_OFF_SPLIT_ON:
2496         whpx->kernel_irqchip_allowed = true;
2497         whpx->kernel_irqchip_required = true;
2498         break;
2499 
2500     case ON_OFF_SPLIT_OFF:
2501         whpx->kernel_irqchip_allowed = false;
2502         whpx->kernel_irqchip_required = false;
2503         break;
2504 
2505     case ON_OFF_SPLIT_SPLIT:
2506         error_setg(errp, "WHPX: split irqchip currently not supported");
2507         error_append_hint(errp,
2508             "Try without kernel-irqchip or with kernel-irqchip=on|off");
2509         break;
2510 
2511     default:
2512         /*
2513          * The value was checked in visit_type_OnOffSplit() above. If
2514          * we get here, then something is wrong in QEMU.
2515          */
2516         abort();
2517     }
2518 }
2519 
2520 /*
2521  * Partition support
2522  */
2523 
2524 static int whpx_accel_init(MachineState *ms)
2525 {
2526     struct whpx_state *whpx;
2527     int ret;
2528     HRESULT hr;
2529     WHV_CAPABILITY whpx_cap;
2530     UINT32 whpx_cap_size;
2531     WHV_PARTITION_PROPERTY prop;
2532     UINT32 cpuidExitList[] = {1, 0x80000001};
2533     WHV_CAPABILITY_FEATURES features = {0};
2534 
2535     whpx = &whpx_global;
2536 
2537     if (!init_whp_dispatch()) {
2538         ret = -ENOSYS;
2539         goto error;
2540     }
2541 
2542     whpx->mem_quota = ms->ram_size;
2543 
2544     hr = whp_dispatch.WHvGetCapability(
2545         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2546         sizeof(whpx_cap), &whpx_cap_size);
2547     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2548         error_report("WHPX: No accelerator found, hr=%08lx", hr);
2549         ret = -ENOSPC;
2550         goto error;
2551     }
2552 
2553     hr = whp_dispatch.WHvGetCapability(
2554         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2555     if (FAILED(hr)) {
2556         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2557         ret = -EINVAL;
2558         goto error;
2559     }
2560 
2561     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2562     if (FAILED(hr)) {
2563         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2564         ret = -EINVAL;
2565         goto error;
2566     }
2567 
2568     /*
2569      * Query the XSAVE capability of the partition. Any error here is not
2570      * considered fatal.
2571      */
2572     hr = whp_dispatch.WHvGetPartitionProperty(
2573         whpx->partition,
2574         WHvPartitionPropertyCodeProcessorXsaveFeatures,
2575         &whpx_xsave_cap,
2576         sizeof(whpx_xsave_cap),
2577         &whpx_cap_size);
2578 
2579     /*
2580      * Windows version which don't support this property will return with the
2581      * specific error code.
2582      */
2583     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2584         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2585     }
2586 
2587     if (!whpx_has_xsave()) {
2588         printf("WHPX: Partition is not XSAVE capable\n");
2589     }
2590 
2591     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2592     prop.ProcessorCount = ms->smp.cpus;
2593     hr = whp_dispatch.WHvSetPartitionProperty(
2594         whpx->partition,
2595         WHvPartitionPropertyCodeProcessorCount,
2596         &prop,
2597         sizeof(WHV_PARTITION_PROPERTY));
2598 
2599     if (FAILED(hr)) {
2600         error_report("WHPX: Failed to set partition processor count to %u,"
2601                      " hr=%08lx", prop.ProcessorCount, hr);
2602         ret = -EINVAL;
2603         goto error;
2604     }
2605 
2606     /*
2607      * Error out if WHP doesn't support apic emulation and user is requiring
2608      * it.
2609      */
2610     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2611             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2612         error_report("WHPX: kernel irqchip requested, but unavailable. "
2613             "Try without kernel-irqchip or with kernel-irqchip=off");
2614         ret = -EINVAL;
2615         goto error;
2616     }
2617 
2618     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2619         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2620         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2621             WHvX64LocalApicEmulationModeXApic;
2622         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2623         hr = whp_dispatch.WHvSetPartitionProperty(
2624             whpx->partition,
2625             WHvPartitionPropertyCodeLocalApicEmulationMode,
2626             &mode,
2627             sizeof(mode));
2628         if (FAILED(hr)) {
2629             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2630             if (whpx->kernel_irqchip_required) {
2631                 error_report("WHPX: kernel irqchip requested, but unavailable");
2632                 ret = -EINVAL;
2633                 goto error;
2634             }
2635         } else {
2636             whpx->apic_in_platform = true;
2637         }
2638     }
2639 
2640     /* Register for MSR and CPUID exits */
2641     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2642     prop.ExtendedVmExits.X64MsrExit = 1;
2643     prop.ExtendedVmExits.X64CpuidExit = 1;
2644     prop.ExtendedVmExits.ExceptionExit = 1;
2645     if (whpx_apic_in_platform()) {
2646         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2647     }
2648 
2649     hr = whp_dispatch.WHvSetPartitionProperty(
2650             whpx->partition,
2651             WHvPartitionPropertyCodeExtendedVmExits,
2652             &prop,
2653             sizeof(WHV_PARTITION_PROPERTY));
2654     if (FAILED(hr)) {
2655         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2656         ret = -EINVAL;
2657         goto error;
2658     }
2659 
2660     hr = whp_dispatch.WHvSetPartitionProperty(
2661         whpx->partition,
2662         WHvPartitionPropertyCodeCpuidExitList,
2663         cpuidExitList,
2664         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2665 
2666     if (FAILED(hr)) {
2667         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2668                      hr);
2669         ret = -EINVAL;
2670         goto error;
2671     }
2672 
2673     /*
2674      * We do not want to intercept any exceptions from the guest,
2675      * until we actually start debugging with gdb.
2676      */
2677     whpx->exception_exit_bitmap = -1;
2678     hr = whpx_set_exception_exit_bitmap(0);
2679 
2680     if (FAILED(hr)) {
2681         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2682         ret = -EINVAL;
2683         goto error;
2684     }
2685 
2686     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2687     if (FAILED(hr)) {
2688         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2689         ret = -EINVAL;
2690         goto error;
2691     }
2692 
2693     whpx_memory_init();
2694 
2695     printf("Windows Hypervisor Platform accelerator is operational\n");
2696     return 0;
2697 
2698 error:
2699 
2700     if (NULL != whpx->partition) {
2701         whp_dispatch.WHvDeletePartition(whpx->partition);
2702         whpx->partition = NULL;
2703     }
2704 
2705     return ret;
2706 }
2707 
2708 int whpx_enabled(void)
2709 {
2710     return whpx_allowed;
2711 }
2712 
2713 bool whpx_apic_in_platform(void) {
2714     return whpx_global.apic_in_platform;
2715 }
2716 
2717 static void whpx_accel_class_init(ObjectClass *oc, void *data)
2718 {
2719     AccelClass *ac = ACCEL_CLASS(oc);
2720     ac->name = "WHPX";
2721     ac->init_machine = whpx_accel_init;
2722     ac->allowed = &whpx_allowed;
2723 
2724     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
2725         NULL, whpx_set_kernel_irqchip,
2726         NULL, NULL);
2727     object_class_property_set_description(oc, "kernel-irqchip",
2728         "Configure WHPX in-kernel irqchip");
2729 }
2730 
2731 static void whpx_accel_instance_init(Object *obj)
2732 {
2733     struct whpx_state *whpx = &whpx_global;
2734 
2735     memset(whpx, 0, sizeof(struct whpx_state));
2736     /* Turn on kernel-irqchip, by default */
2737     whpx->kernel_irqchip_allowed = true;
2738 }
2739 
2740 static const TypeInfo whpx_accel_type = {
2741     .name = ACCEL_CLASS_NAME("whpx"),
2742     .parent = TYPE_ACCEL,
2743     .instance_init = whpx_accel_instance_init,
2744     .class_init = whpx_accel_class_init,
2745 };
2746 
2747 static void whpx_type_init(void)
2748 {
2749     type_register_static(&whpx_accel_type);
2750 }
2751 
2752 bool init_whp_dispatch(void)
2753 {
2754     if (whp_dispatch_initialized) {
2755         return true;
2756     }
2757 
2758     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2759         goto error;
2760     }
2761 
2762     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2763         goto error;
2764     }
2765 
2766     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2767         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2768     whp_dispatch_initialized = true;
2769 
2770     return true;
2771 error:
2772     if (hWinHvPlatform) {
2773         FreeLibrary(hWinHvPlatform);
2774     }
2775 
2776     if (hWinHvEmulation) {
2777         FreeLibrary(hWinHvEmulation);
2778     }
2779 
2780     return false;
2781 }
2782 
2783 type_init(whpx_type_init);
2784