xref: /openbmc/qemu/target/i386/whpx/whpx-all.c (revision abb6295b)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "exec/address-spaces.h"
14 #include "exec/ioport.h"
15 #include "exec/gdbstub.h"
16 #include "qemu/accel.h"
17 #include "sysemu/whpx.h"
18 #include "sysemu/cpus.h"
19 #include "sysemu/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/i386/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include <winerror.h>
30 
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
33 
34 #include <WinHvPlatform.h>
35 #include <WinHvEmulation.h>
36 
/* Hyper-V APIC bus frequency constant: 200,000,000 (presumably Hz - confirm). */
#define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)
38 
39 static const WHV_REGISTER_NAME whpx_register_names[] = {
40 
41     /* X64 General purpose registers */
42     WHvX64RegisterRax,
43     WHvX64RegisterRcx,
44     WHvX64RegisterRdx,
45     WHvX64RegisterRbx,
46     WHvX64RegisterRsp,
47     WHvX64RegisterRbp,
48     WHvX64RegisterRsi,
49     WHvX64RegisterRdi,
50     WHvX64RegisterR8,
51     WHvX64RegisterR9,
52     WHvX64RegisterR10,
53     WHvX64RegisterR11,
54     WHvX64RegisterR12,
55     WHvX64RegisterR13,
56     WHvX64RegisterR14,
57     WHvX64RegisterR15,
58     WHvX64RegisterRip,
59     WHvX64RegisterRflags,
60 
61     /* X64 Segment registers */
62     WHvX64RegisterEs,
63     WHvX64RegisterCs,
64     WHvX64RegisterSs,
65     WHvX64RegisterDs,
66     WHvX64RegisterFs,
67     WHvX64RegisterGs,
68     WHvX64RegisterLdtr,
69     WHvX64RegisterTr,
70 
71     /* X64 Table registers */
72     WHvX64RegisterIdtr,
73     WHvX64RegisterGdtr,
74 
75     /* X64 Control Registers */
76     WHvX64RegisterCr0,
77     WHvX64RegisterCr2,
78     WHvX64RegisterCr3,
79     WHvX64RegisterCr4,
80     WHvX64RegisterCr8,
81 
82     /* X64 Debug Registers */
83     /*
84      * WHvX64RegisterDr0,
85      * WHvX64RegisterDr1,
86      * WHvX64RegisterDr2,
87      * WHvX64RegisterDr3,
88      * WHvX64RegisterDr6,
89      * WHvX64RegisterDr7,
90      */
91 
92     /* X64 Floating Point and Vector Registers */
93     WHvX64RegisterXmm0,
94     WHvX64RegisterXmm1,
95     WHvX64RegisterXmm2,
96     WHvX64RegisterXmm3,
97     WHvX64RegisterXmm4,
98     WHvX64RegisterXmm5,
99     WHvX64RegisterXmm6,
100     WHvX64RegisterXmm7,
101     WHvX64RegisterXmm8,
102     WHvX64RegisterXmm9,
103     WHvX64RegisterXmm10,
104     WHvX64RegisterXmm11,
105     WHvX64RegisterXmm12,
106     WHvX64RegisterXmm13,
107     WHvX64RegisterXmm14,
108     WHvX64RegisterXmm15,
109     WHvX64RegisterFpMmx0,
110     WHvX64RegisterFpMmx1,
111     WHvX64RegisterFpMmx2,
112     WHvX64RegisterFpMmx3,
113     WHvX64RegisterFpMmx4,
114     WHvX64RegisterFpMmx5,
115     WHvX64RegisterFpMmx6,
116     WHvX64RegisterFpMmx7,
117     WHvX64RegisterFpControlStatus,
118     WHvX64RegisterXmmControlStatus,
119 
120     /* X64 MSRs */
121     WHvX64RegisterEfer,
122 #ifdef TARGET_X86_64
123     WHvX64RegisterKernelGsBase,
124 #endif
125     WHvX64RegisterApicBase,
126     /* WHvX64RegisterPat, */
127     WHvX64RegisterSysenterCs,
128     WHvX64RegisterSysenterEip,
129     WHvX64RegisterSysenterEsp,
130     WHvX64RegisterStar,
131 #ifdef TARGET_X86_64
132     WHvX64RegisterLstar,
133     WHvX64RegisterCstar,
134     WHvX64RegisterSfmask,
135 #endif
136 
137     /* Interrupt / Event Registers */
138     /*
139      * WHvRegisterPendingInterruption,
140      * WHvRegisterInterruptState,
141      * WHvRegisterPendingEvent0,
142      * WHvRegisterPendingEvent1
143      * WHvX64RegisterDeliverabilityNotifications,
144      */
145 };
146 
147 struct whpx_register_set {
148     WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
149 };
150 
151 /*
152  * The current implementation of instruction stepping sets the TF flag
153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
155  *
156  * This approach has a few limitations:
157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158  *        along with the other flags, possibly restoring it later. It would
159  *        result in another INT1 when the flags are restored, triggering
160  *        a stop in gdb that could be cleared by doing another step.
161  *
 *        Stepping over a POPF/LAHF instruction will let it overwrite the
 *        TF flag, ending the stepping mode.
164  *
165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166  *        or anything that could result in a page fault) will save the flags
167  *        to the stack, clear the TF flag, and let the guest execute the
168  *        handler. Normally, the guest will restore the original flags,
169  *        that will continue single-stepping.
170  *
 *     3. Debuggers running on the guest may wish to set TF to do instruction
 *        stepping. INT1 events generated this way would be intercepted by us,
 *        as long as gdb is connected to QEMU.
174  *
175  * In practice this means that:
176  *     1. Stepping through flags-modifying instructions may cause gdb to
177  *        continue or stop in unexpected places. This will be fully recoverable
178  *        and will not crash the target.
179  *
180  *     2. Stepping over an instruction that triggers an exception will step
181  *        over the exception handler, not into it.
182  *
183  *     3. Debugging the guest via gdb, while running debugger on the guest
184  *        at the same time may lead to unexpected effects. Removing all
185  *        breakpoints set via QEMU will prevent any further interference
186  *        with the guest-level debuggers.
187  *
188  * The limitations can be addressed as shown below:
189  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
190  *        stepping through them. The exact semantics of the instructions is
191  *        defined in the "Combined Volume Set of Intel 64 and IA-32
192  *        Architectures Software Developer's Manuals", however it involves a
193  *        fair amount of corner cases due to compatibility with real mode,
194  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
195  *
196  *     2. We could step into the guest's exception handlers using the following
197  *        sequence:
198  *          a. Temporarily enable catching of all exception types via
199  *             whpx_set_exception_exit_bitmap().
200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
201  *             the original handler.
202  *          c. Patch the original handler, injecting an INT3 at the beginning.
203  *          d. Update the exception exit bitmap to only catch the
204  *             WHvX64ExceptionTypeBreakpointTrap exception.
205  *          e. Let the affected CPU run in the exclusive mode.
206  *          f. Restore the original handler and the exception exit bitmap.
207  *        Note that handling all corner cases related to IDT/GDT is harder
208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
209  *        rough idea.
210  *
211  *     3. In order to properly support guest-level debugging in parallel with
212  *        the QEMU-level debugging, we would need to be able to pass some INT1
213  *        events to the guest. This could be done via the following methods:
214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215  *             it seems to only work for interrupts and not software
216  *             exceptions.
217  *          b. Locating and patching the original handler by parsing IDT/GDT.
218  *             This involves relatively complex logic outlined in the previous
219  *             paragraph.
220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
221  *             RFLAGS, and pushing the old values to stack). This is even more
222  *             complicated than the previous option, since it involves checking
223  *             CPL, gate attributes, and doing various adjustments depending
224  *             on the current CPU mode, whether the CPL is changing, etc.
225  */
/* Selects how a single-step request is carried out. */
typedef enum WhpxStepMode {
    /* Zero value: no stepping mode selected */
    WHPX_STEP_NONE = 0,

    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
231 
232 struct whpx_vcpu {
233     WHV_EMULATOR_HANDLE emulator;
234     bool window_registered;
235     bool interruptable;
236     bool ready_for_pic_interrupt;
237     uint64_t tpr;
238     uint64_t apic_base;
239     bool interruption_pending;
240 
241     /* Must be the last field as it may have a tail */
242     WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
243 };
244 
245 static bool whpx_allowed;
246 static bool whp_dispatch_initialized;
247 static HMODULE hWinHvPlatform, hWinHvEmulation;
248 static uint32_t max_vcpu_index;
249 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;
250 
251 struct whpx_state whpx_global;
252 struct WHPDispatch whp_dispatch;
253 
254 static bool whpx_has_xsave(void)
255 {
256     return whpx_xsave_cap.XsaveSupport;
257 }
258 
259 /*
260  * VP support
261  */
262 
263 static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu)
264 {
265     return (struct whpx_vcpu *)cpu->hax_vcpu;
266 }
267 
268 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
269                                              int r86)
270 {
271     WHV_X64_SEGMENT_REGISTER hs;
272     unsigned flags = qs->flags;
273 
274     hs.Base = qs->base;
275     hs.Limit = qs->limit;
276     hs.Selector = qs->selector;
277 
278     if (v86) {
279         hs.Attributes = 0;
280         hs.SegmentType = 3;
281         hs.Present = 1;
282         hs.DescriptorPrivilegeLevel = 3;
283         hs.NonSystemSegment = 1;
284 
285     } else {
286         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
287 
288         if (r86) {
289             /* hs.Base &= 0xfffff; */
290         }
291     }
292 
293     return hs;
294 }
295 
296 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
297 {
298     SegmentCache qs;
299 
300     qs.base = hs->Base;
301     qs.limit = hs->Limit;
302     qs.selector = hs->Selector;
303 
304     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
305 
306     return qs;
307 }
308 
309 /* X64 Extended Control Registers */
310 static void whpx_set_xcrs(CPUState *cpu)
311 {
312     CPUX86State *env = cpu->env_ptr;
313     HRESULT hr;
314     struct whpx_state *whpx = &whpx_global;
315     WHV_REGISTER_VALUE xcr0;
316     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
317 
318     if (!whpx_has_xsave()) {
319         return;
320     }
321 
322     /* Only xcr0 is supported by the hypervisor currently */
323     xcr0.Reg64 = env->xcr0;
324     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
325         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
326     if (FAILED(hr)) {
327         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
328     }
329 }
330 
331 static int whpx_set_tsc(CPUState *cpu)
332 {
333     CPUX86State *env = cpu->env_ptr;
334     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
335     WHV_REGISTER_VALUE tsc_val;
336     HRESULT hr;
337     struct whpx_state *whpx = &whpx_global;
338 
339     /*
340      * Suspend the partition prior to setting the TSC to reduce the variance
341      * in TSC across vCPUs. When the first vCPU runs post suspend, the
342      * partition is automatically resumed.
343      */
344     if (whp_dispatch.WHvSuspendPartitionTime) {
345 
346         /*
347          * Unable to suspend partition while setting TSC is not a fatal
348          * error. It just increases the likelihood of TSC variance between
349          * vCPUs and some guest OS are able to handle that just fine.
350          */
351         hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
352         if (FAILED(hr)) {
353             warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
354         }
355     }
356 
357     tsc_val.Reg64 = env->tsc;
358     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
359         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
360     if (FAILED(hr)) {
361         error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
362         return -1;
363     }
364 
365     return 0;
366 }
367 
/*
 * CR8 mirrors the task priority register (TPR) of the APIC, but the two use
 * a slightly different encoding:
 *
 *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
 *
 * See section 10.8.6.1 of Volume 3 of the Intel 64 and IA-32 Architectures
 * Software Developer's Manual.
 *
 * This helper converts an APIC-encoded TPR value into its CR8 encoding.
 */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    /* CR8 starts at bit 4 of the APIC TPR value. */
    const unsigned cr8_shift = 4;

    return tpr >> cr8_shift;
}
382 
383 static void whpx_set_registers(CPUState *cpu, int level)
384 {
385     struct whpx_state *whpx = &whpx_global;
386     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
387     CPUX86State *env = cpu->env_ptr;
388     X86CPU *x86_cpu = X86_CPU(cpu);
389     struct whpx_register_set vcxt;
390     HRESULT hr;
391     int idx;
392     int idx_next;
393     int i;
394     int v86, r86;
395 
396     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
397 
398     /*
399      * Following MSRs have side effects on the guest or are too heavy for
400      * runtime. Limit them to full state update.
401      */
402     if (level >= WHPX_SET_RESET_STATE) {
403         whpx_set_tsc(cpu);
404     }
405 
406     memset(&vcxt, 0, sizeof(struct whpx_register_set));
407 
408     v86 = (env->eflags & VM_MASK);
409     r86 = !(env->cr[0] & CR0_PE_MASK);
410 
411     vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
412     vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);
413 
414     idx = 0;
415 
416     /* Indexes for first 16 registers match between HV and QEMU definitions */
417     idx_next = 16;
418     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
419         vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
420     }
421     idx = idx_next;
422 
423     /* Same goes for RIP and RFLAGS */
424     assert(whpx_register_names[idx] == WHvX64RegisterRip);
425     vcxt.values[idx++].Reg64 = env->eip;
426 
427     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
428     vcxt.values[idx++].Reg64 = env->eflags;
429 
430     /* Translate 6+4 segment registers. HV and QEMU order matches  */
431     assert(idx == WHvX64RegisterEs);
432     for (i = 0; i < 6; i += 1, idx += 1) {
433         vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
434     }
435 
436     assert(idx == WHvX64RegisterLdtr);
437     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
438 
439     assert(idx == WHvX64RegisterTr);
440     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
441 
442     assert(idx == WHvX64RegisterIdtr);
443     vcxt.values[idx].Table.Base = env->idt.base;
444     vcxt.values[idx].Table.Limit = env->idt.limit;
445     idx += 1;
446 
447     assert(idx == WHvX64RegisterGdtr);
448     vcxt.values[idx].Table.Base = env->gdt.base;
449     vcxt.values[idx].Table.Limit = env->gdt.limit;
450     idx += 1;
451 
452     /* CR0, 2, 3, 4, 8 */
453     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
454     vcxt.values[idx++].Reg64 = env->cr[0];
455     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
456     vcxt.values[idx++].Reg64 = env->cr[2];
457     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
458     vcxt.values[idx++].Reg64 = env->cr[3];
459     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
460     vcxt.values[idx++].Reg64 = env->cr[4];
461     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
462     vcxt.values[idx++].Reg64 = vcpu->tpr;
463 
464     /* 8 Debug Registers - Skipped */
465 
466     /*
467      * Extended control registers needs to be handled separately depending
468      * on whether xsave is supported/enabled or not.
469      */
470     whpx_set_xcrs(cpu);
471 
472     /* 16 XMM registers */
473     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
474     idx_next = idx + 16;
475     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
476         vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
477         vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
478     }
479     idx = idx_next;
480 
481     /* 8 FP registers */
482     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
483     for (i = 0; i < 8; i += 1, idx += 1) {
484         vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
485         /* vcxt.values[idx].Fp.AsUINT128.High64 =
486                env->fpregs[i].mmx.MMX_Q(1);
487         */
488     }
489 
490     /* FP control status register */
491     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
492     vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
493     vcxt.values[idx].FpControlStatus.FpStatus =
494         (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
495     vcxt.values[idx].FpControlStatus.FpTag = 0;
496     for (i = 0; i < 8; ++i) {
497         vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
498     }
499     vcxt.values[idx].FpControlStatus.Reserved = 0;
500     vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
501     vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
502     idx += 1;
503 
504     /* XMM control status register */
505     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
506     vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
507     vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
508     vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
509     idx += 1;
510 
511     /* MSRs */
512     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
513     vcxt.values[idx++].Reg64 = env->efer;
514 #ifdef TARGET_X86_64
515     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
516     vcxt.values[idx++].Reg64 = env->kernelgsbase;
517 #endif
518 
519     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
520     vcxt.values[idx++].Reg64 = vcpu->apic_base;
521 
522     /* WHvX64RegisterPat - Skipped */
523 
524     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
525     vcxt.values[idx++].Reg64 = env->sysenter_cs;
526     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
527     vcxt.values[idx++].Reg64 = env->sysenter_eip;
528     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
529     vcxt.values[idx++].Reg64 = env->sysenter_esp;
530     assert(whpx_register_names[idx] == WHvX64RegisterStar);
531     vcxt.values[idx++].Reg64 = env->star;
532 #ifdef TARGET_X86_64
533     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
534     vcxt.values[idx++].Reg64 = env->lstar;
535     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
536     vcxt.values[idx++].Reg64 = env->cstar;
537     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
538     vcxt.values[idx++].Reg64 = env->fmask;
539 #endif
540 
541     /* Interrupt / Event Registers - Skipped */
542 
543     assert(idx == RTL_NUMBER_OF(whpx_register_names));
544 
545     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
546         whpx->partition, cpu->cpu_index,
547         whpx_register_names,
548         RTL_NUMBER_OF(whpx_register_names),
549         &vcxt.values[0]);
550 
551     if (FAILED(hr)) {
552         error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
553                      hr);
554     }
555 
556     return;
557 }
558 
559 static int whpx_get_tsc(CPUState *cpu)
560 {
561     CPUX86State *env = cpu->env_ptr;
562     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
563     WHV_REGISTER_VALUE tsc_val;
564     HRESULT hr;
565     struct whpx_state *whpx = &whpx_global;
566 
567     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
568         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
569     if (FAILED(hr)) {
570         error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
571         return -1;
572     }
573 
574     env->tsc = tsc_val.Reg64;
575     return 0;
576 }
577 
578 /* X64 Extended Control Registers */
579 static void whpx_get_xcrs(CPUState *cpu)
580 {
581     CPUX86State *env = cpu->env_ptr;
582     HRESULT hr;
583     struct whpx_state *whpx = &whpx_global;
584     WHV_REGISTER_VALUE xcr0;
585     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
586 
587     if (!whpx_has_xsave()) {
588         return;
589     }
590 
591     /* Only xcr0 is supported by the hypervisor currently */
592     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
593         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
594     if (FAILED(hr)) {
595         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
596         return;
597     }
598 
599     env->xcr0 = xcr0.Reg64;
600 }
601 
602 static void whpx_get_registers(CPUState *cpu)
603 {
604     struct whpx_state *whpx = &whpx_global;
605     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
606     CPUX86State *env = cpu->env_ptr;
607     X86CPU *x86_cpu = X86_CPU(cpu);
608     struct whpx_register_set vcxt;
609     uint64_t tpr, apic_base;
610     HRESULT hr;
611     int idx;
612     int idx_next;
613     int i;
614 
615     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
616 
617     if (!env->tsc_valid) {
618         whpx_get_tsc(cpu);
619         env->tsc_valid = !runstate_is_running();
620     }
621 
622     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
623         whpx->partition, cpu->cpu_index,
624         whpx_register_names,
625         RTL_NUMBER_OF(whpx_register_names),
626         &vcxt.values[0]);
627     if (FAILED(hr)) {
628         error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
629                      hr);
630     }
631 
632     if (whpx_apic_in_platform()) {
633         /*
634          * Fetch the TPR value from the emulated APIC. It may get overwritten
635          * below with the value from CR8 returned by
636          * WHvGetVirtualProcessorRegisters().
637          */
638         whpx_apic_get(x86_cpu->apic_state);
639         vcpu->tpr = whpx_apic_tpr_to_cr8(
640             cpu_get_apic_tpr(x86_cpu->apic_state));
641     }
642 
643     idx = 0;
644 
645     /* Indexes for first 16 registers match between HV and QEMU definitions */
646     idx_next = 16;
647     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
648         env->regs[idx] = vcxt.values[idx].Reg64;
649     }
650     idx = idx_next;
651 
652     /* Same goes for RIP and RFLAGS */
653     assert(whpx_register_names[idx] == WHvX64RegisterRip);
654     env->eip = vcxt.values[idx++].Reg64;
655     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
656     env->eflags = vcxt.values[idx++].Reg64;
657 
658     /* Translate 6+4 segment registers. HV and QEMU order matches  */
659     assert(idx == WHvX64RegisterEs);
660     for (i = 0; i < 6; i += 1, idx += 1) {
661         env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
662     }
663 
664     assert(idx == WHvX64RegisterLdtr);
665     env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
666     assert(idx == WHvX64RegisterTr);
667     env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
668     assert(idx == WHvX64RegisterIdtr);
669     env->idt.base = vcxt.values[idx].Table.Base;
670     env->idt.limit = vcxt.values[idx].Table.Limit;
671     idx += 1;
672     assert(idx == WHvX64RegisterGdtr);
673     env->gdt.base = vcxt.values[idx].Table.Base;
674     env->gdt.limit = vcxt.values[idx].Table.Limit;
675     idx += 1;
676 
677     /* CR0, 2, 3, 4, 8 */
678     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
679     env->cr[0] = vcxt.values[idx++].Reg64;
680     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
681     env->cr[2] = vcxt.values[idx++].Reg64;
682     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
683     env->cr[3] = vcxt.values[idx++].Reg64;
684     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
685     env->cr[4] = vcxt.values[idx++].Reg64;
686     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
687     tpr = vcxt.values[idx++].Reg64;
688     if (tpr != vcpu->tpr) {
689         vcpu->tpr = tpr;
690         cpu_set_apic_tpr(x86_cpu->apic_state, tpr);
691     }
692 
693     /* 8 Debug Registers - Skipped */
694 
695     /*
696      * Extended control registers needs to be handled separately depending
697      * on whether xsave is supported/enabled or not.
698      */
699     whpx_get_xcrs(cpu);
700 
701     /* 16 XMM registers */
702     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
703     idx_next = idx + 16;
704     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
705         env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
706         env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
707     }
708     idx = idx_next;
709 
710     /* 8 FP registers */
711     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
712     for (i = 0; i < 8; i += 1, idx += 1) {
713         env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
714         /* env->fpregs[i].mmx.MMX_Q(1) =
715                vcxt.values[idx].Fp.AsUINT128.High64;
716         */
717     }
718 
719     /* FP control status register */
720     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
721     env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
722     env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
723     env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
724     for (i = 0; i < 8; ++i) {
725         env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
726     }
727     env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
728     env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
729     idx += 1;
730 
731     /* XMM control status register */
732     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
733     env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
734     idx += 1;
735 
736     /* MSRs */
737     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
738     env->efer = vcxt.values[idx++].Reg64;
739 #ifdef TARGET_X86_64
740     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
741     env->kernelgsbase = vcxt.values[idx++].Reg64;
742 #endif
743 
744     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
745     apic_base = vcxt.values[idx++].Reg64;
746     if (apic_base != vcpu->apic_base) {
747         vcpu->apic_base = apic_base;
748         cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
749     }
750 
751     /* WHvX64RegisterPat - Skipped */
752 
753     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
754     env->sysenter_cs = vcxt.values[idx++].Reg64;
755     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
756     env->sysenter_eip = vcxt.values[idx++].Reg64;
757     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
758     env->sysenter_esp = vcxt.values[idx++].Reg64;
759     assert(whpx_register_names[idx] == WHvX64RegisterStar);
760     env->star = vcxt.values[idx++].Reg64;
761 #ifdef TARGET_X86_64
762     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
763     env->lstar = vcxt.values[idx++].Reg64;
764     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
765     env->cstar = vcxt.values[idx++].Reg64;
766     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
767     env->fmask = vcxt.values[idx++].Reg64;
768 #endif
769 
770     /* Interrupt / Event Registers - Skipped */
771 
772     assert(idx == RTL_NUMBER_OF(whpx_register_names));
773 
774     if (whpx_apic_in_platform()) {
775         whpx_apic_get(x86_cpu->apic_state);
776     }
777 
778     x86_update_hflags(env);
779 
780     return;
781 }
782 
783 static HRESULT CALLBACK whpx_emu_ioport_callback(
784     void *ctx,
785     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
786 {
787     MemTxAttrs attrs = { 0 };
788     address_space_rw(&address_space_io, IoAccess->Port, attrs,
789                      &IoAccess->Data, IoAccess->AccessSize,
790                      IoAccess->Direction);
791     return S_OK;
792 }
793 
794 static HRESULT CALLBACK whpx_emu_mmio_callback(
795     void *ctx,
796     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
797 {
798     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
799                            ma->Direction);
800     return S_OK;
801 }
802 
803 static HRESULT CALLBACK whpx_emu_getreg_callback(
804     void *ctx,
805     const WHV_REGISTER_NAME *RegisterNames,
806     UINT32 RegisterCount,
807     WHV_REGISTER_VALUE *RegisterValues)
808 {
809     HRESULT hr;
810     struct whpx_state *whpx = &whpx_global;
811     CPUState *cpu = (CPUState *)ctx;
812 
813     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
814         whpx->partition, cpu->cpu_index,
815         RegisterNames, RegisterCount,
816         RegisterValues);
817     if (FAILED(hr)) {
818         error_report("WHPX: Failed to get virtual processor registers,"
819                      " hr=%08lx", hr);
820     }
821 
822     return hr;
823 }
824 
825 static HRESULT CALLBACK whpx_emu_setreg_callback(
826     void *ctx,
827     const WHV_REGISTER_NAME *RegisterNames,
828     UINT32 RegisterCount,
829     const WHV_REGISTER_VALUE *RegisterValues)
830 {
831     HRESULT hr;
832     struct whpx_state *whpx = &whpx_global;
833     CPUState *cpu = (CPUState *)ctx;
834 
835     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
836         whpx->partition, cpu->cpu_index,
837         RegisterNames, RegisterCount,
838         RegisterValues);
839     if (FAILED(hr)) {
840         error_report("WHPX: Failed to set virtual processor registers,"
841                      " hr=%08lx", hr);
842     }
843 
844     /*
845      * The emulator just successfully wrote the register state. We clear the
846      * dirty state so we avoid the double write on resume of the VP.
847      */
848     cpu->vcpu_dirty = false;
849 
850     return hr;
851 }
852 
853 static HRESULT CALLBACK whpx_emu_translate_callback(
854     void *ctx,
855     WHV_GUEST_VIRTUAL_ADDRESS Gva,
856     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
857     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
858     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
859 {
860     HRESULT hr;
861     struct whpx_state *whpx = &whpx_global;
862     CPUState *cpu = (CPUState *)ctx;
863     WHV_TRANSLATE_GVA_RESULT res;
864 
865     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
866                                       Gva, TranslateFlags, &res, Gpa);
867     if (FAILED(hr)) {
868         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
869     } else {
870         *TranslationResult = res.ResultCode;
871     }
872 
873     return hr;
874 }
875 
876 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
877     .Size = sizeof(WHV_EMULATOR_CALLBACKS),
878     .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
879     .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
880     .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
881     .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
882     .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
883 };
884 
885 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
886 {
887     HRESULT hr;
888     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
889     WHV_EMULATOR_STATUS emu_status;
890 
891     hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
892         vcpu->emulator, cpu,
893         &vcpu->exit_ctx.VpContext, ctx,
894         &emu_status);
895     if (FAILED(hr)) {
896         error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
897         return -1;
898     }
899 
900     if (!emu_status.EmulationSuccessful) {
901         error_report("WHPX: Failed to emulate MMIO access with"
902                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
903         return -1;
904     }
905 
906     return 0;
907 }
908 
909 static int whpx_handle_portio(CPUState *cpu,
910                               WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
911 {
912     HRESULT hr;
913     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
914     WHV_EMULATOR_STATUS emu_status;
915 
916     hr = whp_dispatch.WHvEmulatorTryIoEmulation(
917         vcpu->emulator, cpu,
918         &vcpu->exit_ctx.VpContext, ctx,
919         &emu_status);
920     if (FAILED(hr)) {
921         error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
922         return -1;
923     }
924 
925     if (!emu_status.EmulationSuccessful) {
926         error_report("WHPX: Failed to emulate PortIO access with"
927                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
928         return -1;
929     }
930 
931     return 0;
932 }
933 
934 /*
935  * Controls whether we should intercept various exceptions on the guest,
936  * namely breakpoint/single-step events.
937  *
938  * The 'exceptions' argument accepts a bitmask, e.g:
939  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
940  */
941 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
942 {
943     struct whpx_state *whpx = &whpx_global;
944     WHV_PARTITION_PROPERTY prop = { 0, };
945     HRESULT hr;
946 
947     if (exceptions == whpx->exception_exit_bitmap) {
948         return S_OK;
949     }
950 
951     prop.ExceptionExitBitmap = exceptions;
952 
953     hr = whp_dispatch.WHvSetPartitionProperty(
954         whpx->partition,
955         WHvPartitionPropertyCodeExceptionExitBitmap,
956         &prop,
957         sizeof(WHV_PARTITION_PROPERTY));
958 
959     if (SUCCEEDED(hr)) {
960         whpx->exception_exit_bitmap = exceptions;
961     }
962 
963     return hr;
964 }
965 
966 
967 /*
968  * This function is called before/after stepping over a single instruction.
969  * It will update the CPU registers to arm/disarm the instruction stepping
970  * accordingly.
971  */
972 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
973     bool set,
974     uint64_t *exit_context_rflags)
975 {
976     WHV_REGISTER_NAME reg_name;
977     WHV_REGISTER_VALUE reg_value;
978     HRESULT hr;
979     struct whpx_state *whpx = &whpx_global;
980 
981     /*
982      * If we are trying to step over a single instruction, we need to set the
983      * TF bit in rflags. Otherwise, clear it.
984      */
985     reg_name = WHvX64RegisterRflags;
986     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
987         whpx->partition,
988         cpu->cpu_index,
989         &reg_name,
990         1,
991         &reg_value);
992 
993     if (FAILED(hr)) {
994         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
995         return hr;
996     }
997 
998     if (exit_context_rflags) {
999         assert(*exit_context_rflags == reg_value.Reg64);
1000     }
1001 
1002     if (set) {
1003         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
1004         reg_value.Reg64 |= TF_MASK;
1005     } else {
1006         reg_value.Reg64 &= ~TF_MASK;
1007     }
1008 
1009     if (exit_context_rflags) {
1010         *exit_context_rflags = reg_value.Reg64;
1011     }
1012 
1013     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1014         whpx->partition,
1015         cpu->cpu_index,
1016         &reg_name,
1017         1,
1018         &reg_value);
1019 
1020     if (FAILED(hr)) {
1021         error_report("WHPX: Failed to set rflags,"
1022             " hr=%08lx",
1023             hr);
1024         return hr;
1025     }
1026 
1027     reg_name = WHvRegisterInterruptState;
1028     reg_value.Reg64 = 0;
1029 
1030     /* Suspend delivery of hardware interrupts during single-stepping. */
1031     reg_value.InterruptState.InterruptShadow = set != 0;
1032 
1033     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1034     whpx->partition,
1035         cpu->cpu_index,
1036         &reg_name,
1037         1,
1038         &reg_value);
1039 
1040     if (FAILED(hr)) {
1041         error_report("WHPX: Failed to set InterruptState,"
1042             " hr=%08lx",
1043             hr);
1044         return hr;
1045     }
1046 
1047     if (!set) {
1048         /*
1049          * We have just finished stepping over a single instruction,
1050          * and intercepted the INT1 generated by it.
1051          * We need to now hide the INT1 from the guest,
1052          * as it would not be expecting it.
1053          */
1054 
1055         reg_name = WHvX64RegisterPendingDebugException;
1056         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1057         whpx->partition,
1058             cpu->cpu_index,
1059             &reg_name,
1060             1,
1061             &reg_value);
1062 
1063         if (FAILED(hr)) {
1064             error_report("WHPX: Failed to get pending debug exceptions,"
1065                          "hr=%08lx", hr);
1066             return hr;
1067         }
1068 
1069         if (reg_value.PendingDebugException.SingleStep) {
1070             reg_value.PendingDebugException.SingleStep = 0;
1071 
1072             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1073                 whpx->partition,
1074                 cpu->cpu_index,
1075                 &reg_name,
1076                 1,
1077                 &reg_value);
1078 
1079             if (FAILED(hr)) {
1080                 error_report("WHPX: Failed to clear pending debug exceptions,"
1081                              "hr=%08lx", hr);
1082              return hr;
1083             }
1084         }
1085 
1086     }
1087 
1088     return S_OK;
1089 }
1090 
1091 /* Tries to find a breakpoint at the specified address. */
1092 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1093 {
1094     struct whpx_state *whpx = &whpx_global;
1095     int i;
1096 
1097     if (whpx->breakpoints.breakpoints) {
1098         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1099             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1100                 return &whpx->breakpoints.breakpoints->data[i];
1101             }
1102         }
1103     }
1104 
1105     return NULL;
1106 }
1107 
/*
 * Opcode byte written into guest memory to implement a breakpoint.
 *
 * Linux executes int3 (0xCC) during startup (see int3_selftest()) and when
 * debugging user-mode applications. The WHPX API offers no easy way to
 * hand an intercepted exception back to the guest, so INT1 is used for
 * QEMU's own breakpoints and INT3 is always left to the guest.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;
1115 
1116 /*
1117  * The WHPX QEMU backend implements breakpoints by writing the INT1
1118  * instruction into memory (ignoring the DRx registers). This raises a few
1119  * issues that need to be carefully handled:
1120  *
1121  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1122  *    at the same location, and later remove them in arbitrary order.
1123  *    This should not cause memory corruption, and should only remove the
1124  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1125  *
1126  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1127  *    physical location. Hence, physically adding/removing a breakpoint can
1128  *    theoretically fail at any time. We need to keep track of it.
1129  *
1130  * The function below rebuilds a list of low-level breakpoints (one per
1131  * address, tracking the original instruction and any errors) from the list of
1132  * high-level breakpoints (set via cpu_breakpoint_insert()).
1133  *
1134  * In order to optimize performance, this function stores the list of
1135  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1136  * low-level ones, so that it won't be re-invoked until these breakpoints
1137  * change.
1138  *
1139  * Note that this function decides which breakpoints should be inserted into,
1140  * memory, but doesn't actually do it. The memory accessing is done in
1141  * whpx_apply_breakpoints().
1142  */
1143 static void whpx_translate_cpu_breakpoints(
1144     struct whpx_breakpoints *breakpoints,
1145     CPUState *cpu,
1146     int cpu_breakpoint_count)
1147 {
1148     CPUBreakpoint *bp;
1149     int cpu_bp_index = 0;
1150 
1151     breakpoints->original_addresses =
1152         g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
1153 
1154     breakpoints->original_address_count = cpu_breakpoint_count;
1155 
1156     int max_breakpoints = cpu_breakpoint_count +
1157         (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
1158 
1159     struct whpx_breakpoint_collection *new_breakpoints =
1160         (struct whpx_breakpoint_collection *)g_malloc0(
1161         sizeof(struct whpx_breakpoint_collection) +
1162             max_breakpoints * sizeof(struct whpx_breakpoint));
1163 
1164     new_breakpoints->allocated = max_breakpoints;
1165     new_breakpoints->used = 0;
1166 
1167     /*
1168      * 1. Preserve all old breakpoints that could not be automatically
1169      * cleared when the CPU got stopped.
1170      */
1171     if (breakpoints->breakpoints) {
1172         int i;
1173         for (i = 0; i < breakpoints->breakpoints->used; i++) {
1174             if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
1175                 new_breakpoints->data[new_breakpoints->used++] =
1176                     breakpoints->breakpoints->data[i];
1177             }
1178         }
1179     }
1180 
1181     /* 2. Map all CPU breakpoints to WHPX breakpoints */
1182     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1183         int i;
1184         bool found = false;
1185 
1186         /* This will be used to detect changed CPU breakpoints later. */
1187         breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
1188 
1189         for (i = 0; i < new_breakpoints->used; i++) {
1190             /*
1191              * WARNING: This loop has O(N^2) complexity, where N is the
1192              * number of breakpoints. It should not be a bottleneck in
1193              * real-world scenarios, since it only needs to run once after
1194              * the breakpoints have been modified.
1195              * If this ever becomes a concern, it can be optimized by storing
1196              * high-level breakpoint objects in a tree or hash map.
1197              */
1198 
1199             if (new_breakpoints->data[i].address == bp->pc) {
1200                 /* There was already a breakpoint at this address. */
1201                 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
1202                     new_breakpoints->data[i].state = WHPX_BP_SET;
1203                 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
1204                     new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
1205                 }
1206 
1207                 found = true;
1208                 break;
1209             }
1210         }
1211 
1212         if (!found && new_breakpoints->used < new_breakpoints->allocated) {
1213             /* No WHPX breakpoint at this address. Create one. */
1214             new_breakpoints->data[new_breakpoints->used].address = bp->pc;
1215             new_breakpoints->data[new_breakpoints->used].state =
1216                 WHPX_BP_SET_PENDING;
1217             new_breakpoints->used++;
1218         }
1219     }
1220 
1221     if (breakpoints->breakpoints) {
1222         /*
1223          * Free the previous breakpoint list. This can be optimized by keeping
1224          * it as shadow buffer for the next computation instead of freeing
1225          * it immediately.
1226          */
1227         g_free(breakpoints->breakpoints);
1228     }
1229 
1230     breakpoints->breakpoints = new_breakpoints;
1231 }
1232 
1233 /*
1234  * Physically inserts/removes the breakpoints by reading and writing the
1235  * physical memory, keeping a track of the failed attempts.
1236  *
1237  * Passing resuming=true  will try to set all previously unset breakpoints.
1238  * Passing resuming=false will remove all inserted ones.
1239  */
1240 static void whpx_apply_breakpoints(
1241     struct whpx_breakpoint_collection *breakpoints,
1242     CPUState *cpu,
1243     bool resuming)
1244 {
1245     int i, rc;
1246     if (!breakpoints) {
1247         return;
1248     }
1249 
1250     for (i = 0; i < breakpoints->used; i++) {
1251         /* Decide what to do right now based on the last known state. */
1252         WhpxBreakpointState state = breakpoints->data[i].state;
1253         switch (state) {
1254         case WHPX_BP_CLEARED:
1255             if (resuming) {
1256                 state = WHPX_BP_SET_PENDING;
1257             }
1258             break;
1259         case WHPX_BP_SET_PENDING:
1260             if (!resuming) {
1261                 state = WHPX_BP_CLEARED;
1262             }
1263             break;
1264         case WHPX_BP_SET:
1265             if (!resuming) {
1266                 state = WHPX_BP_CLEAR_PENDING;
1267             }
1268             break;
1269         case WHPX_BP_CLEAR_PENDING:
1270             if (resuming) {
1271                 state = WHPX_BP_SET;
1272             }
1273             break;
1274         }
1275 
1276         if (state == WHPX_BP_SET_PENDING) {
1277             /* Remember the original instruction. */
1278             rc = cpu_memory_rw_debug(cpu,
1279                 breakpoints->data[i].address,
1280                 &breakpoints->data[i].original_instruction,
1281                 1,
1282                 false);
1283 
1284             if (!rc) {
1285                 /* Write the breakpoint instruction. */
1286                 rc = cpu_memory_rw_debug(cpu,
1287                     breakpoints->data[i].address,
1288                     (void *)&whpx_breakpoint_instruction,
1289                     1,
1290                     true);
1291             }
1292 
1293             if (!rc) {
1294                 state = WHPX_BP_SET;
1295             }
1296 
1297         }
1298 
1299         if (state == WHPX_BP_CLEAR_PENDING) {
1300             /* Restore the original instruction. */
1301             rc = cpu_memory_rw_debug(cpu,
1302                 breakpoints->data[i].address,
1303                 &breakpoints->data[i].original_instruction,
1304                 1,
1305                 true);
1306 
1307             if (!rc) {
1308                 state = WHPX_BP_CLEARED;
1309             }
1310         }
1311 
1312         breakpoints->data[i].state = state;
1313     }
1314 }
1315 
1316 /*
1317  * This function is called when the a VCPU is about to start and no other
1318  * VCPUs have been started so far. Since the VCPU start order could be
1319  * arbitrary, it doesn't have to be VCPU#0.
1320  *
1321  * It is used to commit the breakpoints into memory, and configure WHPX
1322  * to intercept debug exceptions.
1323  *
1324  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1325  * more VCPUs are already running, so this is the best place to do it.
1326  */
1327 static int whpx_first_vcpu_starting(CPUState *cpu)
1328 {
1329     struct whpx_state *whpx = &whpx_global;
1330     HRESULT hr;
1331 
1332     g_assert(qemu_mutex_iothread_locked());
1333 
1334     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1335             (whpx->breakpoints.breakpoints &&
1336              whpx->breakpoints.breakpoints->used)) {
1337         CPUBreakpoint *bp;
1338         int i = 0;
1339         bool update_pending = false;
1340 
1341         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1342             if (i >= whpx->breakpoints.original_address_count ||
1343                 bp->pc != whpx->breakpoints.original_addresses[i]) {
1344                 update_pending = true;
1345             }
1346 
1347             i++;
1348         }
1349 
1350         if (i != whpx->breakpoints.original_address_count) {
1351             update_pending = true;
1352         }
1353 
1354         if (update_pending) {
1355             /*
1356              * The CPU breakpoints have changed since the last call to
1357              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1358              * now be recomputed.
1359              */
1360             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1361         }
1362 
1363         /* Actually insert the breakpoints into the memory. */
1364         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1365     }
1366 
1367     uint64_t exception_mask;
1368     if (whpx->step_pending ||
1369         (whpx->breakpoints.breakpoints &&
1370          whpx->breakpoints.breakpoints->used)) {
1371         /*
1372          * We are either attempting to single-step one or more CPUs, or
1373          * have one or more breakpoints enabled. Both require intercepting
1374          * the WHvX64ExceptionTypeBreakpointTrap exception.
1375          */
1376 
1377         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1378     } else {
1379         /* Let the guest handle all exceptions. */
1380         exception_mask = 0;
1381     }
1382 
1383     hr = whpx_set_exception_exit_bitmap(exception_mask);
1384     if (!SUCCEEDED(hr)) {
1385         error_report("WHPX: Failed to update exception exit mask,"
1386                      "hr=%08lx.", hr);
1387         return 1;
1388     }
1389 
1390     return 0;
1391 }
1392 
1393 /*
1394  * This function is called when the last VCPU has finished running.
1395  * It is used to remove any previously set breakpoints from memory.
1396  */
1397 static int whpx_last_vcpu_stopping(CPUState *cpu)
1398 {
1399     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1400     return 0;
1401 }
1402 
1403 /* Returns the address of the next instruction that is about to be executed. */
1404 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1405 {
1406     if (cpu->vcpu_dirty) {
1407         /* The CPU registers have been modified by other parts of QEMU. */
1408         CPUArchState *env = (CPUArchState *)(cpu->env_ptr);
1409         return env->eip;
1410     } else if (exit_context_valid) {
1411         /*
1412          * The CPU registers have not been modified by neither other parts
1413          * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1414          * This is the most common case.
1415          */
1416         struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1417         return vcpu->exit_ctx.VpContext.Rip;
1418     } else {
1419         /*
1420          * The CPU registers have been modified by a call to
1421          * WHvSetVirtualProcessorRegisters() and must be re-queried from
1422          * the target.
1423          */
1424         WHV_REGISTER_VALUE reg_value;
1425         WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1426         HRESULT hr;
1427         struct whpx_state *whpx = &whpx_global;
1428 
1429         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1430             whpx->partition,
1431             cpu->cpu_index,
1432             &reg_name,
1433             1,
1434             &reg_value);
1435 
1436         if (FAILED(hr)) {
1437             error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1438             return 0;
1439         }
1440 
1441         return reg_value.Reg64;
1442     }
1443 }
1444 
1445 static int whpx_handle_halt(CPUState *cpu)
1446 {
1447     CPUX86State *env = cpu->env_ptr;
1448     int ret = 0;
1449 
1450     qemu_mutex_lock_iothread();
1451     if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1452           (env->eflags & IF_MASK)) &&
1453         !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1454         cpu->exception_index = EXCP_HLT;
1455         cpu->halted = true;
1456         ret = 1;
1457     }
1458     qemu_mutex_unlock_iothread();
1459 
1460     return ret;
1461 }
1462 
1463 static void whpx_vcpu_pre_run(CPUState *cpu)
1464 {
1465     HRESULT hr;
1466     struct whpx_state *whpx = &whpx_global;
1467     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1468     CPUX86State *env = cpu->env_ptr;
1469     X86CPU *x86_cpu = X86_CPU(cpu);
1470     int irq;
1471     uint8_t tpr;
1472     WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
1473     UINT32 reg_count = 0;
1474     WHV_REGISTER_VALUE reg_values[3];
1475     WHV_REGISTER_NAME reg_names[3];
1476 
1477     memset(&new_int, 0, sizeof(new_int));
1478     memset(reg_values, 0, sizeof(reg_values));
1479 
1480     qemu_mutex_lock_iothread();
1481 
1482     /* Inject NMI */
1483     if (!vcpu->interruption_pending &&
1484         cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
1485         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
1486             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
1487             vcpu->interruptable = false;
1488             new_int.InterruptionType = WHvX64PendingNmi;
1489             new_int.InterruptionPending = 1;
1490             new_int.InterruptionVector = 2;
1491         }
1492         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
1493             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
1494         }
1495     }
1496 
1497     /*
1498      * Force the VCPU out of its inner loop to process any INIT requests or
1499      * commit pending TPR access.
1500      */
1501     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
1502         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1503             !(env->hflags & HF_SMM_MASK)) {
1504             cpu->exit_request = 1;
1505         }
1506         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1507             cpu->exit_request = 1;
1508         }
1509     }
1510 
1511     /* Get pending hard interruption or replay one that was overwritten */
1512     if (!whpx_apic_in_platform()) {
1513         if (!vcpu->interruption_pending &&
1514             vcpu->interruptable && (env->eflags & IF_MASK)) {
1515             assert(!new_int.InterruptionPending);
1516             if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1517                 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1518                 irq = cpu_get_pic_interrupt(env);
1519                 if (irq >= 0) {
1520                     new_int.InterruptionType = WHvX64PendingInterrupt;
1521                     new_int.InterruptionPending = 1;
1522                     new_int.InterruptionVector = irq;
1523                 }
1524             }
1525         }
1526 
1527         /* Setup interrupt state if new one was prepared */
1528         if (new_int.InterruptionPending) {
1529             reg_values[reg_count].PendingInterruption = new_int;
1530             reg_names[reg_count] = WHvRegisterPendingInterruption;
1531             reg_count += 1;
1532         }
1533     } else if (vcpu->ready_for_pic_interrupt &&
1534                (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
1535         cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1536         irq = cpu_get_pic_interrupt(env);
1537         if (irq >= 0) {
1538             reg_names[reg_count] = WHvRegisterPendingEvent;
1539             reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
1540             {
1541                 .EventPending = 1,
1542                 .EventType = WHvX64PendingEventExtInt,
1543                 .Vector = irq,
1544             };
1545             reg_count += 1;
1546         }
1547      }
1548 
1549     /* Sync the TPR to the CR8 if was modified during the intercept */
1550     tpr = cpu_get_apic_tpr(x86_cpu->apic_state);
1551     if (tpr != vcpu->tpr) {
1552         vcpu->tpr = tpr;
1553         reg_values[reg_count].Reg64 = tpr;
1554         cpu->exit_request = 1;
1555         reg_names[reg_count] = WHvX64RegisterCr8;
1556         reg_count += 1;
1557     }
1558 
1559     /* Update the state of the interrupt delivery notification */
1560     if (!vcpu->window_registered &&
1561         cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1562         reg_values[reg_count].DeliverabilityNotifications =
1563             (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
1564                 .InterruptNotification = 1
1565             };
1566         vcpu->window_registered = 1;
1567         reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
1568         reg_count += 1;
1569     }
1570 
1571     qemu_mutex_unlock_iothread();
1572     vcpu->ready_for_pic_interrupt = false;
1573 
1574     if (reg_count) {
1575         hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1576             whpx->partition, cpu->cpu_index,
1577             reg_names, reg_count, reg_values);
1578         if (FAILED(hr)) {
1579             error_report("WHPX: Failed to set interrupt state registers,"
1580                          " hr=%08lx", hr);
1581         }
1582     }
1583 
1584     return;
1585 }
1586 
1587 static void whpx_vcpu_post_run(CPUState *cpu)
1588 {
1589     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1590     CPUX86State *env = cpu->env_ptr;
1591     X86CPU *x86_cpu = X86_CPU(cpu);
1592 
1593     env->eflags = vcpu->exit_ctx.VpContext.Rflags;
1594 
1595     uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
1596     if (vcpu->tpr != tpr) {
1597         vcpu->tpr = tpr;
1598         qemu_mutex_lock_iothread();
1599         cpu_set_apic_tpr(x86_cpu->apic_state, vcpu->tpr);
1600         qemu_mutex_unlock_iothread();
1601     }
1602 
1603     vcpu->interruption_pending =
1604         vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
1605 
1606     vcpu->interruptable =
1607         !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
1608 
1609     return;
1610 }
1611 
1612 static void whpx_vcpu_process_async_events(CPUState *cpu)
1613 {
1614     CPUX86State *env = cpu->env_ptr;
1615     X86CPU *x86_cpu = X86_CPU(cpu);
1616     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1617 
1618     if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1619         !(env->hflags & HF_SMM_MASK)) {
1620         whpx_cpu_synchronize_state(cpu);
1621         do_cpu_init(x86_cpu);
1622         vcpu->interruptable = true;
1623     }
1624 
1625     if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
1626         cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
1627         apic_poll_irq(x86_cpu->apic_state);
1628     }
1629 
1630     if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1631          (env->eflags & IF_MASK)) ||
1632         (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1633         cpu->halted = false;
1634     }
1635 
1636     if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
1637         whpx_cpu_synchronize_state(cpu);
1638         do_cpu_sipi(x86_cpu);
1639     }
1640 
1641     if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1642         cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
1643         whpx_cpu_synchronize_state(cpu);
1644         apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
1645                                       env->tpr_access_type);
1646     }
1647 
1648     return;
1649 }
1650 
1651 static int whpx_vcpu_run(CPUState *cpu)
1652 {
1653     HRESULT hr;
1654     struct whpx_state *whpx = &whpx_global;
1655     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1656     struct whpx_breakpoint *stepped_over_bp = NULL;
1657     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1658     int ret;
1659 
1660     g_assert(qemu_mutex_iothread_locked());
1661 
1662     if (whpx->running_cpus++ == 0) {
1663         /* Insert breakpoints into memory, update exception exit bitmap. */
1664         ret = whpx_first_vcpu_starting(cpu);
1665         if (ret != 0) {
1666             return ret;
1667         }
1668     }
1669 
1670     if (whpx->breakpoints.breakpoints &&
1671         whpx->breakpoints.breakpoints->used > 0)
1672     {
1673         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1674         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1675         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1676             stepped_over_bp = NULL;
1677         }
1678 
1679         if (stepped_over_bp) {
1680             /*
1681              * We are trying to run the instruction overwritten by an active
1682              * breakpoint. We will temporarily disable the breakpoint, suspend
1683              * other CPUs, and step over the instruction.
1684              */
1685             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1686         }
1687     }
1688 
1689     if (exclusive_step_mode == WHPX_STEP_NONE) {
1690         whpx_vcpu_process_async_events(cpu);
1691         if (cpu->halted && !whpx_apic_in_platform()) {
1692             cpu->exception_index = EXCP_HLT;
1693             qatomic_set(&cpu->exit_request, false);
1694             return 0;
1695         }
1696     }
1697 
1698     qemu_mutex_unlock_iothread();
1699 
1700     if (exclusive_step_mode != WHPX_STEP_NONE) {
1701         start_exclusive();
1702         g_assert(cpu == current_cpu);
1703         g_assert(!cpu->running);
1704         cpu->running = true;
1705 
1706         hr = whpx_set_exception_exit_bitmap(
1707             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1708         if (!SUCCEEDED(hr)) {
1709             error_report("WHPX: Failed to update exception exit mask, "
1710                          "hr=%08lx.", hr);
1711             return 1;
1712         }
1713 
1714         if (stepped_over_bp) {
1715             /* Temporarily disable the triggered breakpoint. */
1716             cpu_memory_rw_debug(cpu,
1717                 stepped_over_bp->address,
1718                 &stepped_over_bp->original_instruction,
1719                 1,
1720                 true);
1721         }
1722     } else {
1723         cpu_exec_start(cpu);
1724     }
1725 
1726     do {
1727         if (cpu->vcpu_dirty) {
1728             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1729             cpu->vcpu_dirty = false;
1730         }
1731 
1732         if (exclusive_step_mode == WHPX_STEP_NONE) {
1733             whpx_vcpu_pre_run(cpu);
1734 
1735             if (qatomic_read(&cpu->exit_request)) {
1736                 whpx_vcpu_kick(cpu);
1737             }
1738         }
1739 
1740         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1741             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1742         }
1743 
1744         hr = whp_dispatch.WHvRunVirtualProcessor(
1745             whpx->partition, cpu->cpu_index,
1746             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1747 
1748         if (FAILED(hr)) {
1749             error_report("WHPX: Failed to exec a virtual processor,"
1750                          " hr=%08lx", hr);
1751             ret = -1;
1752             break;
1753         }
1754 
1755         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1756             whpx_vcpu_configure_single_stepping(cpu,
1757                 false,
1758                 &vcpu->exit_ctx.VpContext.Rflags);
1759         }
1760 
1761         whpx_vcpu_post_run(cpu);
1762 
1763         switch (vcpu->exit_ctx.ExitReason) {
1764         case WHvRunVpExitReasonMemoryAccess:
1765             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1766             break;
1767 
1768         case WHvRunVpExitReasonX64IoPortAccess:
1769             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1770             break;
1771 
1772         case WHvRunVpExitReasonX64InterruptWindow:
1773             vcpu->ready_for_pic_interrupt = 1;
1774             vcpu->window_registered = 0;
1775             ret = 0;
1776             break;
1777 
1778         case WHvRunVpExitReasonX64ApicEoi:
1779             assert(whpx_apic_in_platform());
1780             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1781             break;
1782 
1783         case WHvRunVpExitReasonX64Halt:
1784             /*
1785              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1786              * longer used.
1787              */
1788             ret = whpx_handle_halt(cpu);
1789             break;
1790 
1791         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1792             WHV_INTERRUPT_CONTROL ipi = {0};
1793             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1794             uint32_t delivery_mode =
1795                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1796             int dest_shorthand =
1797                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1798             bool broadcast = false;
1799             bool include_self = false;
1800             uint32_t i;
1801 
1802             /* We only registered for INIT and SIPI exits. */
1803             if ((delivery_mode != APIC_DM_INIT) &&
1804                 (delivery_mode != APIC_DM_SIPI)) {
1805                 error_report(
1806                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1807                 break;
1808             }
1809 
1810             if (delivery_mode == APIC_DM_INIT) {
1811                 ipi.Type = WHvX64InterruptTypeInit;
1812             } else {
1813                 ipi.Type = WHvX64InterruptTypeSipi;
1814             }
1815 
1816             ipi.DestinationMode =
1817                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1818                     WHvX64InterruptDestinationModeLogical :
1819                     WHvX64InterruptDestinationModePhysical;
1820 
1821             ipi.TriggerMode =
1822                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1823                     WHvX64InterruptTriggerModeLevel :
1824                     WHvX64InterruptTriggerModeEdge;
1825 
1826             ipi.Vector = icr & APIC_VECTOR_MASK;
1827             switch (dest_shorthand) {
1828             /* no shorthand. Bits 56-63 contain the destination. */
1829             case 0:
1830                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1831                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1832                         &ipi, sizeof(ipi));
1833                 if (FAILED(hr)) {
1834                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1835                         hr);
1836                 }
1837 
1838                 break;
1839 
1840             /* self */
1841             case 1:
1842                 include_self = true;
1843                 break;
1844 
1845             /* broadcast, including self */
1846             case 2:
1847                 broadcast = true;
1848                 include_self = true;
1849                 break;
1850 
1851             /* broadcast, excluding self */
1852             case 3:
1853                 broadcast = true;
1854                 break;
1855             }
1856 
1857             if (!broadcast && !include_self) {
1858                 break;
1859             }
1860 
1861             for (i = 0; i <= max_vcpu_index; i++) {
1862                 if (i == cpu->cpu_index && !include_self) {
1863                     continue;
1864                 }
1865 
1866                 /*
1867                  * Assuming that APIC Ids are identity mapped since
1868                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1869                  * are not handled yet and the hypervisor doesn't allow the
1870                  * guest to modify the APIC ID.
1871                  */
1872                 ipi.Destination = i;
1873                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1874                         &ipi, sizeof(ipi));
1875                 if (FAILED(hr)) {
1876                     error_report(
1877                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1878                         i, hr);
1879                 }
1880             }
1881 
1882             break;
1883         }
1884 
1885         case WHvRunVpExitReasonCanceled:
1886             if (exclusive_step_mode != WHPX_STEP_NONE) {
1887                 /*
1888                  * We are trying to step over a single instruction, and
1889                  * likely got a request to stop from another thread.
1890                  * Delay it until we are done stepping
1891                  * over.
1892                  */
1893                 ret = 0;
1894             } else {
1895                 cpu->exception_index = EXCP_INTERRUPT;
1896                 ret = 1;
1897             }
1898             break;
1899         case WHvRunVpExitReasonX64MsrAccess: {
1900             WHV_REGISTER_VALUE reg_values[3] = {0};
1901             WHV_REGISTER_NAME reg_names[3];
1902             UINT32 reg_count;
1903 
1904             reg_names[0] = WHvX64RegisterRip;
1905             reg_names[1] = WHvX64RegisterRax;
1906             reg_names[2] = WHvX64RegisterRdx;
1907 
1908             reg_values[0].Reg64 =
1909                 vcpu->exit_ctx.VpContext.Rip +
1910                 vcpu->exit_ctx.VpContext.InstructionLength;
1911 
1912             /*
1913              * For all unsupported MSR access we:
1914              *     ignore writes
1915              *     return 0 on read.
1916              */
1917             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1918                         1 : 3;
1919 
1920             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1921                 whpx->partition,
1922                 cpu->cpu_index,
1923                 reg_names, reg_count,
1924                 reg_values);
1925 
1926             if (FAILED(hr)) {
1927                 error_report("WHPX: Failed to set MsrAccess state "
1928                              " registers, hr=%08lx", hr);
1929             }
1930             ret = 0;
1931             break;
1932         }
1933         case WHvRunVpExitReasonX64Cpuid: {
1934             WHV_REGISTER_VALUE reg_values[5];
1935             WHV_REGISTER_NAME reg_names[5];
1936             UINT32 reg_count = 5;
1937             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1938             X86CPU *x86_cpu = X86_CPU(cpu);
1939             CPUX86State *env = &x86_cpu->env;
1940 
1941             memset(reg_values, 0, sizeof(reg_values));
1942 
1943             rip = vcpu->exit_ctx.VpContext.Rip +
1944                   vcpu->exit_ctx.VpContext.InstructionLength;
1945             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1946 
1947             /*
1948              * Ideally, these should be supplied to the hypervisor during VCPU
1949              * initialization and it should be able to satisfy this request.
1950              * But, currently, WHPX doesn't support setting CPUID values in the
1951              * hypervisor once the partition has been setup, which is too late
1952              * since VCPUs are realized later. For now, use the values from
1953              * QEMU to satisfy these requests, until WHPX adds support for
1954              * being able to set these values in the hypervisor at runtime.
1955              */
1956             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1957                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1958             switch (cpuid_fn) {
1959             case 0x40000000:
1960                 /* Expose the vmware cpu frequency cpuid leaf */
1961                 rax = 0x40000010;
1962                 rbx = rcx = rdx = 0;
1963                 break;
1964 
1965             case 0x40000010:
1966                 rax = env->tsc_khz;
1967                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1968                 rcx = rdx = 0;
1969                 break;
1970 
1971             case 0x80000001:
1972                 /* Remove any support of OSVW */
1973                 rcx &= ~CPUID_EXT3_OSVW;
1974                 break;
1975             }
1976 
1977             reg_names[0] = WHvX64RegisterRip;
1978             reg_names[1] = WHvX64RegisterRax;
1979             reg_names[2] = WHvX64RegisterRcx;
1980             reg_names[3] = WHvX64RegisterRdx;
1981             reg_names[4] = WHvX64RegisterRbx;
1982 
1983             reg_values[0].Reg64 = rip;
1984             reg_values[1].Reg64 = rax;
1985             reg_values[2].Reg64 = rcx;
1986             reg_values[3].Reg64 = rdx;
1987             reg_values[4].Reg64 = rbx;
1988 
1989             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1990                 whpx->partition, cpu->cpu_index,
1991                 reg_names,
1992                 reg_count,
1993                 reg_values);
1994 
1995             if (FAILED(hr)) {
1996                 error_report("WHPX: Failed to set CpuidAccess state registers,"
1997                              " hr=%08lx", hr);
1998             }
1999             ret = 0;
2000             break;
2001         }
2002         case WHvRunVpExitReasonException:
2003             whpx_get_registers(cpu);
2004 
2005             if ((vcpu->exit_ctx.VpException.ExceptionType ==
2006                  WHvX64ExceptionTypeDebugTrapOrFault) &&
2007                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
2008                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
2009                  whpx_breakpoint_instruction)) {
2010                 /* Stopped at a software breakpoint. */
2011                 cpu->exception_index = EXCP_DEBUG;
2012             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
2013                         WHvX64ExceptionTypeDebugTrapOrFault) &&
2014                        !cpu->singlestep_enabled) {
2015                 /*
2016                  * Just finished stepping over a breakpoint, but the
2017                  * gdb does not expect us to do single-stepping.
2018                  * Don't do anything special.
2019                  */
2020                 cpu->exception_index = EXCP_INTERRUPT;
2021             } else {
2022                 /* Another exception or debug event. Report it to GDB. */
2023                 cpu->exception_index = EXCP_DEBUG;
2024             }
2025 
2026             ret = 1;
2027             break;
2028         case WHvRunVpExitReasonNone:
2029         case WHvRunVpExitReasonUnrecoverableException:
2030         case WHvRunVpExitReasonInvalidVpRegisterValue:
2031         case WHvRunVpExitReasonUnsupportedFeature:
2032         default:
2033             error_report("WHPX: Unexpected VP exit code %d",
2034                          vcpu->exit_ctx.ExitReason);
2035             whpx_get_registers(cpu);
2036             qemu_mutex_lock_iothread();
2037             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2038             qemu_mutex_unlock_iothread();
2039             break;
2040         }
2041 
2042     } while (!ret);
2043 
2044     if (stepped_over_bp) {
2045         /* Restore the breakpoint we stepped over */
2046         cpu_memory_rw_debug(cpu,
2047             stepped_over_bp->address,
2048             (void *)&whpx_breakpoint_instruction,
2049             1,
2050             true);
2051     }
2052 
2053     if (exclusive_step_mode != WHPX_STEP_NONE) {
2054         g_assert(cpu_in_exclusive_context(cpu));
2055         cpu->running = false;
2056         end_exclusive();
2057 
2058         exclusive_step_mode = WHPX_STEP_NONE;
2059     } else {
2060         cpu_exec_end(cpu);
2061     }
2062 
2063     qemu_mutex_lock_iothread();
2064     current_cpu = cpu;
2065 
2066     if (--whpx->running_cpus == 0) {
2067         whpx_last_vcpu_stopping(cpu);
2068     }
2069 
2070     qatomic_set(&cpu->exit_request, false);
2071 
2072     return ret < 0;
2073 }
2074 
2075 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2076 {
2077     if (!cpu->vcpu_dirty) {
2078         whpx_get_registers(cpu);
2079         cpu->vcpu_dirty = true;
2080     }
2081 }
2082 
2083 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
2084                                                run_on_cpu_data arg)
2085 {
2086     whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
2087     cpu->vcpu_dirty = false;
2088 }
2089 
2090 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
2091                                               run_on_cpu_data arg)
2092 {
2093     whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
2094     cpu->vcpu_dirty = false;
2095 }
2096 
2097 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
2098                                                run_on_cpu_data arg)
2099 {
2100     cpu->vcpu_dirty = true;
2101 }
2102 
2103 /*
2104  * CPU support.
2105  */
2106 
2107 void whpx_cpu_synchronize_state(CPUState *cpu)
2108 {
2109     if (!cpu->vcpu_dirty) {
2110         run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
2111     }
2112 }
2113 
2114 void whpx_cpu_synchronize_post_reset(CPUState *cpu)
2115 {
2116     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2117 }
2118 
2119 void whpx_cpu_synchronize_post_init(CPUState *cpu)
2120 {
2121     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2122 }
2123 
2124 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
2125 {
2126     run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2127 }
2128 
2129 void whpx_cpu_synchronize_pre_resume(bool step_pending)
2130 {
2131     whpx_global.step_pending = step_pending;
2132 }
2133 
2134 /*
2135  * Vcpu support.
2136  */
2137 
2138 static Error *whpx_migration_blocker;
2139 
2140 static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
2141 {
2142     CPUX86State *env = opaque;
2143 
2144     if (running) {
2145         env->tsc_valid = false;
2146     }
2147 }
2148 
2149 int whpx_init_vcpu(CPUState *cpu)
2150 {
2151     HRESULT hr;
2152     struct whpx_state *whpx = &whpx_global;
2153     struct whpx_vcpu *vcpu = NULL;
2154     Error *local_error = NULL;
2155     CPUX86State *env = cpu->env_ptr;
2156     X86CPU *x86_cpu = X86_CPU(cpu);
2157     UINT64 freq = 0;
2158     int ret;
2159 
2160     /* Add migration blockers for all unsupported features of the
2161      * Windows Hypervisor Platform
2162      */
2163     if (whpx_migration_blocker == NULL) {
2164         error_setg(&whpx_migration_blocker,
2165                "State blocked due to non-migratable CPUID feature support,"
2166                "dirty memory tracking support, and XSAVE/XRSTOR support");
2167 
2168         if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) {
2169             error_report_err(local_error);
2170             error_free(whpx_migration_blocker);
2171             ret = -EINVAL;
2172             goto error;
2173         }
2174     }
2175 
2176     vcpu = g_new0(struct whpx_vcpu, 1);
2177 
2178     if (!vcpu) {
2179         error_report("WHPX: Failed to allocte VCPU context.");
2180         ret = -ENOMEM;
2181         goto error;
2182     }
2183 
2184     hr = whp_dispatch.WHvEmulatorCreateEmulator(
2185         &whpx_emu_callbacks,
2186         &vcpu->emulator);
2187     if (FAILED(hr)) {
2188         error_report("WHPX: Failed to setup instruction completion support,"
2189                      " hr=%08lx", hr);
2190         ret = -EINVAL;
2191         goto error;
2192     }
2193 
2194     hr = whp_dispatch.WHvCreateVirtualProcessor(
2195         whpx->partition, cpu->cpu_index, 0);
2196     if (FAILED(hr)) {
2197         error_report("WHPX: Failed to create a virtual processor,"
2198                      " hr=%08lx", hr);
2199         whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2200         ret = -EINVAL;
2201         goto error;
2202     }
2203 
2204     /*
2205      * vcpu's TSC frequency is either specified by user, or use the value
2206      * provided by Hyper-V if the former is not present. In the latter case, we
2207      * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
2208      * frequency can be migrated later via this field.
2209      */
2210     if (!env->tsc_khz) {
2211         hr = whp_dispatch.WHvGetCapability(
2212             WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
2213                 NULL);
2214         if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2215             if (FAILED(hr)) {
2216                 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
2217             } else {
2218                 env->tsc_khz = freq / 1000; /* Hz to KHz */
2219             }
2220         }
2221     }
2222 
2223     env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
2224     hr = whp_dispatch.WHvGetCapability(
2225         WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
2226     if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2227         if (FAILED(hr)) {
2228             printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
2229         } else {
2230             env->apic_bus_freq = freq;
2231         }
2232     }
2233 
2234     /*
2235      * If the vmware cpuid frequency leaf option is set, and we have a valid
2236      * tsc value, trap the corresponding cpuid's.
2237      */
2238     if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
2239         UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
2240 
2241         hr = whp_dispatch.WHvSetPartitionProperty(
2242                 whpx->partition,
2243                 WHvPartitionPropertyCodeCpuidExitList,
2244                 cpuidExitList,
2245                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2246 
2247         if (FAILED(hr)) {
2248             error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2249                         hr);
2250             ret = -EINVAL;
2251             goto error;
2252         }
2253     }
2254 
2255     vcpu->interruptable = true;
2256     cpu->vcpu_dirty = true;
2257     cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu;
2258     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
2259     qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr);
2260 
2261     return 0;
2262 
2263 error:
2264     g_free(vcpu);
2265 
2266     return ret;
2267 }
2268 
2269 int whpx_vcpu_exec(CPUState *cpu)
2270 {
2271     int ret;
2272     int fatal;
2273 
2274     for (;;) {
2275         if (cpu->exception_index >= EXCP_INTERRUPT) {
2276             ret = cpu->exception_index;
2277             cpu->exception_index = -1;
2278             break;
2279         }
2280 
2281         fatal = whpx_vcpu_run(cpu);
2282 
2283         if (fatal) {
2284             error_report("WHPX: Failed to exec a virtual processor");
2285             abort();
2286         }
2287     }
2288 
2289     return ret;
2290 }
2291 
2292 void whpx_destroy_vcpu(CPUState *cpu)
2293 {
2294     struct whpx_state *whpx = &whpx_global;
2295     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
2296 
2297     whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
2298     whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2299     g_free(cpu->hax_vcpu);
2300     return;
2301 }
2302 
2303 void whpx_vcpu_kick(CPUState *cpu)
2304 {
2305     struct whpx_state *whpx = &whpx_global;
2306     whp_dispatch.WHvCancelRunVirtualProcessor(
2307         whpx->partition, cpu->cpu_index, 0);
2308 }
2309 
2310 /*
2311  * Memory support.
2312  */
2313 
2314 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
2315                                 void *host_va, int add, int rom,
2316                                 const char *name)
2317 {
2318     struct whpx_state *whpx = &whpx_global;
2319     HRESULT hr;
2320 
2321     /*
2322     if (add) {
2323         printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2324                (void*)start_pa, (void*)size, host_va,
2325                (rom ? "ROM" : "RAM"), name);
2326     } else {
2327         printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
2328                (void*)start_pa, (void*)size, host_va, name);
2329     }
2330     */
2331 
2332     if (add) {
2333         hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
2334                                          host_va,
2335                                          start_pa,
2336                                          size,
2337                                          (WHvMapGpaRangeFlagRead |
2338                                           WHvMapGpaRangeFlagExecute |
2339                                           (rom ? 0 : WHvMapGpaRangeFlagWrite)));
2340     } else {
2341         hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
2342                                            start_pa,
2343                                            size);
2344     }
2345 
2346     if (FAILED(hr)) {
2347         error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2348                      " Host:%p, hr=%08lx",
2349                      (add ? "MAP" : "UNMAP"), name,
2350                      (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
2351     }
2352 }
2353 
2354 static void whpx_process_section(MemoryRegionSection *section, int add)
2355 {
2356     MemoryRegion *mr = section->mr;
2357     hwaddr start_pa = section->offset_within_address_space;
2358     ram_addr_t size = int128_get64(section->size);
2359     unsigned int delta;
2360     uint64_t host_va;
2361 
2362     if (!memory_region_is_ram(mr)) {
2363         return;
2364     }
2365 
2366     delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
2367     delta &= ~qemu_real_host_page_mask();
2368     if (delta > size) {
2369         return;
2370     }
2371     start_pa += delta;
2372     size -= delta;
2373     size &= qemu_real_host_page_mask();
2374     if (!size || (start_pa & ~qemu_real_host_page_mask())) {
2375         return;
2376     }
2377 
2378     host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
2379             + section->offset_within_region + delta;
2380 
2381     whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
2382                         memory_region_is_rom(mr), mr->name);
2383 }
2384 
2385 static void whpx_region_add(MemoryListener *listener,
2386                            MemoryRegionSection *section)
2387 {
2388     memory_region_ref(section->mr);
2389     whpx_process_section(section, 1);
2390 }
2391 
2392 static void whpx_region_del(MemoryListener *listener,
2393                            MemoryRegionSection *section)
2394 {
2395     whpx_process_section(section, 0);
2396     memory_region_unref(section->mr);
2397 }
2398 
2399 static void whpx_transaction_begin(MemoryListener *listener)
2400 {
2401 }
2402 
2403 static void whpx_transaction_commit(MemoryListener *listener)
2404 {
2405 }
2406 
2407 static void whpx_log_sync(MemoryListener *listener,
2408                          MemoryRegionSection *section)
2409 {
2410     MemoryRegion *mr = section->mr;
2411 
2412     if (!memory_region_is_ram(mr)) {
2413         return;
2414     }
2415 
2416     memory_region_set_dirty(mr, 0, int128_get64(section->size));
2417 }
2418 
2419 static MemoryListener whpx_memory_listener = {
2420     .name = "whpx",
2421     .begin = whpx_transaction_begin,
2422     .commit = whpx_transaction_commit,
2423     .region_add = whpx_region_add,
2424     .region_del = whpx_region_del,
2425     .log_sync = whpx_log_sync,
2426     .priority = 10,
2427 };
2428 
2429 static void whpx_memory_init(void)
2430 {
2431     memory_listener_register(&whpx_memory_listener, &address_space_memory);
2432 }
2433 
2434 /*
2435  * Load the functions from the given library, using the given handle. If a
2436  * handle is provided, it is used, otherwise the library is opened. The
2437  * handle will be updated on return with the opened one.
2438  */
2439 static bool load_whp_dispatch_fns(HMODULE *handle,
2440     WHPFunctionList function_list)
2441 {
2442     HMODULE hLib = *handle;
2443 
2444     #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
2445     #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
2446     #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
2447         whp_dispatch.function_name = \
2448             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2449 
2450     #define WHP_LOAD_FIELD(return_type, function_name, signature) \
2451         whp_dispatch.function_name = \
2452             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2453         if (!whp_dispatch.function_name) { \
2454             error_report("Could not load function %s", #function_name); \
2455             goto error; \
2456         } \
2457 
2458     #define WHP_LOAD_LIB(lib_name, handle_lib) \
2459     if (!handle_lib) { \
2460         handle_lib = LoadLibrary(lib_name); \
2461         if (!handle_lib) { \
2462             error_report("Could not load library %s.", lib_name); \
2463             goto error; \
2464         } \
2465     } \
2466 
2467     switch (function_list) {
2468     case WINHV_PLATFORM_FNS_DEFAULT:
2469         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2470         LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
2471         break;
2472 
2473     case WINHV_EMULATION_FNS_DEFAULT:
2474         WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
2475         LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
2476         break;
2477 
2478     case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
2479         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2480         LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
2481         break;
2482     }
2483 
2484     *handle = hLib;
2485     return true;
2486 
2487 error:
2488     if (hLib) {
2489         FreeLibrary(hLib);
2490     }
2491 
2492     return false;
2493 }
2494 
2495 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2496                                    const char *name, void *opaque,
2497                                    Error **errp)
2498 {
2499     struct whpx_state *whpx = &whpx_global;
2500     OnOffSplit mode;
2501 
2502     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2503         return;
2504     }
2505 
2506     switch (mode) {
2507     case ON_OFF_SPLIT_ON:
2508         whpx->kernel_irqchip_allowed = true;
2509         whpx->kernel_irqchip_required = true;
2510         break;
2511 
2512     case ON_OFF_SPLIT_OFF:
2513         whpx->kernel_irqchip_allowed = false;
2514         whpx->kernel_irqchip_required = false;
2515         break;
2516 
2517     case ON_OFF_SPLIT_SPLIT:
2518         error_setg(errp, "WHPX: split irqchip currently not supported");
2519         error_append_hint(errp,
2520             "Try without kernel-irqchip or with kernel-irqchip=on|off");
2521         break;
2522 
2523     default:
2524         /*
2525          * The value was checked in visit_type_OnOffSplit() above. If
2526          * we get here, then something is wrong in QEMU.
2527          */
2528         abort();
2529     }
2530 }
2531 
2532 /*
2533  * Partition support
2534  */
2535 
2536 static int whpx_accel_init(MachineState *ms)
2537 {
2538     struct whpx_state *whpx;
2539     int ret;
2540     HRESULT hr;
2541     WHV_CAPABILITY whpx_cap;
2542     UINT32 whpx_cap_size;
2543     WHV_PARTITION_PROPERTY prop;
2544     UINT32 cpuidExitList[] = {1, 0x80000001};
2545     WHV_CAPABILITY_FEATURES features = {0};
2546 
2547     whpx = &whpx_global;
2548 
2549     if (!init_whp_dispatch()) {
2550         ret = -ENOSYS;
2551         goto error;
2552     }
2553 
2554     whpx->mem_quota = ms->ram_size;
2555 
2556     hr = whp_dispatch.WHvGetCapability(
2557         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2558         sizeof(whpx_cap), &whpx_cap_size);
2559     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2560         error_report("WHPX: No accelerator found, hr=%08lx", hr);
2561         ret = -ENOSPC;
2562         goto error;
2563     }
2564 
2565     hr = whp_dispatch.WHvGetCapability(
2566         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2567     if (FAILED(hr)) {
2568         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2569         ret = -EINVAL;
2570         goto error;
2571     }
2572 
2573     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2574     if (FAILED(hr)) {
2575         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2576         ret = -EINVAL;
2577         goto error;
2578     }
2579 
2580     /*
2581      * Query the XSAVE capability of the partition. Any error here is not
2582      * considered fatal.
2583      */
2584     hr = whp_dispatch.WHvGetPartitionProperty(
2585         whpx->partition,
2586         WHvPartitionPropertyCodeProcessorXsaveFeatures,
2587         &whpx_xsave_cap,
2588         sizeof(whpx_xsave_cap),
2589         &whpx_cap_size);
2590 
2591     /*
2592      * Windows version which don't support this property will return with the
2593      * specific error code.
2594      */
2595     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2596         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2597     }
2598 
2599     if (!whpx_has_xsave()) {
2600         printf("WHPX: Partition is not XSAVE capable\n");
2601     }
2602 
2603     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2604     prop.ProcessorCount = ms->smp.cpus;
2605     hr = whp_dispatch.WHvSetPartitionProperty(
2606         whpx->partition,
2607         WHvPartitionPropertyCodeProcessorCount,
2608         &prop,
2609         sizeof(WHV_PARTITION_PROPERTY));
2610 
2611     if (FAILED(hr)) {
2612         error_report("WHPX: Failed to set partition core count to %d,"
2613                      " hr=%08lx", ms->smp.cores, hr);
2614         ret = -EINVAL;
2615         goto error;
2616     }
2617 
2618     /*
2619      * Error out if WHP doesn't support apic emulation and user is requiring
2620      * it.
2621      */
2622     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2623             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2624         error_report("WHPX: kernel irqchip requested, but unavailable. "
2625             "Try without kernel-irqchip or with kernel-irqchip=off");
2626         ret = -EINVAL;
2627         goto error;
2628     }
2629 
2630     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2631         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2632         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2633             WHvX64LocalApicEmulationModeXApic;
2634         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2635         hr = whp_dispatch.WHvSetPartitionProperty(
2636             whpx->partition,
2637             WHvPartitionPropertyCodeLocalApicEmulationMode,
2638             &mode,
2639             sizeof(mode));
2640         if (FAILED(hr)) {
2641             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2642             if (whpx->kernel_irqchip_required) {
2643                 error_report("WHPX: kernel irqchip requested, but unavailable");
2644                 ret = -EINVAL;
2645                 goto error;
2646             }
2647         } else {
2648             whpx->apic_in_platform = true;
2649         }
2650     }
2651 
2652     /* Register for MSR and CPUID exits */
2653     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2654     prop.ExtendedVmExits.X64MsrExit = 1;
2655     prop.ExtendedVmExits.X64CpuidExit = 1;
2656     prop.ExtendedVmExits.ExceptionExit = 1;
2657     if (whpx_apic_in_platform()) {
2658         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2659     }
2660 
2661     hr = whp_dispatch.WHvSetPartitionProperty(
2662             whpx->partition,
2663             WHvPartitionPropertyCodeExtendedVmExits,
2664             &prop,
2665             sizeof(WHV_PARTITION_PROPERTY));
2666     if (FAILED(hr)) {
2667         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2668         ret = -EINVAL;
2669         goto error;
2670     }
2671 
2672     hr = whp_dispatch.WHvSetPartitionProperty(
2673         whpx->partition,
2674         WHvPartitionPropertyCodeCpuidExitList,
2675         cpuidExitList,
2676         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2677 
2678     if (FAILED(hr)) {
2679         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2680                      hr);
2681         ret = -EINVAL;
2682         goto error;
2683     }
2684 
2685     /*
2686      * We do not want to intercept any exceptions from the guest,
2687      * until we actually start debugging with gdb.
2688      */
2689     whpx->exception_exit_bitmap = -1;
2690     hr = whpx_set_exception_exit_bitmap(0);
2691 
2692     if (FAILED(hr)) {
2693         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2694         ret = -EINVAL;
2695         goto error;
2696     }
2697 
2698     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2699     if (FAILED(hr)) {
2700         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2701         ret = -EINVAL;
2702         goto error;
2703     }
2704 
2705     whpx_memory_init();
2706 
2707     printf("Windows Hypervisor Platform accelerator is operational\n");
2708     return 0;
2709 
2710 error:
2711 
2712     if (NULL != whpx->partition) {
2713         whp_dispatch.WHvDeletePartition(whpx->partition);
2714         whpx->partition = NULL;
2715     }
2716 
2717     return ret;
2718 }
2719 
2720 int whpx_enabled(void)
2721 {
2722     return whpx_allowed;
2723 }
2724 
2725 bool whpx_apic_in_platform(void) {
2726     return whpx_global.apic_in_platform;
2727 }
2728 
2729 static void whpx_accel_class_init(ObjectClass *oc, void *data)
2730 {
2731     AccelClass *ac = ACCEL_CLASS(oc);
2732     ac->name = "WHPX";
2733     ac->init_machine = whpx_accel_init;
2734     ac->allowed = &whpx_allowed;
2735 
2736     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
2737         NULL, whpx_set_kernel_irqchip,
2738         NULL, NULL);
2739     object_class_property_set_description(oc, "kernel-irqchip",
2740         "Configure WHPX in-kernel irqchip");
2741 }
2742 
2743 static void whpx_accel_instance_init(Object *obj)
2744 {
2745     struct whpx_state *whpx = &whpx_global;
2746 
2747     memset(whpx, 0, sizeof(struct whpx_state));
2748     /* Turn on kernel-irqchip, by default */
2749     whpx->kernel_irqchip_allowed = true;
2750 }
2751 
2752 static const TypeInfo whpx_accel_type = {
2753     .name = ACCEL_CLASS_NAME("whpx"),
2754     .parent = TYPE_ACCEL,
2755     .instance_init = whpx_accel_instance_init,
2756     .class_init = whpx_accel_class_init,
2757 };
2758 
2759 static void whpx_type_init(void)
2760 {
2761     type_register_static(&whpx_accel_type);
2762 }
2763 
2764 bool init_whp_dispatch(void)
2765 {
2766     if (whp_dispatch_initialized) {
2767         return true;
2768     }
2769 
2770     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2771         goto error;
2772     }
2773 
2774     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2775         goto error;
2776     }
2777 
2778     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2779         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2780     whp_dispatch_initialized = true;
2781 
2782     return true;
2783 error:
2784     if (hWinHvPlatform) {
2785         FreeLibrary(hWinHvPlatform);
2786     }
2787 
2788     if (hWinHvEmulation) {
2789         FreeLibrary(hWinHvEmulation);
2790     }
2791 
2792     return false;
2793 }
2794 
/*
 * Pass whpx_type_init to the type_init() hook; whpx_type_init in turn
 * registers whpx_accel_type.
 * NOTE(review): presumably this runs during QEMU's module initialization
 * phase -- confirm against the type_init() definition.
 */
2795 type_init(whpx_type_init);
2796