xref: /openbmc/qemu/target/i386/whpx/whpx-all.c (revision 756a98dd)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "exec/address-spaces.h"
14 #include "exec/ioport.h"
15 #include "exec/gdbstub.h"
16 #include "qemu/accel.h"
17 #include "sysemu/whpx.h"
18 #include "sysemu/cpus.h"
19 #include "sysemu/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/i386/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include <winerror.h>
30 
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
33 
34 #include <WinHvPlatform.h>
35 #include <WinHvEmulation.h>
36 
37 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
38 
/*
 * Registers transferred between QEMU and the hypervisor as one batch by
 * whpx_set_registers() / whpx_get_registers().  The order of this table is
 * load-bearing: those functions walk it with a running index and assert on
 * whpx_register_names[idx] at each step, so any addition or reordering here
 * must be mirrored there (and in struct whpx_register_set, which is sized
 * from this table).
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};
146 
/*
 * Value buffer parallel to whpx_register_names[]: values[i] holds the
 * content of the register named by whpx_register_names[i].
 */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
150 
151 /*
152  * The current implementation of instruction stepping sets the TF flag
153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
155  *
156  * This approach has a few limitations:
157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158  *        along with the other flags, possibly restoring it later. It would
159  *        result in another INT1 when the flags are restored, triggering
160  *        a stop in gdb that could be cleared by doing another step.
161  *
162  *        Stepping over a POPF/LAHF instruction will let it overwrite the
163  *        TF flags, ending the stepping mode.
164  *
165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166  *        or anything that could result in a page fault) will save the flags
167  *        to the stack, clear the TF flag, and let the guest execute the
168  *        handler. Normally, the guest will restore the original flags,
169  *        that will continue single-stepping.
170  *
171  *     3. Debuggers running on the guest may wish to set TF to do instruction
172  *        stepping. INT1 events generated by it would be intercepted by us,
173  *        as long as the gdb is connected to QEMU.
174  *
175  * In practice this means that:
176  *     1. Stepping through flags-modifying instructions may cause gdb to
177  *        continue or stop in unexpected places. This will be fully recoverable
178  *        and will not crash the target.
179  *
180  *     2. Stepping over an instruction that triggers an exception will step
181  *        over the exception handler, not into it.
182  *
183  *     3. Debugging the guest via gdb, while running debugger on the guest
184  *        at the same time may lead to unexpected effects. Removing all
185  *        breakpoints set via QEMU will prevent any further interference
186  *        with the guest-level debuggers.
187  *
188  * The limitations can be addressed as shown below:
189  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
190  *        stepping through them. The exact semantics of the instructions is
191  *        defined in the "Combined Volume Set of Intel 64 and IA-32
192  *        Architectures Software Developer's Manuals", however it involves a
193  *        fair amount of corner cases due to compatibility with real mode,
194  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
195  *
196  *     2. We could step into the guest's exception handlers using the following
197  *        sequence:
198  *          a. Temporarily enable catching of all exception types via
199  *             whpx_set_exception_exit_bitmap().
200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
201  *             the original handler.
202  *          c. Patch the original handler, injecting an INT3 at the beginning.
203  *          d. Update the exception exit bitmap to only catch the
204  *             WHvX64ExceptionTypeBreakpointTrap exception.
205  *          e. Let the affected CPU run in the exclusive mode.
206  *          f. Restore the original handler and the exception exit bitmap.
207  *        Note that handling all corner cases related to IDT/GDT is harder
208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
209  *        rough idea.
210  *
211  *     3. In order to properly support guest-level debugging in parallel with
212  *        the QEMU-level debugging, we would need to be able to pass some INT1
213  *        events to the guest. This could be done via the following methods:
214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215  *             it seems to only work for interrupts and not software
216  *             exceptions.
217  *          b. Locating and patching the original handler by parsing IDT/GDT.
218  *             This involves relatively complex logic outlined in the previous
219  *             paragraph.
220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
221  *             RFLAGS, and pushing the old values to stack). This is even more
222  *             complicated than the previous option, since it involves checking
223  *             CPL, gate attributes, and doing various adjustments depending
224  *             on the current CPU mode, whether the CPL is changing, etc.
225  */
/* How a single-instruction step request should be executed (see the large
 * comment above for the TF-based stepping mechanism and its limitations). */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
231 
/* Per-vCPU WHPX state, reached via CPUState::hax_vcpu (see get_whpx_vcpu()). */
struct whpx_vcpu {
    WHV_EMULATOR_HANDLE emulator;     /* WHvEmulator instance used for MMIO/PIO exits */
    bool window_registered;           /* NOTE(review): presumably an interrupt-window
                                       * request is outstanding — confirm in run loop */
    bool interruptable;               /* NOTE(review): presumably vCPU can accept an
                                       * interrupt injection — confirm in run loop */
    bool ready_for_pic_interrupt;
    uint64_t tpr;                     /* cached task priority in CR8 encoding (TPR >> 4) */
    uint64_t apic_base;               /* cached APIC base value */
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
244 
/* Accelerator-wide state; there is a single partition shared by all vCPUs. */
static bool whpx_allowed;                         /* NOTE(review): set elsewhere,
                                                   * presumably when WHPX accel is enabled */
static bool whp_dispatch_initialized;             /* one-time init guard for whp_dispatch
                                                   * — set outside this chunk */
static HMODULE hWinHvPlatform, hWinHvEmulation;   /* NOTE(review): presumably the loaded
                                                   * WinHv DLLs backing whp_dispatch */
static uint32_t max_vcpu_index;
struct whpx_state whpx_global;                    /* holds the partition handle used by
                                                   * every WHv* call in this file */
struct WHPDispatch whp_dispatch;                  /* dynamically resolved WHv* entry points */
251 
252 
253 /*
254  * VP support
255  */
256 
257 static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu)
258 {
259     return (struct whpx_vcpu *)cpu->hax_vcpu;
260 }
261 
262 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
263                                              int r86)
264 {
265     WHV_X64_SEGMENT_REGISTER hs;
266     unsigned flags = qs->flags;
267 
268     hs.Base = qs->base;
269     hs.Limit = qs->limit;
270     hs.Selector = qs->selector;
271 
272     if (v86) {
273         hs.Attributes = 0;
274         hs.SegmentType = 3;
275         hs.Present = 1;
276         hs.DescriptorPrivilegeLevel = 3;
277         hs.NonSystemSegment = 1;
278 
279     } else {
280         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
281 
282         if (r86) {
283             /* hs.Base &= 0xfffff; */
284         }
285     }
286 
287     return hs;
288 }
289 
290 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
291 {
292     SegmentCache qs;
293 
294     qs.base = hs->Base;
295     qs.limit = hs->Limit;
296     qs.selector = hs->Selector;
297 
298     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
299 
300     return qs;
301 }
302 
303 static int whpx_set_tsc(CPUState *cpu)
304 {
305     CPUX86State *env = cpu->env_ptr;
306     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
307     WHV_REGISTER_VALUE tsc_val;
308     HRESULT hr;
309     struct whpx_state *whpx = &whpx_global;
310 
311     /*
312      * Suspend the partition prior to setting the TSC to reduce the variance
313      * in TSC across vCPUs. When the first vCPU runs post suspend, the
314      * partition is automatically resumed.
315      */
316     if (whp_dispatch.WHvSuspendPartitionTime) {
317 
318         /*
319          * Unable to suspend partition while setting TSC is not a fatal
320          * error. It just increases the likelihood of TSC variance between
321          * vCPUs and some guest OS are able to handle that just fine.
322          */
323         hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
324         if (FAILED(hr)) {
325             warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
326         }
327     }
328 
329     tsc_val.Reg64 = env->tsc;
330     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
331         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
332     if (FAILED(hr)) {
333         error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
334         return -1;
335     }
336 
337     return 0;
338 }
339 
340 /*
341  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
342  * however, they use a slightly different encoding. Specifically:
343  *
344  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
345  *
346  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
347  * and IA-32 Architectures Software Developer's Manual.
348  */
349 
/* Drop the low nibble: APIC.TPR[7:4] maps onto CR8[3:0] (see above). */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    const uint64_t cr8 = tpr / 16;

    return cr8;
}
354 
355 static void whpx_set_registers(CPUState *cpu, int level)
356 {
357     struct whpx_state *whpx = &whpx_global;
358     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
359     CPUX86State *env = cpu->env_ptr;
360     X86CPU *x86_cpu = X86_CPU(cpu);
361     struct whpx_register_set vcxt;
362     HRESULT hr;
363     int idx;
364     int idx_next;
365     int i;
366     int v86, r86;
367 
368     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
369 
370     /*
371      * Following MSRs have side effects on the guest or are too heavy for
372      * runtime. Limit them to full state update.
373      */
374     if (level >= WHPX_SET_RESET_STATE) {
375         whpx_set_tsc(cpu);
376     }
377 
378     memset(&vcxt, 0, sizeof(struct whpx_register_set));
379 
380     v86 = (env->eflags & VM_MASK);
381     r86 = !(env->cr[0] & CR0_PE_MASK);
382 
383     vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
384     vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);
385 
386     idx = 0;
387 
388     /* Indexes for first 16 registers match between HV and QEMU definitions */
389     idx_next = 16;
390     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
391         vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
392     }
393     idx = idx_next;
394 
395     /* Same goes for RIP and RFLAGS */
396     assert(whpx_register_names[idx] == WHvX64RegisterRip);
397     vcxt.values[idx++].Reg64 = env->eip;
398 
399     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
400     vcxt.values[idx++].Reg64 = env->eflags;
401 
402     /* Translate 6+4 segment registers. HV and QEMU order matches  */
403     assert(idx == WHvX64RegisterEs);
404     for (i = 0; i < 6; i += 1, idx += 1) {
405         vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
406     }
407 
408     assert(idx == WHvX64RegisterLdtr);
409     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
410 
411     assert(idx == WHvX64RegisterTr);
412     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
413 
414     assert(idx == WHvX64RegisterIdtr);
415     vcxt.values[idx].Table.Base = env->idt.base;
416     vcxt.values[idx].Table.Limit = env->idt.limit;
417     idx += 1;
418 
419     assert(idx == WHvX64RegisterGdtr);
420     vcxt.values[idx].Table.Base = env->gdt.base;
421     vcxt.values[idx].Table.Limit = env->gdt.limit;
422     idx += 1;
423 
424     /* CR0, 2, 3, 4, 8 */
425     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
426     vcxt.values[idx++].Reg64 = env->cr[0];
427     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
428     vcxt.values[idx++].Reg64 = env->cr[2];
429     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
430     vcxt.values[idx++].Reg64 = env->cr[3];
431     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
432     vcxt.values[idx++].Reg64 = env->cr[4];
433     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
434     vcxt.values[idx++].Reg64 = vcpu->tpr;
435 
436     /* 8 Debug Registers - Skipped */
437 
438     /* 16 XMM registers */
439     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
440     idx_next = idx + 16;
441     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
442         vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
443         vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
444     }
445     idx = idx_next;
446 
447     /* 8 FP registers */
448     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
449     for (i = 0; i < 8; i += 1, idx += 1) {
450         vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
451         /* vcxt.values[idx].Fp.AsUINT128.High64 =
452                env->fpregs[i].mmx.MMX_Q(1);
453         */
454     }
455 
456     /* FP control status register */
457     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
458     vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
459     vcxt.values[idx].FpControlStatus.FpStatus =
460         (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
461     vcxt.values[idx].FpControlStatus.FpTag = 0;
462     for (i = 0; i < 8; ++i) {
463         vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
464     }
465     vcxt.values[idx].FpControlStatus.Reserved = 0;
466     vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
467     vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
468     idx += 1;
469 
470     /* XMM control status register */
471     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
472     vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
473     vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
474     vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
475     idx += 1;
476 
477     /* MSRs */
478     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
479     vcxt.values[idx++].Reg64 = env->efer;
480 #ifdef TARGET_X86_64
481     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
482     vcxt.values[idx++].Reg64 = env->kernelgsbase;
483 #endif
484 
485     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
486     vcxt.values[idx++].Reg64 = vcpu->apic_base;
487 
488     /* WHvX64RegisterPat - Skipped */
489 
490     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
491     vcxt.values[idx++].Reg64 = env->sysenter_cs;
492     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
493     vcxt.values[idx++].Reg64 = env->sysenter_eip;
494     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
495     vcxt.values[idx++].Reg64 = env->sysenter_esp;
496     assert(whpx_register_names[idx] == WHvX64RegisterStar);
497     vcxt.values[idx++].Reg64 = env->star;
498 #ifdef TARGET_X86_64
499     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
500     vcxt.values[idx++].Reg64 = env->lstar;
501     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
502     vcxt.values[idx++].Reg64 = env->cstar;
503     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
504     vcxt.values[idx++].Reg64 = env->fmask;
505 #endif
506 
507     /* Interrupt / Event Registers - Skipped */
508 
509     assert(idx == RTL_NUMBER_OF(whpx_register_names));
510 
511     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
512         whpx->partition, cpu->cpu_index,
513         whpx_register_names,
514         RTL_NUMBER_OF(whpx_register_names),
515         &vcxt.values[0]);
516 
517     if (FAILED(hr)) {
518         error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
519                      hr);
520     }
521 
522     return;
523 }
524 
525 static int whpx_get_tsc(CPUState *cpu)
526 {
527     CPUX86State *env = cpu->env_ptr;
528     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
529     WHV_REGISTER_VALUE tsc_val;
530     HRESULT hr;
531     struct whpx_state *whpx = &whpx_global;
532 
533     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
534         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
535     if (FAILED(hr)) {
536         error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
537         return -1;
538     }
539 
540     env->tsc = tsc_val.Reg64;
541     return 0;
542 }
543 
544 static void whpx_get_registers(CPUState *cpu)
545 {
546     struct whpx_state *whpx = &whpx_global;
547     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
548     CPUX86State *env = cpu->env_ptr;
549     X86CPU *x86_cpu = X86_CPU(cpu);
550     struct whpx_register_set vcxt;
551     uint64_t tpr, apic_base;
552     HRESULT hr;
553     int idx;
554     int idx_next;
555     int i;
556 
557     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
558 
559     if (!env->tsc_valid) {
560         whpx_get_tsc(cpu);
561         env->tsc_valid = !runstate_is_running();
562     }
563 
564     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
565         whpx->partition, cpu->cpu_index,
566         whpx_register_names,
567         RTL_NUMBER_OF(whpx_register_names),
568         &vcxt.values[0]);
569     if (FAILED(hr)) {
570         error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
571                      hr);
572     }
573 
574     if (whpx_apic_in_platform()) {
575         /*
576          * Fetch the TPR value from the emulated APIC. It may get overwritten
577          * below with the value from CR8 returned by
578          * WHvGetVirtualProcessorRegisters().
579          */
580         whpx_apic_get(x86_cpu->apic_state);
581         vcpu->tpr = whpx_apic_tpr_to_cr8(
582             cpu_get_apic_tpr(x86_cpu->apic_state));
583     }
584 
585     idx = 0;
586 
587     /* Indexes for first 16 registers match between HV and QEMU definitions */
588     idx_next = 16;
589     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
590         env->regs[idx] = vcxt.values[idx].Reg64;
591     }
592     idx = idx_next;
593 
594     /* Same goes for RIP and RFLAGS */
595     assert(whpx_register_names[idx] == WHvX64RegisterRip);
596     env->eip = vcxt.values[idx++].Reg64;
597     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
598     env->eflags = vcxt.values[idx++].Reg64;
599 
600     /* Translate 6+4 segment registers. HV and QEMU order matches  */
601     assert(idx == WHvX64RegisterEs);
602     for (i = 0; i < 6; i += 1, idx += 1) {
603         env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
604     }
605 
606     assert(idx == WHvX64RegisterLdtr);
607     env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
608     assert(idx == WHvX64RegisterTr);
609     env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
610     assert(idx == WHvX64RegisterIdtr);
611     env->idt.base = vcxt.values[idx].Table.Base;
612     env->idt.limit = vcxt.values[idx].Table.Limit;
613     idx += 1;
614     assert(idx == WHvX64RegisterGdtr);
615     env->gdt.base = vcxt.values[idx].Table.Base;
616     env->gdt.limit = vcxt.values[idx].Table.Limit;
617     idx += 1;
618 
619     /* CR0, 2, 3, 4, 8 */
620     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
621     env->cr[0] = vcxt.values[idx++].Reg64;
622     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
623     env->cr[2] = vcxt.values[idx++].Reg64;
624     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
625     env->cr[3] = vcxt.values[idx++].Reg64;
626     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
627     env->cr[4] = vcxt.values[idx++].Reg64;
628     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
629     tpr = vcxt.values[idx++].Reg64;
630     if (tpr != vcpu->tpr) {
631         vcpu->tpr = tpr;
632         cpu_set_apic_tpr(x86_cpu->apic_state, tpr);
633     }
634 
635     /* 8 Debug Registers - Skipped */
636 
637     /* 16 XMM registers */
638     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
639     idx_next = idx + 16;
640     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
641         env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
642         env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
643     }
644     idx = idx_next;
645 
646     /* 8 FP registers */
647     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
648     for (i = 0; i < 8; i += 1, idx += 1) {
649         env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
650         /* env->fpregs[i].mmx.MMX_Q(1) =
651                vcxt.values[idx].Fp.AsUINT128.High64;
652         */
653     }
654 
655     /* FP control status register */
656     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
657     env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
658     env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
659     env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
660     for (i = 0; i < 8; ++i) {
661         env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
662     }
663     env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
664     env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
665     idx += 1;
666 
667     /* XMM control status register */
668     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
669     env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
670     idx += 1;
671 
672     /* MSRs */
673     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
674     env->efer = vcxt.values[idx++].Reg64;
675 #ifdef TARGET_X86_64
676     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
677     env->kernelgsbase = vcxt.values[idx++].Reg64;
678 #endif
679 
680     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
681     apic_base = vcxt.values[idx++].Reg64;
682     if (apic_base != vcpu->apic_base) {
683         vcpu->apic_base = apic_base;
684         cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
685     }
686 
687     /* WHvX64RegisterPat - Skipped */
688 
689     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
690     env->sysenter_cs = vcxt.values[idx++].Reg64;
691     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
692     env->sysenter_eip = vcxt.values[idx++].Reg64;
693     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
694     env->sysenter_esp = vcxt.values[idx++].Reg64;
695     assert(whpx_register_names[idx] == WHvX64RegisterStar);
696     env->star = vcxt.values[idx++].Reg64;
697 #ifdef TARGET_X86_64
698     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
699     env->lstar = vcxt.values[idx++].Reg64;
700     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
701     env->cstar = vcxt.values[idx++].Reg64;
702     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
703     env->fmask = vcxt.values[idx++].Reg64;
704 #endif
705 
706     /* Interrupt / Event Registers - Skipped */
707 
708     assert(idx == RTL_NUMBER_OF(whpx_register_names));
709 
710     if (whpx_apic_in_platform()) {
711         whpx_apic_get(x86_cpu->apic_state);
712     }
713 
714     x86_update_hflags(env);
715 
716     return;
717 }
718 
719 static HRESULT CALLBACK whpx_emu_ioport_callback(
720     void *ctx,
721     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
722 {
723     MemTxAttrs attrs = { 0 };
724     address_space_rw(&address_space_io, IoAccess->Port, attrs,
725                      &IoAccess->Data, IoAccess->AccessSize,
726                      IoAccess->Direction);
727     return S_OK;
728 }
729 
730 static HRESULT CALLBACK whpx_emu_mmio_callback(
731     void *ctx,
732     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
733 {
734     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
735                            ma->Direction);
736     return S_OK;
737 }
738 
739 static HRESULT CALLBACK whpx_emu_getreg_callback(
740     void *ctx,
741     const WHV_REGISTER_NAME *RegisterNames,
742     UINT32 RegisterCount,
743     WHV_REGISTER_VALUE *RegisterValues)
744 {
745     HRESULT hr;
746     struct whpx_state *whpx = &whpx_global;
747     CPUState *cpu = (CPUState *)ctx;
748 
749     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
750         whpx->partition, cpu->cpu_index,
751         RegisterNames, RegisterCount,
752         RegisterValues);
753     if (FAILED(hr)) {
754         error_report("WHPX: Failed to get virtual processor registers,"
755                      " hr=%08lx", hr);
756     }
757 
758     return hr;
759 }
760 
761 static HRESULT CALLBACK whpx_emu_setreg_callback(
762     void *ctx,
763     const WHV_REGISTER_NAME *RegisterNames,
764     UINT32 RegisterCount,
765     const WHV_REGISTER_VALUE *RegisterValues)
766 {
767     HRESULT hr;
768     struct whpx_state *whpx = &whpx_global;
769     CPUState *cpu = (CPUState *)ctx;
770 
771     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
772         whpx->partition, cpu->cpu_index,
773         RegisterNames, RegisterCount,
774         RegisterValues);
775     if (FAILED(hr)) {
776         error_report("WHPX: Failed to set virtual processor registers,"
777                      " hr=%08lx", hr);
778     }
779 
780     /*
781      * The emulator just successfully wrote the register state. We clear the
782      * dirty state so we avoid the double write on resume of the VP.
783      */
784     cpu->vcpu_dirty = false;
785 
786     return hr;
787 }
788 
789 static HRESULT CALLBACK whpx_emu_translate_callback(
790     void *ctx,
791     WHV_GUEST_VIRTUAL_ADDRESS Gva,
792     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
793     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
794     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
795 {
796     HRESULT hr;
797     struct whpx_state *whpx = &whpx_global;
798     CPUState *cpu = (CPUState *)ctx;
799     WHV_TRANSLATE_GVA_RESULT res;
800 
801     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
802                                       Gva, TranslateFlags, &res, Gpa);
803     if (FAILED(hr)) {
804         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
805     } else {
806         *TranslationResult = res.ResultCode;
807     }
808 
809     return hr;
810 }
811 
/* Callback table handed to the WHvEmulator instruction emulator. */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
820 
821 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
822 {
823     HRESULT hr;
824     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
825     WHV_EMULATOR_STATUS emu_status;
826 
827     hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
828         vcpu->emulator, cpu,
829         &vcpu->exit_ctx.VpContext, ctx,
830         &emu_status);
831     if (FAILED(hr)) {
832         error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
833         return -1;
834     }
835 
836     if (!emu_status.EmulationSuccessful) {
837         error_report("WHPX: Failed to emulate MMIO access with"
838                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
839         return -1;
840     }
841 
842     return 0;
843 }
844 
845 static int whpx_handle_portio(CPUState *cpu,
846                               WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
847 {
848     HRESULT hr;
849     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
850     WHV_EMULATOR_STATUS emu_status;
851 
852     hr = whp_dispatch.WHvEmulatorTryIoEmulation(
853         vcpu->emulator, cpu,
854         &vcpu->exit_ctx.VpContext, ctx,
855         &emu_status);
856     if (FAILED(hr)) {
857         error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
858         return -1;
859     }
860 
861     if (!emu_status.EmulationSuccessful) {
862         error_report("WHPX: Failed to emulate PortIO access with"
863                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
864         return -1;
865     }
866 
867     return 0;
868 }
869 
870 /*
871  * Controls whether we should intercept various exceptions on the guest,
872  * namely breakpoint/single-step events.
873  *
874  * The 'exceptions' argument accepts a bitmask, e.g:
875  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
876  */
877 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
878 {
879     struct whpx_state *whpx = &whpx_global;
880     WHV_PARTITION_PROPERTY prop = { 0, };
881     HRESULT hr;
882 
883     if (exceptions == whpx->exception_exit_bitmap) {
884         return S_OK;
885     }
886 
887     prop.ExceptionExitBitmap = exceptions;
888 
889     hr = whp_dispatch.WHvSetPartitionProperty(
890         whpx->partition,
891         WHvPartitionPropertyCodeExceptionExitBitmap,
892         &prop,
893         sizeof(WHV_PARTITION_PROPERTY));
894 
895     if (SUCCEEDED(hr)) {
896         whpx->exception_exit_bitmap = exceptions;
897     }
898 
899     return hr;
900 }
901 
902 
903 /*
904  * This function is called before/after stepping over a single instruction.
905  * It will update the CPU registers to arm/disarm the instruction stepping
906  * accordingly.
907  */
908 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
909     bool set,
910     uint64_t *exit_context_rflags)
911 {
912     WHV_REGISTER_NAME reg_name;
913     WHV_REGISTER_VALUE reg_value;
914     HRESULT hr;
915     struct whpx_state *whpx = &whpx_global;
916 
917     /*
918      * If we are trying to step over a single instruction, we need to set the
919      * TF bit in rflags. Otherwise, clear it.
920      */
921     reg_name = WHvX64RegisterRflags;
922     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
923         whpx->partition,
924         cpu->cpu_index,
925         &reg_name,
926         1,
927         &reg_value);
928 
929     if (FAILED(hr)) {
930         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
931         return hr;
932     }
933 
934     if (exit_context_rflags) {
935         assert(*exit_context_rflags == reg_value.Reg64);
936     }
937 
938     if (set) {
939         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
940         reg_value.Reg64 |= TF_MASK;
941     } else {
942         reg_value.Reg64 &= ~TF_MASK;
943     }
944 
945     if (exit_context_rflags) {
946         *exit_context_rflags = reg_value.Reg64;
947     }
948 
949     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
950         whpx->partition,
951         cpu->cpu_index,
952         &reg_name,
953         1,
954         &reg_value);
955 
956     if (FAILED(hr)) {
957         error_report("WHPX: Failed to set rflags,"
958             " hr=%08lx",
959             hr);
960         return hr;
961     }
962 
963     reg_name = WHvRegisterInterruptState;
964     reg_value.Reg64 = 0;
965 
966     /* Suspend delivery of hardware interrupts during single-stepping. */
967     reg_value.InterruptState.InterruptShadow = set != 0;
968 
969     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
970     whpx->partition,
971         cpu->cpu_index,
972         &reg_name,
973         1,
974         &reg_value);
975 
976     if (FAILED(hr)) {
977         error_report("WHPX: Failed to set InterruptState,"
978             " hr=%08lx",
979             hr);
980         return hr;
981     }
982 
983     if (!set) {
984         /*
985          * We have just finished stepping over a single instruction,
986          * and intercepted the INT1 generated by it.
987          * We need to now hide the INT1 from the guest,
988          * as it would not be expecting it.
989          */
990 
991         reg_name = WHvX64RegisterPendingDebugException;
992         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
993         whpx->partition,
994             cpu->cpu_index,
995             &reg_name,
996             1,
997             &reg_value);
998 
999         if (FAILED(hr)) {
1000             error_report("WHPX: Failed to get pending debug exceptions,"
1001                          "hr=%08lx", hr);
1002             return hr;
1003         }
1004 
1005         if (reg_value.PendingDebugException.SingleStep) {
1006             reg_value.PendingDebugException.SingleStep = 0;
1007 
1008             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1009                 whpx->partition,
1010                 cpu->cpu_index,
1011                 &reg_name,
1012                 1,
1013                 &reg_value);
1014 
1015             if (FAILED(hr)) {
1016                 error_report("WHPX: Failed to clear pending debug exceptions,"
1017                              "hr=%08lx", hr);
1018              return hr;
1019             }
1020         }
1021 
1022     }
1023 
1024     return S_OK;
1025 }
1026 
1027 /* Tries to find a breakpoint at the specified address. */
1028 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1029 {
1030     struct whpx_state *whpx = &whpx_global;
1031     int i;
1032 
1033     if (whpx->breakpoints.breakpoints) {
1034         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1035             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1036                 return &whpx->breakpoints.breakpoints->data[i];
1037             }
1038         }
1039     }
1040 
1041     return NULL;
1042 }
1043 
1044 /*
1045  * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
1046  * debugging user-mode applications. Since the WHPX API does not offer
1047  * an easy way to pass the intercepted exception back to the guest, we
1048  * resort to using INT1 instead, and let the guest always handle INT3.
1049  */
static const uint8_t whpx_breakpoint_instruction = 0xF1; /* ICEBP/INT1 opcode */
1051 
1052 /*
1053  * The WHPX QEMU backend implements breakpoints by writing the INT1
1054  * instruction into memory (ignoring the DRx registers). This raises a few
1055  * issues that need to be carefully handled:
1056  *
1057  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1058  *    at the same location, and later remove them in arbitrary order.
1059  *    This should not cause memory corruption, and should only remove the
1060  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1061  *
1062  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1063  *    physical location. Hence, physically adding/removing a breakpoint can
1064  *    theoretically fail at any time. We need to keep track of it.
1065  *
1066  * The function below rebuilds a list of low-level breakpoints (one per
1067  * address, tracking the original instruction and any errors) from the list of
1068  * high-level breakpoints (set via cpu_breakpoint_insert()).
1069  *
1070  * In order to optimize performance, this function stores the list of
1071  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1072  * low-level ones, so that it won't be re-invoked until these breakpoints
1073  * change.
1074  *
 * Note that this function decides which breakpoints should be inserted into
 * memory, but doesn't actually do it. The memory accessing is done in
1077  * whpx_apply_breakpoints().
1078  */
1079 static void whpx_translate_cpu_breakpoints(
1080     struct whpx_breakpoints *breakpoints,
1081     CPUState *cpu,
1082     int cpu_breakpoint_count)
1083 {
1084     CPUBreakpoint *bp;
1085     int cpu_bp_index = 0;
1086 
1087     breakpoints->original_addresses =
1088         g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
1089 
1090     breakpoints->original_address_count = cpu_breakpoint_count;
1091 
1092     int max_breakpoints = cpu_breakpoint_count +
1093         (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
1094 
1095     struct whpx_breakpoint_collection *new_breakpoints =
1096         (struct whpx_breakpoint_collection *)g_malloc0(
1097         sizeof(struct whpx_breakpoint_collection) +
1098             max_breakpoints * sizeof(struct whpx_breakpoint));
1099 
1100     new_breakpoints->allocated = max_breakpoints;
1101     new_breakpoints->used = 0;
1102 
1103     /*
1104      * 1. Preserve all old breakpoints that could not be automatically
1105      * cleared when the CPU got stopped.
1106      */
1107     if (breakpoints->breakpoints) {
1108         int i;
1109         for (i = 0; i < breakpoints->breakpoints->used; i++) {
1110             if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
1111                 new_breakpoints->data[new_breakpoints->used++] =
1112                     breakpoints->breakpoints->data[i];
1113             }
1114         }
1115     }
1116 
1117     /* 2. Map all CPU breakpoints to WHPX breakpoints */
1118     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1119         int i;
1120         bool found = false;
1121 
1122         /* This will be used to detect changed CPU breakpoints later. */
1123         breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
1124 
1125         for (i = 0; i < new_breakpoints->used; i++) {
1126             /*
1127              * WARNING: This loop has O(N^2) complexity, where N is the
1128              * number of breakpoints. It should not be a bottleneck in
1129              * real-world scenarios, since it only needs to run once after
1130              * the breakpoints have been modified.
1131              * If this ever becomes a concern, it can be optimized by storing
1132              * high-level breakpoint objects in a tree or hash map.
1133              */
1134 
1135             if (new_breakpoints->data[i].address == bp->pc) {
1136                 /* There was already a breakpoint at this address. */
1137                 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
1138                     new_breakpoints->data[i].state = WHPX_BP_SET;
1139                 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
1140                     new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
1141                 }
1142 
1143                 found = true;
1144                 break;
1145             }
1146         }
1147 
1148         if (!found && new_breakpoints->used < new_breakpoints->allocated) {
1149             /* No WHPX breakpoint at this address. Create one. */
1150             new_breakpoints->data[new_breakpoints->used].address = bp->pc;
1151             new_breakpoints->data[new_breakpoints->used].state =
1152                 WHPX_BP_SET_PENDING;
1153             new_breakpoints->used++;
1154         }
1155     }
1156 
1157     if (breakpoints->breakpoints) {
1158         /*
1159          * Free the previous breakpoint list. This can be optimized by keeping
1160          * it as shadow buffer for the next computation instead of freeing
1161          * it immediately.
1162          */
1163         g_free(breakpoints->breakpoints);
1164     }
1165 
1166     breakpoints->breakpoints = new_breakpoints;
1167 }
1168 
1169 /*
1170  * Physically inserts/removes the breakpoints by reading and writing the
1171  * physical memory, keeping a track of the failed attempts.
1172  *
1173  * Passing resuming=true  will try to set all previously unset breakpoints.
1174  * Passing resuming=false will remove all inserted ones.
1175  */
1176 static void whpx_apply_breakpoints(
1177     struct whpx_breakpoint_collection *breakpoints,
1178     CPUState *cpu,
1179     bool resuming)
1180 {
1181     int i, rc;
1182     if (!breakpoints) {
1183         return;
1184     }
1185 
1186     for (i = 0; i < breakpoints->used; i++) {
1187         /* Decide what to do right now based on the last known state. */
1188         WhpxBreakpointState state = breakpoints->data[i].state;
1189         switch (state) {
1190         case WHPX_BP_CLEARED:
1191             if (resuming) {
1192                 state = WHPX_BP_SET_PENDING;
1193             }
1194             break;
1195         case WHPX_BP_SET_PENDING:
1196             if (!resuming) {
1197                 state = WHPX_BP_CLEARED;
1198             }
1199             break;
1200         case WHPX_BP_SET:
1201             if (!resuming) {
1202                 state = WHPX_BP_CLEAR_PENDING;
1203             }
1204             break;
1205         case WHPX_BP_CLEAR_PENDING:
1206             if (resuming) {
1207                 state = WHPX_BP_SET;
1208             }
1209             break;
1210         }
1211 
1212         if (state == WHPX_BP_SET_PENDING) {
1213             /* Remember the original instruction. */
1214             rc = cpu_memory_rw_debug(cpu,
1215                 breakpoints->data[i].address,
1216                 &breakpoints->data[i].original_instruction,
1217                 1,
1218                 false);
1219 
1220             if (!rc) {
1221                 /* Write the breakpoint instruction. */
1222                 rc = cpu_memory_rw_debug(cpu,
1223                     breakpoints->data[i].address,
1224                     (void *)&whpx_breakpoint_instruction,
1225                     1,
1226                     true);
1227             }
1228 
1229             if (!rc) {
1230                 state = WHPX_BP_SET;
1231             }
1232 
1233         }
1234 
1235         if (state == WHPX_BP_CLEAR_PENDING) {
1236             /* Restore the original instruction. */
1237             rc = cpu_memory_rw_debug(cpu,
1238                 breakpoints->data[i].address,
1239                 &breakpoints->data[i].original_instruction,
1240                 1,
1241                 true);
1242 
1243             if (!rc) {
1244                 state = WHPX_BP_CLEARED;
1245             }
1246         }
1247 
1248         breakpoints->data[i].state = state;
1249     }
1250 }
1251 
1252 /*
1253  * This function is called when the a VCPU is about to start and no other
1254  * VCPUs have been started so far. Since the VCPU start order could be
1255  * arbitrary, it doesn't have to be VCPU#0.
1256  *
1257  * It is used to commit the breakpoints into memory, and configure WHPX
1258  * to intercept debug exceptions.
1259  *
1260  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1261  * more VCPUs are already running, so this is the best place to do it.
1262  */
1263 static int whpx_first_vcpu_starting(CPUState *cpu)
1264 {
1265     struct whpx_state *whpx = &whpx_global;
1266     HRESULT hr;
1267 
1268     g_assert(qemu_mutex_iothread_locked());
1269 
1270     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1271             (whpx->breakpoints.breakpoints &&
1272              whpx->breakpoints.breakpoints->used)) {
1273         CPUBreakpoint *bp;
1274         int i = 0;
1275         bool update_pending = false;
1276 
1277         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1278             if (i >= whpx->breakpoints.original_address_count ||
1279                 bp->pc != whpx->breakpoints.original_addresses[i]) {
1280                 update_pending = true;
1281             }
1282 
1283             i++;
1284         }
1285 
1286         if (i != whpx->breakpoints.original_address_count) {
1287             update_pending = true;
1288         }
1289 
1290         if (update_pending) {
1291             /*
1292              * The CPU breakpoints have changed since the last call to
1293              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1294              * now be recomputed.
1295              */
1296             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1297         }
1298 
1299         /* Actually insert the breakpoints into the memory. */
1300         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1301     }
1302 
1303     uint64_t exception_mask;
1304     if (whpx->step_pending ||
1305         (whpx->breakpoints.breakpoints &&
1306          whpx->breakpoints.breakpoints->used)) {
1307         /*
1308          * We are either attempting to single-step one or more CPUs, or
1309          * have one or more breakpoints enabled. Both require intercepting
1310          * the WHvX64ExceptionTypeBreakpointTrap exception.
1311          */
1312 
1313         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1314     } else {
1315         /* Let the guest handle all exceptions. */
1316         exception_mask = 0;
1317     }
1318 
1319     hr = whpx_set_exception_exit_bitmap(exception_mask);
1320     if (!SUCCEEDED(hr)) {
1321         error_report("WHPX: Failed to update exception exit mask,"
1322                      "hr=%08lx.", hr);
1323         return 1;
1324     }
1325 
1326     return 0;
1327 }
1328 
1329 /*
1330  * This function is called when the last VCPU has finished running.
1331  * It is used to remove any previously set breakpoints from memory.
1332  */
static int whpx_last_vcpu_stopping(CPUState *cpu)
{
    /* Restore the original instructions over all applied breakpoints. */
    whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
    return 0;
}
1338 
1339 /* Returns the address of the next instruction that is about to be executed. */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
    if (cpu->vcpu_dirty) {
        /* The CPU registers have been modified by other parts of QEMU. */
        CPUArchState *env = (CPUArchState *)(cpu->env_ptr);
        return env->eip;
    } else if (exit_context_valid) {
        /*
         * The CPU registers have not been modified, either by other parts
         * of QEMU, or by this port via WHvSetVirtualProcessorRegisters().
         * This is the most common case.
         */
        struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
        return vcpu->exit_ctx.VpContext.Rip;
    } else {
        /*
         * The CPU registers have been modified by a call to
         * WHvSetVirtualProcessorRegisters() and must be re-queried from
         * the target.
         */
        WHV_REGISTER_VALUE reg_value;
        WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
        HRESULT hr;
        struct whpx_state *whpx = &whpx_global;

        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            /* Best effort: callers treat 0 as "PC unavailable". */
            error_report("WHPX: Failed to get PC, hr=%08lx", hr);
            return 0;
        }

        return reg_value.Reg64;
    }
}
1380 
1381 static int whpx_handle_halt(CPUState *cpu)
1382 {
1383     CPUX86State *env = cpu->env_ptr;
1384     int ret = 0;
1385 
1386     qemu_mutex_lock_iothread();
1387     if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1388           (env->eflags & IF_MASK)) &&
1389         !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1390         cpu->exception_index = EXCP_HLT;
1391         cpu->halted = true;
1392         ret = 1;
1393     }
1394     qemu_mutex_unlock_iothread();
1395 
1396     return ret;
1397 }
1398 
1399 static void whpx_vcpu_pre_run(CPUState *cpu)
1400 {
1401     HRESULT hr;
1402     struct whpx_state *whpx = &whpx_global;
1403     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1404     CPUX86State *env = cpu->env_ptr;
1405     X86CPU *x86_cpu = X86_CPU(cpu);
1406     int irq;
1407     uint8_t tpr;
1408     WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
1409     UINT32 reg_count = 0;
1410     WHV_REGISTER_VALUE reg_values[3];
1411     WHV_REGISTER_NAME reg_names[3];
1412 
1413     memset(&new_int, 0, sizeof(new_int));
1414     memset(reg_values, 0, sizeof(reg_values));
1415 
1416     qemu_mutex_lock_iothread();
1417 
1418     /* Inject NMI */
1419     if (!vcpu->interruption_pending &&
1420         cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
1421         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
1422             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
1423             vcpu->interruptable = false;
1424             new_int.InterruptionType = WHvX64PendingNmi;
1425             new_int.InterruptionPending = 1;
1426             new_int.InterruptionVector = 2;
1427         }
1428         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
1429             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
1430         }
1431     }
1432 
1433     /*
1434      * Force the VCPU out of its inner loop to process any INIT requests or
1435      * commit pending TPR access.
1436      */
1437     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
1438         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1439             !(env->hflags & HF_SMM_MASK)) {
1440             cpu->exit_request = 1;
1441         }
1442         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1443             cpu->exit_request = 1;
1444         }
1445     }
1446 
1447     /* Get pending hard interruption or replay one that was overwritten */
1448     if (!whpx_apic_in_platform()) {
1449         if (!vcpu->interruption_pending &&
1450             vcpu->interruptable && (env->eflags & IF_MASK)) {
1451             assert(!new_int.InterruptionPending);
1452             if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1453                 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1454                 irq = cpu_get_pic_interrupt(env);
1455                 if (irq >= 0) {
1456                     new_int.InterruptionType = WHvX64PendingInterrupt;
1457                     new_int.InterruptionPending = 1;
1458                     new_int.InterruptionVector = irq;
1459                 }
1460             }
1461         }
1462 
1463         /* Setup interrupt state if new one was prepared */
1464         if (new_int.InterruptionPending) {
1465             reg_values[reg_count].PendingInterruption = new_int;
1466             reg_names[reg_count] = WHvRegisterPendingInterruption;
1467             reg_count += 1;
1468         }
1469     } else if (vcpu->ready_for_pic_interrupt &&
1470                (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
1471         cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1472         irq = cpu_get_pic_interrupt(env);
1473         if (irq >= 0) {
1474             reg_names[reg_count] = WHvRegisterPendingEvent;
1475             reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
1476             {
1477                 .EventPending = 1,
1478                 .EventType = WHvX64PendingEventExtInt,
1479                 .Vector = irq,
1480             };
1481             reg_count += 1;
1482         }
1483      }
1484 
1485     /* Sync the TPR to the CR8 if was modified during the intercept */
1486     tpr = cpu_get_apic_tpr(x86_cpu->apic_state);
1487     if (tpr != vcpu->tpr) {
1488         vcpu->tpr = tpr;
1489         reg_values[reg_count].Reg64 = tpr;
1490         cpu->exit_request = 1;
1491         reg_names[reg_count] = WHvX64RegisterCr8;
1492         reg_count += 1;
1493     }
1494 
1495     /* Update the state of the interrupt delivery notification */
1496     if (!vcpu->window_registered &&
1497         cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1498         reg_values[reg_count].DeliverabilityNotifications =
1499             (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
1500                 .InterruptNotification = 1
1501             };
1502         vcpu->window_registered = 1;
1503         reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
1504         reg_count += 1;
1505     }
1506 
1507     qemu_mutex_unlock_iothread();
1508     vcpu->ready_for_pic_interrupt = false;
1509 
1510     if (reg_count) {
1511         hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1512             whpx->partition, cpu->cpu_index,
1513             reg_names, reg_count, reg_values);
1514         if (FAILED(hr)) {
1515             error_report("WHPX: Failed to set interrupt state registers,"
1516                          " hr=%08lx", hr);
1517         }
1518     }
1519 
1520     return;
1521 }
1522 
1523 static void whpx_vcpu_post_run(CPUState *cpu)
1524 {
1525     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1526     CPUX86State *env = cpu->env_ptr;
1527     X86CPU *x86_cpu = X86_CPU(cpu);
1528 
1529     env->eflags = vcpu->exit_ctx.VpContext.Rflags;
1530 
1531     uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
1532     if (vcpu->tpr != tpr) {
1533         vcpu->tpr = tpr;
1534         qemu_mutex_lock_iothread();
1535         cpu_set_apic_tpr(x86_cpu->apic_state, vcpu->tpr);
1536         qemu_mutex_unlock_iothread();
1537     }
1538 
1539     vcpu->interruption_pending =
1540         vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
1541 
1542     vcpu->interruptable =
1543         !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
1544 
1545     return;
1546 }
1547 
1548 static void whpx_vcpu_process_async_events(CPUState *cpu)
1549 {
1550     CPUX86State *env = cpu->env_ptr;
1551     X86CPU *x86_cpu = X86_CPU(cpu);
1552     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1553 
1554     if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1555         !(env->hflags & HF_SMM_MASK)) {
1556         whpx_cpu_synchronize_state(cpu);
1557         do_cpu_init(x86_cpu);
1558         vcpu->interruptable = true;
1559     }
1560 
1561     if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
1562         cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
1563         apic_poll_irq(x86_cpu->apic_state);
1564     }
1565 
1566     if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1567          (env->eflags & IF_MASK)) ||
1568         (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1569         cpu->halted = false;
1570     }
1571 
1572     if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
1573         whpx_cpu_synchronize_state(cpu);
1574         do_cpu_sipi(x86_cpu);
1575     }
1576 
1577     if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1578         cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
1579         whpx_cpu_synchronize_state(cpu);
1580         apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
1581                                       env->tpr_access_type);
1582     }
1583 
1584     return;
1585 }
1586 
1587 static int whpx_vcpu_run(CPUState *cpu)
1588 {
1589     HRESULT hr;
1590     struct whpx_state *whpx = &whpx_global;
1591     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1592     struct whpx_breakpoint *stepped_over_bp = NULL;
1593     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1594     int ret;
1595 
1596     g_assert(qemu_mutex_iothread_locked());
1597 
1598     if (whpx->running_cpus++ == 0) {
1599         /* Insert breakpoints into memory, update exception exit bitmap. */
1600         ret = whpx_first_vcpu_starting(cpu);
1601         if (ret != 0) {
1602             return ret;
1603         }
1604     }
1605 
1606     if (whpx->breakpoints.breakpoints &&
1607         whpx->breakpoints.breakpoints->used > 0)
1608     {
1609         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1610         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1611         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1612             stepped_over_bp = NULL;
1613         }
1614 
1615         if (stepped_over_bp) {
1616             /*
1617              * We are trying to run the instruction overwritten by an active
1618              * breakpoint. We will temporarily disable the breakpoint, suspend
1619              * other CPUs, and step over the instruction.
1620              */
1621             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1622         }
1623     }
1624 
1625     if (exclusive_step_mode == WHPX_STEP_NONE) {
1626         whpx_vcpu_process_async_events(cpu);
1627         if (cpu->halted && !whpx_apic_in_platform()) {
1628             cpu->exception_index = EXCP_HLT;
1629             qatomic_set(&cpu->exit_request, false);
1630             return 0;
1631         }
1632     }
1633 
1634     qemu_mutex_unlock_iothread();
1635 
1636     if (exclusive_step_mode != WHPX_STEP_NONE) {
1637         start_exclusive();
1638         g_assert(cpu == current_cpu);
1639         g_assert(!cpu->running);
1640         cpu->running = true;
1641 
1642         hr = whpx_set_exception_exit_bitmap(
1643             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1644         if (!SUCCEEDED(hr)) {
1645             error_report("WHPX: Failed to update exception exit mask, "
1646                          "hr=%08lx.", hr);
1647             return 1;
1648         }
1649 
1650         if (stepped_over_bp) {
1651             /* Temporarily disable the triggered breakpoint. */
1652             cpu_memory_rw_debug(cpu,
1653                 stepped_over_bp->address,
1654                 &stepped_over_bp->original_instruction,
1655                 1,
1656                 true);
1657         }
1658     } else {
1659         cpu_exec_start(cpu);
1660     }
1661 
1662     do {
1663         if (cpu->vcpu_dirty) {
1664             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1665             cpu->vcpu_dirty = false;
1666         }
1667 
1668         if (exclusive_step_mode == WHPX_STEP_NONE) {
1669             whpx_vcpu_pre_run(cpu);
1670 
1671             if (qatomic_read(&cpu->exit_request)) {
1672                 whpx_vcpu_kick(cpu);
1673             }
1674         }
1675 
1676         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1677             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1678         }
1679 
1680         hr = whp_dispatch.WHvRunVirtualProcessor(
1681             whpx->partition, cpu->cpu_index,
1682             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1683 
1684         if (FAILED(hr)) {
1685             error_report("WHPX: Failed to exec a virtual processor,"
1686                          " hr=%08lx", hr);
1687             ret = -1;
1688             break;
1689         }
1690 
1691         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1692             whpx_vcpu_configure_single_stepping(cpu,
1693                 false,
1694                 &vcpu->exit_ctx.VpContext.Rflags);
1695         }
1696 
1697         whpx_vcpu_post_run(cpu);
1698 
1699         switch (vcpu->exit_ctx.ExitReason) {
1700         case WHvRunVpExitReasonMemoryAccess:
1701             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1702             break;
1703 
1704         case WHvRunVpExitReasonX64IoPortAccess:
1705             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1706             break;
1707 
1708         case WHvRunVpExitReasonX64InterruptWindow:
1709             vcpu->ready_for_pic_interrupt = 1;
1710             vcpu->window_registered = 0;
1711             ret = 0;
1712             break;
1713 
1714         case WHvRunVpExitReasonX64ApicEoi:
1715             assert(whpx_apic_in_platform());
1716             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1717             break;
1718 
1719         case WHvRunVpExitReasonX64Halt:
1720             /*
1721              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1722              * longer used.
1723              */
1724             ret = whpx_handle_halt(cpu);
1725             break;
1726 
1727         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1728             WHV_INTERRUPT_CONTROL ipi = {0};
1729             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1730             uint32_t delivery_mode =
1731                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1732             int dest_shorthand =
1733                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1734             bool broadcast = false;
1735             bool include_self = false;
1736             uint32_t i;
1737 
1738             /* We only registered for INIT and SIPI exits. */
1739             if ((delivery_mode != APIC_DM_INIT) &&
1740                 (delivery_mode != APIC_DM_SIPI)) {
1741                 error_report(
1742                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1743                 break;
1744             }
1745 
1746             if (delivery_mode == APIC_DM_INIT) {
1747                 ipi.Type = WHvX64InterruptTypeInit;
1748             } else {
1749                 ipi.Type = WHvX64InterruptTypeSipi;
1750             }
1751 
1752             ipi.DestinationMode =
1753                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1754                     WHvX64InterruptDestinationModeLogical :
1755                     WHvX64InterruptDestinationModePhysical;
1756 
1757             ipi.TriggerMode =
1758                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1759                     WHvX64InterruptTriggerModeLevel :
1760                     WHvX64InterruptTriggerModeEdge;
1761 
1762             ipi.Vector = icr & APIC_VECTOR_MASK;
1763             switch (dest_shorthand) {
1764             /* no shorthand. Bits 56-63 contain the destination. */
1765             case 0:
1766                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1767                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1768                         &ipi, sizeof(ipi));
1769                 if (FAILED(hr)) {
1770                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1771                         hr);
1772                 }
1773 
1774                 break;
1775 
1776             /* self */
1777             case 1:
1778                 include_self = true;
1779                 break;
1780 
1781             /* broadcast, including self */
1782             case 2:
1783                 broadcast = true;
1784                 include_self = true;
1785                 break;
1786 
1787             /* broadcast, excluding self */
1788             case 3:
1789                 broadcast = true;
1790                 break;
1791             }
1792 
1793             if (!broadcast && !include_self) {
1794                 break;
1795             }
1796 
1797             for (i = 0; i <= max_vcpu_index; i++) {
1798                 if (i == cpu->cpu_index && !include_self) {
1799                     continue;
1800                 }
1801 
1802                 /*
1803                  * Assuming that APIC Ids are identity mapped since
1804                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1805                  * are not handled yet and the hypervisor doesn't allow the
1806                  * guest to modify the APIC ID.
1807                  */
1808                 ipi.Destination = i;
1809                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1810                         &ipi, sizeof(ipi));
1811                 if (FAILED(hr)) {
1812                     error_report(
1813                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1814                         i, hr);
1815                 }
1816             }
1817 
1818             break;
1819         }
1820 
1821         case WHvRunVpExitReasonCanceled:
1822             if (exclusive_step_mode != WHPX_STEP_NONE) {
1823                 /*
1824                  * We are trying to step over a single instruction, and
1825                  * likely got a request to stop from another thread.
1826                  * Delay it until we are done stepping
1827                  * over.
1828                  */
1829                 ret = 0;
1830             } else {
1831                 cpu->exception_index = EXCP_INTERRUPT;
1832                 ret = 1;
1833             }
1834             break;
1835         case WHvRunVpExitReasonX64MsrAccess: {
1836             WHV_REGISTER_VALUE reg_values[3] = {0};
1837             WHV_REGISTER_NAME reg_names[3];
1838             UINT32 reg_count;
1839 
1840             reg_names[0] = WHvX64RegisterRip;
1841             reg_names[1] = WHvX64RegisterRax;
1842             reg_names[2] = WHvX64RegisterRdx;
1843 
1844             reg_values[0].Reg64 =
1845                 vcpu->exit_ctx.VpContext.Rip +
1846                 vcpu->exit_ctx.VpContext.InstructionLength;
1847 
1848             /*
1849              * For all unsupported MSR access we:
1850              *     ignore writes
1851              *     return 0 on read.
1852              */
1853             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1854                         1 : 3;
1855 
1856             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1857                 whpx->partition,
1858                 cpu->cpu_index,
1859                 reg_names, reg_count,
1860                 reg_values);
1861 
1862             if (FAILED(hr)) {
1863                 error_report("WHPX: Failed to set MsrAccess state "
1864                              " registers, hr=%08lx", hr);
1865             }
1866             ret = 0;
1867             break;
1868         }
1869         case WHvRunVpExitReasonX64Cpuid: {
1870             WHV_REGISTER_VALUE reg_values[5];
1871             WHV_REGISTER_NAME reg_names[5];
1872             UINT32 reg_count = 5;
1873             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1874             X86CPU *x86_cpu = X86_CPU(cpu);
1875             CPUX86State *env = &x86_cpu->env;
1876 
1877             memset(reg_values, 0, sizeof(reg_values));
1878 
1879             rip = vcpu->exit_ctx.VpContext.Rip +
1880                   vcpu->exit_ctx.VpContext.InstructionLength;
1881             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1882 
1883             /*
1884              * Ideally, these should be supplied to the hypervisor during VCPU
1885              * initialization and it should be able to satisfy this request.
1886              * But, currently, WHPX doesn't support setting CPUID values in the
1887              * hypervisor once the partition has been setup, which is too late
1888              * since VCPUs are realized later. For now, use the values from
1889              * QEMU to satisfy these requests, until WHPX adds support for
1890              * being able to set these values in the hypervisor at runtime.
1891              */
1892             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1893                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1894             switch (cpuid_fn) {
1895             case 0x40000000:
1896                 /* Expose the vmware cpu frequency cpuid leaf */
1897                 rax = 0x40000010;
1898                 rbx = rcx = rdx = 0;
1899                 break;
1900 
1901             case 0x40000010:
1902                 rax = env->tsc_khz;
1903                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1904                 rcx = rdx = 0;
1905                 break;
1906 
1907             case 0x80000001:
1908                 /* Remove any support of OSVW */
1909                 rcx &= ~CPUID_EXT3_OSVW;
1910                 break;
1911             }
1912 
1913             reg_names[0] = WHvX64RegisterRip;
1914             reg_names[1] = WHvX64RegisterRax;
1915             reg_names[2] = WHvX64RegisterRcx;
1916             reg_names[3] = WHvX64RegisterRdx;
1917             reg_names[4] = WHvX64RegisterRbx;
1918 
1919             reg_values[0].Reg64 = rip;
1920             reg_values[1].Reg64 = rax;
1921             reg_values[2].Reg64 = rcx;
1922             reg_values[3].Reg64 = rdx;
1923             reg_values[4].Reg64 = rbx;
1924 
1925             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1926                 whpx->partition, cpu->cpu_index,
1927                 reg_names,
1928                 reg_count,
1929                 reg_values);
1930 
1931             if (FAILED(hr)) {
1932                 error_report("WHPX: Failed to set CpuidAccess state registers,"
1933                              " hr=%08lx", hr);
1934             }
1935             ret = 0;
1936             break;
1937         }
1938         case WHvRunVpExitReasonException:
1939             whpx_get_registers(cpu);
1940 
1941             if ((vcpu->exit_ctx.VpException.ExceptionType ==
1942                  WHvX64ExceptionTypeDebugTrapOrFault) &&
1943                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
1944                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
1945                  whpx_breakpoint_instruction)) {
1946                 /* Stopped at a software breakpoint. */
1947                 cpu->exception_index = EXCP_DEBUG;
1948             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
1949                         WHvX64ExceptionTypeDebugTrapOrFault) &&
1950                        !cpu->singlestep_enabled) {
1951                 /*
1952                  * Just finished stepping over a breakpoint, but the
1953                  * gdb does not expect us to do single-stepping.
1954                  * Don't do anything special.
1955                  */
1956                 cpu->exception_index = EXCP_INTERRUPT;
1957             } else {
1958                 /* Another exception or debug event. Report it to GDB. */
1959                 cpu->exception_index = EXCP_DEBUG;
1960             }
1961 
1962             ret = 1;
1963             break;
1964         case WHvRunVpExitReasonNone:
1965         case WHvRunVpExitReasonUnrecoverableException:
1966         case WHvRunVpExitReasonInvalidVpRegisterValue:
1967         case WHvRunVpExitReasonUnsupportedFeature:
1968         default:
1969             error_report("WHPX: Unexpected VP exit code %d",
1970                          vcpu->exit_ctx.ExitReason);
1971             whpx_get_registers(cpu);
1972             qemu_mutex_lock_iothread();
1973             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
1974             qemu_mutex_unlock_iothread();
1975             break;
1976         }
1977 
1978     } while (!ret);
1979 
1980     if (stepped_over_bp) {
1981         /* Restore the breakpoint we stepped over */
1982         cpu_memory_rw_debug(cpu,
1983             stepped_over_bp->address,
1984             (void *)&whpx_breakpoint_instruction,
1985             1,
1986             true);
1987     }
1988 
1989     if (exclusive_step_mode != WHPX_STEP_NONE) {
1990         g_assert(cpu_in_exclusive_context(cpu));
1991         cpu->running = false;
1992         end_exclusive();
1993 
1994         exclusive_step_mode = WHPX_STEP_NONE;
1995     } else {
1996         cpu_exec_end(cpu);
1997     }
1998 
1999     qemu_mutex_lock_iothread();
2000     current_cpu = cpu;
2001 
2002     if (--whpx->running_cpus == 0) {
2003         whpx_last_vcpu_stopping(cpu);
2004     }
2005 
2006     qatomic_set(&cpu->exit_request, false);
2007 
2008     return ret < 0;
2009 }
2010 
/*
 * Fetch the vCPU register state from the hypervisor into QEMU's CPUState,
 * unless it was already fetched. vcpu_dirty == true means QEMU's cached
 * copy is authoritative and must not be overwritten by another fetch.
 * Runs on the vCPU's own thread via run_on_cpu().
 */
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (!cpu->vcpu_dirty) {
        whpx_get_registers(cpu);
        cpu->vcpu_dirty = true;
    }
}
2018 
/*
 * Push QEMU's register state into the hypervisor after a CPU reset,
 * then mark the cached copy as in sync with the hypervisor.
 */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->vcpu_dirty = false;
}
2025 
/*
 * Push the full QEMU register state into the hypervisor after initial
 * CPU setup, then mark the cached copy as in sync with the hypervisor.
 */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->vcpu_dirty = false;
}
2032 
/*
 * Before loading a VM snapshot, mark QEMU's cached register state as
 * authoritative so it is pushed to the hypervisor rather than fetched.
 */
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
2038 
2039 /*
2040  * CPU support.
2041  */
2042 
/*
 * Ensure QEMU's CPUState reflects the hypervisor's register state.
 * Schedules the fetch on the target vCPU's thread; no-op if the cached
 * state is already current (vcpu_dirty).
 */
void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (!cpu->vcpu_dirty) {
        run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
    }
}
2049 
/* Propagate post-reset register state to the hypervisor (on vCPU thread). */
void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
2054 
/* Propagate post-init full register state to the hypervisor (on vCPU thread). */
void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
2059 
/* Mark cached state authoritative before snapshot load (on vCPU thread). */
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
2064 
/*
 * Latch whether a single-step request is pending into the global WHPX
 * state before the VM resumes execution.
 */
void whpx_cpu_synchronize_pre_resume(bool step_pending)
{
    whpx_global.step_pending = step_pending;
}
2069 
2070 /*
2071  * Vcpu support.
2072  */
2073 
2074 static Error *whpx_migration_blocker;
2075 
/*
 * VM run-state change handler (opaque is the vCPU's CPUX86State):
 * invalidate the cached TSC whenever the VM (re)starts running so the
 * value is refreshed.
 */
static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (running) {
        env->tsc_valid = false;
    }
}
2084 
2085 int whpx_init_vcpu(CPUState *cpu)
2086 {
2087     HRESULT hr;
2088     struct whpx_state *whpx = &whpx_global;
2089     struct whpx_vcpu *vcpu = NULL;
2090     Error *local_error = NULL;
2091     CPUX86State *env = cpu->env_ptr;
2092     X86CPU *x86_cpu = X86_CPU(cpu);
2093     UINT64 freq = 0;
2094     int ret;
2095 
2096     /* Add migration blockers for all unsupported features of the
2097      * Windows Hypervisor Platform
2098      */
2099     if (whpx_migration_blocker == NULL) {
2100         error_setg(&whpx_migration_blocker,
2101                "State blocked due to non-migratable CPUID feature support,"
2102                "dirty memory tracking support, and XSAVE/XRSTOR support");
2103 
2104         if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) {
2105             error_report_err(local_error);
2106             error_free(whpx_migration_blocker);
2107             ret = -EINVAL;
2108             goto error;
2109         }
2110     }
2111 
2112     vcpu = g_new0(struct whpx_vcpu, 1);
2113 
2114     if (!vcpu) {
2115         error_report("WHPX: Failed to allocte VCPU context.");
2116         ret = -ENOMEM;
2117         goto error;
2118     }
2119 
2120     hr = whp_dispatch.WHvEmulatorCreateEmulator(
2121         &whpx_emu_callbacks,
2122         &vcpu->emulator);
2123     if (FAILED(hr)) {
2124         error_report("WHPX: Failed to setup instruction completion support,"
2125                      " hr=%08lx", hr);
2126         ret = -EINVAL;
2127         goto error;
2128     }
2129 
2130     hr = whp_dispatch.WHvCreateVirtualProcessor(
2131         whpx->partition, cpu->cpu_index, 0);
2132     if (FAILED(hr)) {
2133         error_report("WHPX: Failed to create a virtual processor,"
2134                      " hr=%08lx", hr);
2135         whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2136         ret = -EINVAL;
2137         goto error;
2138     }
2139 
2140     /*
2141      * vcpu's TSC frequency is either specified by user, or use the value
2142      * provided by Hyper-V if the former is not present. In the latter case, we
2143      * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
2144      * frequency can be migrated later via this field.
2145      */
2146     if (!env->tsc_khz) {
2147         hr = whp_dispatch.WHvGetCapability(
2148             WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
2149                 NULL);
2150         if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2151             if (FAILED(hr)) {
2152                 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
2153             } else {
2154                 env->tsc_khz = freq / 1000; /* Hz to KHz */
2155             }
2156         }
2157     }
2158 
2159     env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
2160     hr = whp_dispatch.WHvGetCapability(
2161         WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
2162     if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2163         if (FAILED(hr)) {
2164             printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
2165         } else {
2166             env->apic_bus_freq = freq;
2167         }
2168     }
2169 
2170     /*
2171      * If the vmware cpuid frequency leaf option is set, and we have a valid
2172      * tsc value, trap the corresponding cpuid's.
2173      */
2174     if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
2175         UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
2176 
2177         hr = whp_dispatch.WHvSetPartitionProperty(
2178                 whpx->partition,
2179                 WHvPartitionPropertyCodeCpuidExitList,
2180                 cpuidExitList,
2181                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2182 
2183         if (FAILED(hr)) {
2184             error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2185                         hr);
2186             ret = -EINVAL;
2187             goto error;
2188         }
2189     }
2190 
2191     vcpu->interruptable = true;
2192     cpu->vcpu_dirty = true;
2193     cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu;
2194     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
2195     qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr);
2196 
2197     return 0;
2198 
2199 error:
2200     g_free(vcpu);
2201 
2202     return ret;
2203 }
2204 
2205 int whpx_vcpu_exec(CPUState *cpu)
2206 {
2207     int ret;
2208     int fatal;
2209 
2210     for (;;) {
2211         if (cpu->exception_index >= EXCP_INTERRUPT) {
2212             ret = cpu->exception_index;
2213             cpu->exception_index = -1;
2214             break;
2215         }
2216 
2217         fatal = whpx_vcpu_run(cpu);
2218 
2219         if (fatal) {
2220             error_report("WHPX: Failed to exec a virtual processor");
2221             abort();
2222         }
2223     }
2224 
2225     return ret;
2226 }
2227 
2228 void whpx_destroy_vcpu(CPUState *cpu)
2229 {
2230     struct whpx_state *whpx = &whpx_global;
2231     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
2232 
2233     whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
2234     whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2235     g_free(cpu->hax_vcpu);
2236     return;
2237 }
2238 
/*
 * Kick a vCPU: cancel its in-flight WHvRunVirtualProcessor() call so it
 * returns to QEMU and re-evaluates pending requests.
 */
void whpx_vcpu_kick(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    whp_dispatch.WHvCancelRunVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
}
2245 
2246 /*
2247  * Memory support.
2248  */
2249 
2250 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
2251                                 void *host_va, int add, int rom,
2252                                 const char *name)
2253 {
2254     struct whpx_state *whpx = &whpx_global;
2255     HRESULT hr;
2256 
2257     /*
2258     if (add) {
2259         printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2260                (void*)start_pa, (void*)size, host_va,
2261                (rom ? "ROM" : "RAM"), name);
2262     } else {
2263         printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
2264                (void*)start_pa, (void*)size, host_va, name);
2265     }
2266     */
2267 
2268     if (add) {
2269         hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
2270                                          host_va,
2271                                          start_pa,
2272                                          size,
2273                                          (WHvMapGpaRangeFlagRead |
2274                                           WHvMapGpaRangeFlagExecute |
2275                                           (rom ? 0 : WHvMapGpaRangeFlagWrite)));
2276     } else {
2277         hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
2278                                            start_pa,
2279                                            size);
2280     }
2281 
2282     if (FAILED(hr)) {
2283         error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2284                      " Host:%p, hr=%08lx",
2285                      (add ? "MAP" : "UNMAP"), name,
2286                      (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
2287     }
2288 }
2289 
/*
 * Map (add != 0) or unmap a memory region section with the hypervisor.
 * Only RAM-backed regions are considered; the range is clipped to whole
 * host pages, and dropped entirely if nothing page-aligned remains.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int delta;
    uint64_t host_va;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    /* Round the start address up to the next host page boundary... */
    delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
    delta &= ~qemu_real_host_page_mask();
    if (delta > size) {
        return;
    }
    start_pa += delta;
    size -= delta;
    /* ...and truncate the size down to a whole number of host pages. */
    size &= qemu_real_host_page_mask();
    if (!size || (start_pa & ~qemu_real_host_page_mask())) {
        return;
    }

    /* Host virtual address of the (page-aligned) start of the section. */
    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + delta;

    whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}
2320 
/*
 * MemoryListener hook: a region section was added to the address space.
 * Hold a reference for as long as the mapping exists.
 */
static void whpx_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}
2327 
/*
 * MemoryListener hook: a region section was removed from the address
 * space. Unmap first, then drop the reference taken on add.
 */
static void whpx_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}
2334 
/* No batching of mapping updates; each section is processed immediately. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}
2338 
/* Nothing to flush: mapping updates are applied as they arrive. */
static void whpx_transaction_commit(MemoryListener *listener)
{
}
2342 
/*
 * Dirty-log sync callback. WHPX dirty page tracking is not wired up here
 * (it is among the features blocked for migration), so conservatively
 * mark the entire RAM section dirty.
 */
static void whpx_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    memory_region_set_dirty(mr, 0, int128_get64(section->size));
}
2354 
/* Listener that mirrors QEMU's memory topology into WHPX GPA mappings. */
static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = 10,
};
2364 
/* Attach the WHPX memory listener to the system memory address space. */
static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}
2369 
2370 /*
2371  * Load the functions from the given library, using the given handle. If a
2372  * handle is provided, it is used, otherwise the library is opened. The
2373  * handle will be updated on return with the opened one.
2374  */
static bool load_whp_dispatch_fns(HMODULE *handle,
    WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

    #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
    #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
    #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \

    /* Unlike the OPTIONAL variant above, a missing symbol here is fatal. */
    #define WHP_LOAD_FIELD(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \
        if (!whp_dispatch.function_name) { \
            error_report("Could not load function %s", #function_name); \
            goto error; \
        } \

    /* Open the library only if the caller did not pass a handle already. */
    #define WHP_LOAD_LIB(lib_name, handle_lib) \
    if (!handle_lib) { \
        handle_lib = LoadLibrary(lib_name); \
        if (!handle_lib) { \
            error_report("Could not load library %s.", lib_name); \
            goto error; \
        } \
    } \

    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    if (hLib) {
        /*
         * NOTE(review): if *handle was supplied by the caller, this frees a
         * library the caller may still reference, without clearing *handle —
         * confirm callers never reuse the handle after a failure.
         */
        FreeLibrary(hLib);
    }

    return false;
}
2430 
2431 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2432                                    const char *name, void *opaque,
2433                                    Error **errp)
2434 {
2435     struct whpx_state *whpx = &whpx_global;
2436     OnOffSplit mode;
2437 
2438     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2439         return;
2440     }
2441 
2442     switch (mode) {
2443     case ON_OFF_SPLIT_ON:
2444         whpx->kernel_irqchip_allowed = true;
2445         whpx->kernel_irqchip_required = true;
2446         break;
2447 
2448     case ON_OFF_SPLIT_OFF:
2449         whpx->kernel_irqchip_allowed = false;
2450         whpx->kernel_irqchip_required = false;
2451         break;
2452 
2453     case ON_OFF_SPLIT_SPLIT:
2454         error_setg(errp, "WHPX: split irqchip currently not supported");
2455         error_append_hint(errp,
2456             "Try without kernel-irqchip or with kernel-irqchip=on|off");
2457         break;
2458 
2459     default:
2460         /*
2461          * The value was checked in visit_type_OnOffSplit() above. If
2462          * we get here, then something is wrong in QEMU.
2463          */
2464         abort();
2465     }
2466 }
2467 
2468 /*
2469  * Partition support
2470  */
2471 
2472 static int whpx_accel_init(MachineState *ms)
2473 {
2474     struct whpx_state *whpx;
2475     int ret;
2476     HRESULT hr;
2477     WHV_CAPABILITY whpx_cap;
2478     UINT32 whpx_cap_size;
2479     WHV_PARTITION_PROPERTY prop;
2480     UINT32 cpuidExitList[] = {1, 0x80000001};
2481     WHV_CAPABILITY_FEATURES features = {0};
2482 
2483     whpx = &whpx_global;
2484 
2485     if (!init_whp_dispatch()) {
2486         ret = -ENOSYS;
2487         goto error;
2488     }
2489 
2490     whpx->mem_quota = ms->ram_size;
2491 
2492     hr = whp_dispatch.WHvGetCapability(
2493         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2494         sizeof(whpx_cap), &whpx_cap_size);
2495     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2496         error_report("WHPX: No accelerator found, hr=%08lx", hr);
2497         ret = -ENOSPC;
2498         goto error;
2499     }
2500 
2501     hr = whp_dispatch.WHvGetCapability(
2502         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2503     if (FAILED(hr)) {
2504         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2505         ret = -EINVAL;
2506         goto error;
2507     }
2508 
2509     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2510     if (FAILED(hr)) {
2511         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2512         ret = -EINVAL;
2513         goto error;
2514     }
2515 
2516     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2517     prop.ProcessorCount = ms->smp.cpus;
2518     hr = whp_dispatch.WHvSetPartitionProperty(
2519         whpx->partition,
2520         WHvPartitionPropertyCodeProcessorCount,
2521         &prop,
2522         sizeof(WHV_PARTITION_PROPERTY));
2523 
2524     if (FAILED(hr)) {
2525         error_report("WHPX: Failed to set partition core count to %d,"
2526                      " hr=%08lx", ms->smp.cores, hr);
2527         ret = -EINVAL;
2528         goto error;
2529     }
2530 
2531     /*
2532      * Error out if WHP doesn't support apic emulation and user is requiring
2533      * it.
2534      */
2535     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2536             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2537         error_report("WHPX: kernel irqchip requested, but unavailable. "
2538             "Try without kernel-irqchip or with kernel-irqchip=off");
2539         ret = -EINVAL;
2540         goto error;
2541     }
2542 
2543     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2544         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2545         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2546             WHvX64LocalApicEmulationModeXApic;
2547         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2548         hr = whp_dispatch.WHvSetPartitionProperty(
2549             whpx->partition,
2550             WHvPartitionPropertyCodeLocalApicEmulationMode,
2551             &mode,
2552             sizeof(mode));
2553         if (FAILED(hr)) {
2554             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2555             if (whpx->kernel_irqchip_required) {
2556                 error_report("WHPX: kernel irqchip requested, but unavailable");
2557                 ret = -EINVAL;
2558                 goto error;
2559             }
2560         } else {
2561             whpx->apic_in_platform = true;
2562         }
2563     }
2564 
2565     /* Register for MSR and CPUID exits */
2566     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2567     prop.ExtendedVmExits.X64MsrExit = 1;
2568     prop.ExtendedVmExits.X64CpuidExit = 1;
2569     prop.ExtendedVmExits.ExceptionExit = 1;
2570     if (whpx_apic_in_platform()) {
2571         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2572     }
2573 
2574     hr = whp_dispatch.WHvSetPartitionProperty(
2575             whpx->partition,
2576             WHvPartitionPropertyCodeExtendedVmExits,
2577             &prop,
2578             sizeof(WHV_PARTITION_PROPERTY));
2579     if (FAILED(hr)) {
2580         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2581         ret = -EINVAL;
2582         goto error;
2583     }
2584 
2585     hr = whp_dispatch.WHvSetPartitionProperty(
2586         whpx->partition,
2587         WHvPartitionPropertyCodeCpuidExitList,
2588         cpuidExitList,
2589         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2590 
2591     if (FAILED(hr)) {
2592         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2593                      hr);
2594         ret = -EINVAL;
2595         goto error;
2596     }
2597 
2598     /*
2599      * We do not want to intercept any exceptions from the guest,
2600      * until we actually start debugging with gdb.
2601      */
2602     whpx->exception_exit_bitmap = -1;
2603     hr = whpx_set_exception_exit_bitmap(0);
2604 
2605     if (FAILED(hr)) {
2606         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2607         ret = -EINVAL;
2608         goto error;
2609     }
2610 
2611     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2612     if (FAILED(hr)) {
2613         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2614         ret = -EINVAL;
2615         goto error;
2616     }
2617 
2618     whpx_memory_init();
2619 
2620     printf("Windows Hypervisor Platform accelerator is operational\n");
2621     return 0;
2622 
2623 error:
2624 
2625     if (NULL != whpx->partition) {
2626         whp_dispatch.WHvDeletePartition(whpx->partition);
2627         whpx->partition = NULL;
2628     }
2629 
2630     return ret;
2631 }
2632 
/* Return non-zero when the WHPX accelerator has been selected. */
int whpx_enabled(void)
{
    return whpx_allowed;
}
2637 
2638 bool whpx_apic_in_platform(void) {
2639     return whpx_global.apic_in_platform;
2640 }
2641 
/*
 * QOM class init for the WHPX accelerator: name, machine-init hook,
 * enable flag, and the "kernel-irqchip" class property.
 */
static void whpx_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "WHPX";
    ac->init_machine = whpx_accel_init;
    ac->allowed = &whpx_allowed;

    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
        NULL, whpx_set_kernel_irqchip,
        NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure WHPX in-kernel irqchip");
}
2655 
/* QOM instance init: reset the global WHPX state to known defaults. */
static void whpx_accel_instance_init(Object *obj)
{
    struct whpx_state *whpx = &whpx_global;

    memset(whpx, 0, sizeof(struct whpx_state));
    /* Turn on kernel-irqchip, by default */
    whpx->kernel_irqchip_allowed = true;
}
2664 
/* QOM type descriptor for the "whpx" accelerator. */
static const TypeInfo whpx_accel_type = {
    .name = ACCEL_CLASS_NAME("whpx"),
    .parent = TYPE_ACCEL,
    .instance_init = whpx_accel_instance_init,
    .class_init = whpx_accel_class_init,
};
2671 
/* Register the WHPX accelerator QOM type. */
static void whpx_type_init(void)
{
    type_register_static(&whpx_accel_type);
}
2676 
2677 bool init_whp_dispatch(void)
2678 {
2679     if (whp_dispatch_initialized) {
2680         return true;
2681     }
2682 
2683     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2684         goto error;
2685     }
2686 
2687     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2688         goto error;
2689     }
2690 
2691     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2692         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2693     whp_dispatch_initialized = true;
2694 
2695     return true;
2696 error:
2697     if (hWinHvPlatform) {
2698         FreeLibrary(hWinHvPlatform);
2699     }
2700 
2701     if (hWinHvEmulation) {
2702         FreeLibrary(hWinHvEmulation);
2703     }
2704 
2705     return false;
2706 }
2707 
2708 type_init(whpx_type_init);
2709