xref: /openbmc/qemu/target/i386/whpx/whpx-all.c (revision 98107c5d4c1c0a16f1a02a5efbfe01b567215cc6)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "system/address-spaces.h"
14 #include "system/ioport.h"
15 #include "gdbstub/helpers.h"
16 #include "qemu/accel.h"
17 #include "accel/accel-ops.h"
18 #include "system/whpx.h"
19 #include "system/cpus.h"
20 #include "system/runstate.h"
21 #include "qemu/main-loop.h"
22 #include "hw/boards.h"
23 #include "hw/intc/ioapic.h"
24 #include "hw/i386/apic_internal.h"
25 #include "qemu/error-report.h"
26 #include "qapi/error.h"
27 #include "qapi/qapi-types-common.h"
28 #include "qapi/qapi-visit-common.h"
29 #include "migration/blocker.h"
30 #include "host-cpu.h"
31 #include "accel/accel-cpu-target.h"
32 #include <winerror.h>
33 
34 #include "whpx-internal.h"
35 #include "whpx-accel-ops.h"
36 
37 #include <winhvplatform.h>
38 #include <winhvemulation.h>
39 
40 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
41 
/*
 * Every virtual-processor register transferred in bulk by
 * whpx_get_registers()/whpx_set_registers().
 *
 * The order of this table is load-bearing: struct whpx_register_set.values[]
 * is indexed in exactly this order, and both transfer functions assert the
 * expected register name at each index as they walk it.  Do not reorder or
 * insert entries without updating those functions.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers (intentionally not transferred) */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers (intentionally not transferred) */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};
149 
/*
 * Value buffer for one bulk get/set of every register listed in
 * whpx_register_names; values[] is indexed in the same order as that table.
 */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
153 
154 /*
155  * The current implementation of instruction stepping sets the TF flag
156  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
157  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
158  *
159  * This approach has a few limitations:
160  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
161  *        along with the other flags, possibly restoring it later. It would
162  *        result in another INT1 when the flags are restored, triggering
163  *        a stop in gdb that could be cleared by doing another step.
164  *
165  *        Stepping over a POPF/LAHF instruction will let it overwrite the
166  *        TF flags, ending the stepping mode.
167  *
168  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
169  *        or anything that could result in a page fault) will save the flags
170  *        to the stack, clear the TF flag, and let the guest execute the
171  *        handler. Normally, the guest will restore the original flags,
172  *        that will continue single-stepping.
173  *
174  *     3. Debuggers running on the guest may wish to set TF to do instruction
175  *        stepping. INT1 events generated by it would be intercepted by us,
176  *        as long as the gdb is connected to QEMU.
177  *
178  * In practice this means that:
179  *     1. Stepping through flags-modifying instructions may cause gdb to
180  *        continue or stop in unexpected places. This will be fully recoverable
181  *        and will not crash the target.
182  *
183  *     2. Stepping over an instruction that triggers an exception will step
184  *        over the exception handler, not into it.
185  *
186  *     3. Debugging the guest via gdb, while running debugger on the guest
187  *        at the same time may lead to unexpected effects. Removing all
188  *        breakpoints set via QEMU will prevent any further interference
189  *        with the guest-level debuggers.
190  *
191  * The limitations can be addressed as shown below:
192  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
193  *        stepping through them. The exact semantics of the instructions is
194  *        defined in the "Combined Volume Set of Intel 64 and IA-32
195  *        Architectures Software Developer's Manuals", however it involves a
196  *        fair amount of corner cases due to compatibility with real mode,
197  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
198  *
199  *     2. We could step into the guest's exception handlers using the following
200  *        sequence:
201  *          a. Temporarily enable catching of all exception types via
202  *             whpx_set_exception_exit_bitmap().
203  *          b. Once an exception is intercepted, read the IDT/GDT and locate
204  *             the original handler.
205  *          c. Patch the original handler, injecting an INT3 at the beginning.
206  *          d. Update the exception exit bitmap to only catch the
207  *             WHvX64ExceptionTypeBreakpointTrap exception.
208  *          e. Let the affected CPU run in the exclusive mode.
209  *          f. Restore the original handler and the exception exit bitmap.
210  *        Note that handling all corner cases related to IDT/GDT is harder
211  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
212  *        rough idea.
213  *
214  *     3. In order to properly support guest-level debugging in parallel with
215  *        the QEMU-level debugging, we would need to be able to pass some INT1
216  *        events to the guest. This could be done via the following methods:
217  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
218  *             it seems to only work for interrupts and not software
219  *             exceptions.
220  *          b. Locating and patching the original handler by parsing IDT/GDT.
221  *             This involves relatively complex logic outlined in the previous
222  *             paragraph.
223  *          c. Emulating the exception invocation (i.e. manually updating RIP,
224  *             RFLAGS, and pushing the old values to stack). This is even more
225  *             complicated than the previous option, since it involves checking
226  *             CPL, gate attributes, and doing various adjustments depending
227  *             on the current CPU mode, whether the CPL is changing, etc.
228  */
/* How a single-step request should be carried out (see comment above). */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
234 
/* Per-vCPU WHPX accelerator state, reached through CPUState::accel. */
struct AccelCPUState {
    WHV_EMULATOR_HANDLE emulator;   /* WHV instruction emulator instance */
    bool window_registered;         /* NOTE(review): meaning not visible in this chunk */
    bool interruptable;
    bool ready_for_pic_interrupt;
    uint64_t tpr;                   /* cached task priority, CR8 encoding
                                       (see whpx_apic_tpr_to_cr8()) */
    uint64_t apic_base;             /* cached APIC base MSR value */
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
247 
/* True when the WHPX accelerator is available/selected. */
bool whpx_allowed;
static bool whp_dispatch_initialized;           /* whp_dispatch table populated? */
static HMODULE hWinHvPlatform, hWinHvEmulation; /* loaded WinHv DLL handles */
static uint32_t max_vcpu_index;
/* Hypervisor-reported XSAVE capabilities; consulted by whpx_has_xsave(). */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

/* Global partition state shared by all vCPUs. */
struct whpx_state whpx_global;
/* Dynamically resolved WHv* API entry points (all hypercalls go through it). */
struct WHPDispatch whp_dispatch;
256 
whpx_has_xsave(void)257 static bool whpx_has_xsave(void)
258 {
259     return whpx_xsave_cap.XsaveSupport;
260 }
261 
whpx_seg_q2h(const SegmentCache * qs,int v86,int r86)262 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
263                                              int r86)
264 {
265     WHV_X64_SEGMENT_REGISTER hs;
266     unsigned flags = qs->flags;
267 
268     hs.Base = qs->base;
269     hs.Limit = qs->limit;
270     hs.Selector = qs->selector;
271 
272     if (v86) {
273         hs.Attributes = 0;
274         hs.SegmentType = 3;
275         hs.Present = 1;
276         hs.DescriptorPrivilegeLevel = 3;
277         hs.NonSystemSegment = 1;
278 
279     } else {
280         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
281 
282         if (r86) {
283             /* hs.Base &= 0xfffff; */
284         }
285     }
286 
287     return hs;
288 }
289 
whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER * hs)290 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
291 {
292     SegmentCache qs;
293 
294     qs.base = hs->Base;
295     qs.limit = hs->Limit;
296     qs.selector = hs->Selector;
297 
298     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
299 
300     return qs;
301 }
302 
303 /* X64 Extended Control Registers */
whpx_set_xcrs(CPUState * cpu)304 static void whpx_set_xcrs(CPUState *cpu)
305 {
306     HRESULT hr;
307     struct whpx_state *whpx = &whpx_global;
308     WHV_REGISTER_VALUE xcr0;
309     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
310 
311     if (!whpx_has_xsave()) {
312         return;
313     }
314 
315     /* Only xcr0 is supported by the hypervisor currently */
316     xcr0.Reg64 = cpu_env(cpu)->xcr0;
317     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
318         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
319     if (FAILED(hr)) {
320         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
321     }
322 }
323 
/*
 * Write the cached TSC value into the virtual processor.
 * Returns 0 on success, -1 if the hypervisor rejected the write.
 */
static int whpx_set_tsc(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {
        /*
         * Unable to suspend partition while setting TSC is not a fatal
         * error. It just increases the likelihood of TSC variance between
         * vCPUs and some guest OS are able to handle that just fine.
         */
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    tsc_val.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (SUCCEEDED(hr)) {
        return 0;
    }

    error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
    return -1;
}
359 
360 /*
361  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
362  * however, they use a slightly different encoding. Specifically:
363  *
364  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
365  *
366  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
367  * and IA-32 Architectures Software Developer's Manual.
368  *
369  * The functions below translate the value of CR8 to TPR and vice versa.
370  */
371 
/* Convert an APIC TPR value to its CR8 encoding: CR8[3:0] = TPR[7:4]. */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr / 16;
}
376 
/* Convert a CR8 value to the APIC TPR encoding: TPR[7:4] = CR8[3:0]. */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 * 16;
}
381 
/*
 * Push QEMU's cached CPU state (GPRs, RIP/RFLAGS, segments, descriptor
 * tables, control registers, FPU/XMM state, and MSRs) into the hypervisor's
 * virtual processor with a single WHvSetVirtualProcessorRegisters() call.
 *
 * @cpu:   vCPU whose state is written out; must be stopped or be the
 *         calling thread's own vCPU (asserted below).
 * @level: update depth; WHPX_SET_RESET_STATE and above also writes
 *         heavyweight/side-effecting state such as the TSC.
 *
 * vcxt.values[] is filled in the exact order of whpx_register_names; the
 * interleaved asserts check that the two stay in sync at each step.
 */
static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    /* Segment translation depends on virtual-8086 / real mode. */
    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /*
     * Translate 6+4 segment registers. HV and QEMU order matches; the
     * register-name enum values coincide with the array indices here.
     */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers; only the low 64 bits (MMX view) are transferred */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    /* Re-pack FP status: TOP (fpstt) occupies bits 13:11 of FSW. */
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        /* QEMU's fptags[i] is "empty"; HV wants a "valid" bit per register. */
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* Every entry of whpx_register_names must have been filled in. */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}
555 
/*
 * Read the virtual processor's TSC into QEMU's cached state.
 * Returns 0 on success, -1 if the hypervisor call failed.
 */
static int whpx_get_tsc(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (SUCCEEDED(hr)) {
        cpu_env(cpu)->tsc = tsc_val.Reg64;
        return 0;
    }

    error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
    return -1;
}
573 
574 /* X64 Extended Control Registers */
whpx_get_xcrs(CPUState * cpu)575 static void whpx_get_xcrs(CPUState *cpu)
576 {
577     HRESULT hr;
578     struct whpx_state *whpx = &whpx_global;
579     WHV_REGISTER_VALUE xcr0;
580     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
581 
582     if (!whpx_has_xsave()) {
583         return;
584     }
585 
586     /* Only xcr0 is supported by the hypervisor currently */
587     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
588         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
589     if (FAILED(hr)) {
590         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
591         return;
592     }
593 
594     cpu_env(cpu)->xcr0 = xcr0.Reg64;
595 }
596 
/*
 * Pull the complete virtual-processor state out of the hypervisor and store
 * it into QEMU's CPUX86State, using one bulk
 * WHvGetVirtualProcessorRegisters() call over whpx_register_names.
 *
 * @cpu: vCPU whose state is refreshed; must be stopped or be the calling
 *       thread's own vCPU (asserted below).
 *
 * vcxt.values[] is consumed in the exact order of whpx_register_names;
 * the interleaved asserts verify the two stay in sync.  Finishes by
 * recomputing env->hflags from the freshly loaded state.
 */
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /* Refresh the TSC; keep it cached while the VM stays paused. */
    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /*
     * Translate 6+4 segment registers. HV and QEMU order matches; the
     * register-name enum values coincide with the array indices here.
     */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    /* Propagate CR8 to the APIC's TPR only when it actually changed. */
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers; only the low 64 bits (MMX view) are transferred */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    /* Unpack FSW: TOP (bits 13:11) goes to fpstt, the rest to fpus. */
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    for (i = 0; i < 8; ++i) {
        /* HV reports a "valid" bit per register; QEMU stores "empty". */
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    /* Push the APIC base to the emulated APIC only when it changed. */
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* Every entry of whpx_register_names must have been consumed. */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);
}
775 
whpx_emu_ioport_callback(void * ctx,WHV_EMULATOR_IO_ACCESS_INFO * IoAccess)776 static HRESULT CALLBACK whpx_emu_ioport_callback(
777     void *ctx,
778     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
779 {
780     MemTxAttrs attrs = { 0 };
781     address_space_rw(&address_space_io, IoAccess->Port, attrs,
782                      &IoAccess->Data, IoAccess->AccessSize,
783                      IoAccess->Direction);
784     return S_OK;
785 }
786 
whpx_emu_mmio_callback(void * ctx,WHV_EMULATOR_MEMORY_ACCESS_INFO * ma)787 static HRESULT CALLBACK whpx_emu_mmio_callback(
788     void *ctx,
789     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
790 {
791     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
792                            ma->Direction);
793     return S_OK;
794 }
795 
whpx_emu_getreg_callback(void * ctx,const WHV_REGISTER_NAME * RegisterNames,UINT32 RegisterCount,WHV_REGISTER_VALUE * RegisterValues)796 static HRESULT CALLBACK whpx_emu_getreg_callback(
797     void *ctx,
798     const WHV_REGISTER_NAME *RegisterNames,
799     UINT32 RegisterCount,
800     WHV_REGISTER_VALUE *RegisterValues)
801 {
802     HRESULT hr;
803     struct whpx_state *whpx = &whpx_global;
804     CPUState *cpu = (CPUState *)ctx;
805 
806     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
807         whpx->partition, cpu->cpu_index,
808         RegisterNames, RegisterCount,
809         RegisterValues);
810     if (FAILED(hr)) {
811         error_report("WHPX: Failed to get virtual processor registers,"
812                      " hr=%08lx", hr);
813     }
814 
815     return hr;
816 }
817 
whpx_emu_setreg_callback(void * ctx,const WHV_REGISTER_NAME * RegisterNames,UINT32 RegisterCount,const WHV_REGISTER_VALUE * RegisterValues)818 static HRESULT CALLBACK whpx_emu_setreg_callback(
819     void *ctx,
820     const WHV_REGISTER_NAME *RegisterNames,
821     UINT32 RegisterCount,
822     const WHV_REGISTER_VALUE *RegisterValues)
823 {
824     HRESULT hr;
825     struct whpx_state *whpx = &whpx_global;
826     CPUState *cpu = (CPUState *)ctx;
827 
828     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
829         whpx->partition, cpu->cpu_index,
830         RegisterNames, RegisterCount,
831         RegisterValues);
832     if (FAILED(hr)) {
833         error_report("WHPX: Failed to set virtual processor registers,"
834                      " hr=%08lx", hr);
835     }
836 
837     /*
838      * The emulator just successfully wrote the register state. We clear the
839      * dirty state so we avoid the double write on resume of the VP.
840      */
841     cpu->vcpu_dirty = false;
842 
843     return hr;
844 }
845 
whpx_emu_translate_callback(void * ctx,WHV_GUEST_VIRTUAL_ADDRESS Gva,WHV_TRANSLATE_GVA_FLAGS TranslateFlags,WHV_TRANSLATE_GVA_RESULT_CODE * TranslationResult,WHV_GUEST_PHYSICAL_ADDRESS * Gpa)846 static HRESULT CALLBACK whpx_emu_translate_callback(
847     void *ctx,
848     WHV_GUEST_VIRTUAL_ADDRESS Gva,
849     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
850     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
851     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
852 {
853     HRESULT hr;
854     struct whpx_state *whpx = &whpx_global;
855     CPUState *cpu = (CPUState *)ctx;
856     WHV_TRANSLATE_GVA_RESULT res;
857 
858     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
859                                       Gva, TranslateFlags, &res, Gpa);
860     if (FAILED(hr)) {
861         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
862     } else {
863         *TranslationResult = res.ResultCode;
864     }
865 
866     return hr;
867 }
868 
/*
 * Callback table for the WHPX instruction emulator. The emulator invokes
 * these to perform port I/O, MMIO, register access and GVA translation on
 * behalf of the guest (presumably registered when the per-VCPU emulator is
 * created — confirm against the VCPU init path outside this chunk).
 */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
877 
/*
 * Emulates the MMIO access that caused the current VM exit.
 * Returns 0 on success, -1 if parsing or emulation failed.
 */
static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;
    HRESULT hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
        vcpu->emulator, cpu, &vcpu->exit_ctx.VpContext, ctx, &emu_status);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }
    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate MMIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }
    return 0;
}
901 
/*
 * Emulates the port I/O access that caused the current VM exit.
 * Returns 0 on success, -1 if parsing or emulation failed.
 */
static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;
    HRESULT hr = whp_dispatch.WHvEmulatorTryIoEmulation(
        vcpu->emulator, cpu, &vcpu->exit_ctx.VpContext, ctx, &emu_status);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }
    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate PortIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }
    return 0;
}
926 
927 /*
928  * Controls whether we should intercept various exceptions on the guest,
929  * namely breakpoint/single-step events.
930  *
931  * The 'exceptions' argument accepts a bitmask, e.g:
932  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
933  */
whpx_set_exception_exit_bitmap(UINT64 exceptions)934 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
935 {
936     struct whpx_state *whpx = &whpx_global;
937     WHV_PARTITION_PROPERTY prop = { 0, };
938     HRESULT hr;
939 
940     if (exceptions == whpx->exception_exit_bitmap) {
941         return S_OK;
942     }
943 
944     prop.ExceptionExitBitmap = exceptions;
945 
946     hr = whp_dispatch.WHvSetPartitionProperty(
947         whpx->partition,
948         WHvPartitionPropertyCodeExceptionExitBitmap,
949         &prop,
950         sizeof(WHV_PARTITION_PROPERTY));
951 
952     if (SUCCEEDED(hr)) {
953         whpx->exception_exit_bitmap = exceptions;
954     }
955 
956     return hr;
957 }
958 
959 
960 /*
961  * This function is called before/after stepping over a single instruction.
962  * It will update the CPU registers to arm/disarm the instruction stepping
963  * accordingly.
964  */
whpx_vcpu_configure_single_stepping(CPUState * cpu,bool set,uint64_t * exit_context_rflags)965 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
966     bool set,
967     uint64_t *exit_context_rflags)
968 {
969     WHV_REGISTER_NAME reg_name;
970     WHV_REGISTER_VALUE reg_value;
971     HRESULT hr;
972     struct whpx_state *whpx = &whpx_global;
973 
974     /*
975      * If we are trying to step over a single instruction, we need to set the
976      * TF bit in rflags. Otherwise, clear it.
977      */
978     reg_name = WHvX64RegisterRflags;
979     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
980         whpx->partition,
981         cpu->cpu_index,
982         &reg_name,
983         1,
984         &reg_value);
985 
986     if (FAILED(hr)) {
987         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
988         return hr;
989     }
990 
991     if (exit_context_rflags) {
992         assert(*exit_context_rflags == reg_value.Reg64);
993     }
994 
995     if (set) {
996         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
997         reg_value.Reg64 |= TF_MASK;
998     } else {
999         reg_value.Reg64 &= ~TF_MASK;
1000     }
1001 
1002     if (exit_context_rflags) {
1003         *exit_context_rflags = reg_value.Reg64;
1004     }
1005 
1006     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1007         whpx->partition,
1008         cpu->cpu_index,
1009         &reg_name,
1010         1,
1011         &reg_value);
1012 
1013     if (FAILED(hr)) {
1014         error_report("WHPX: Failed to set rflags,"
1015             " hr=%08lx",
1016             hr);
1017         return hr;
1018     }
1019 
1020     reg_name = WHvRegisterInterruptState;
1021     reg_value.Reg64 = 0;
1022 
1023     /* Suspend delivery of hardware interrupts during single-stepping. */
1024     reg_value.InterruptState.InterruptShadow = set != 0;
1025 
1026     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1027     whpx->partition,
1028         cpu->cpu_index,
1029         &reg_name,
1030         1,
1031         &reg_value);
1032 
1033     if (FAILED(hr)) {
1034         error_report("WHPX: Failed to set InterruptState,"
1035             " hr=%08lx",
1036             hr);
1037         return hr;
1038     }
1039 
1040     if (!set) {
1041         /*
1042          * We have just finished stepping over a single instruction,
1043          * and intercepted the INT1 generated by it.
1044          * We need to now hide the INT1 from the guest,
1045          * as it would not be expecting it.
1046          */
1047 
1048         reg_name = WHvX64RegisterPendingDebugException;
1049         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1050         whpx->partition,
1051             cpu->cpu_index,
1052             &reg_name,
1053             1,
1054             &reg_value);
1055 
1056         if (FAILED(hr)) {
1057             error_report("WHPX: Failed to get pending debug exceptions,"
1058                          "hr=%08lx", hr);
1059             return hr;
1060         }
1061 
1062         if (reg_value.PendingDebugException.SingleStep) {
1063             reg_value.PendingDebugException.SingleStep = 0;
1064 
1065             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1066                 whpx->partition,
1067                 cpu->cpu_index,
1068                 &reg_name,
1069                 1,
1070                 &reg_value);
1071 
1072             if (FAILED(hr)) {
1073                 error_report("WHPX: Failed to clear pending debug exceptions,"
1074                              "hr=%08lx", hr);
1075              return hr;
1076             }
1077         }
1078 
1079     }
1080 
1081     return S_OK;
1082 }
1083 
1084 /* Tries to find a breakpoint at the specified address. */
whpx_lookup_breakpoint_by_addr(uint64_t address)1085 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1086 {
1087     struct whpx_state *whpx = &whpx_global;
1088     int i;
1089 
1090     if (whpx->breakpoints.breakpoints) {
1091         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1092             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1093                 return &whpx->breakpoints.breakpoints->data[i];
1094             }
1095         }
1096     }
1097 
1098     return NULL;
1099 }
1100 
/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 */
/* 0xF1 is the INT1/icebp opcode written into guest memory by
 * whpx_apply_breakpoints() to arm a breakpoint. */
static const uint8_t whpx_breakpoint_instruction = 0xF1;
1108 
1109 /*
1110  * The WHPX QEMU backend implements breakpoints by writing the INT1
1111  * instruction into memory (ignoring the DRx registers). This raises a few
1112  * issues that need to be carefully handled:
1113  *
1114  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1115  *    at the same location, and later remove them in arbitrary order.
1116  *    This should not cause memory corruption, and should only remove the
1117  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1118  *
1119  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1120  *    physical location. Hence, physically adding/removing a breakpoint can
1121  *    theoretically fail at any time. We need to keep track of it.
1122  *
1123  * The function below rebuilds a list of low-level breakpoints (one per
1124  * address, tracking the original instruction and any errors) from the list of
1125  * high-level breakpoints (set via cpu_breakpoint_insert()).
1126  *
1127  * In order to optimize performance, this function stores the list of
1128  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1129  * low-level ones, so that it won't be re-invoked until these breakpoints
1130  * change.
1131  *
1132  * Note that this function decides which breakpoints should be inserted into,
1133  * memory, but doesn't actually do it. The memory accessing is done in
1134  * whpx_apply_breakpoints().
1135  */
static void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *cpu_bp;
    struct whpx_breakpoint_collection *updated;
    int alloc_count;
    int original_index = 0;

    /* Snapshot the CPU breakpoint addresses used for this computation. */
    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
    breakpoints->original_address_count = cpu_breakpoint_count;

    /* Worst case: every surviving old breakpoint plus every CPU one. */
    alloc_count = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    updated = g_malloc0(sizeof(struct whpx_breakpoint_collection)
                        + alloc_count * sizeof(struct whpx_breakpoint));
    updated->allocated = alloc_count;
    updated->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        for (int i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                updated->data[updated->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(cpu_bp, &cpu->breakpoints, entry) {
        bool found = false;

        /* This will be used to detect changed CPU breakpoints later. */
        breakpoints->original_addresses[original_index++] = cpu_bp->pc;

        for (int i = 0; i < updated->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */
            if (updated->data[i].address == cpu_bp->pc) {
                /* There was already a breakpoint at this address. */
                if (updated->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    updated->data[i].state = WHPX_BP_SET;
                } else if (updated->data[i].state == WHPX_BP_SET) {
                    updated->data[i].state = WHPX_BP_SET_PENDING;
                }
                found = true;
                break;
            }
        }

        if (!found && updated->used < updated->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            updated->data[updated->used].address = cpu_bp->pc;
            updated->data[updated->used].state = WHPX_BP_SET_PENDING;
            updated->used++;
        }
    }

    /*
     * Free the previous breakpoint list. This can be optimized by keeping
     * it as shadow buffer for the next computation instead of freeing
     * it immediately.
     */
    g_free(breakpoints->breakpoints);
    breakpoints->breakpoints = updated;
}
1222 
1223 /*
1224  * Physically inserts/removes the breakpoints by reading and writing the
1225  * physical memory, keeping a track of the failed attempts.
1226  *
1227  * Passing resuming=true  will try to set all previously unset breakpoints.
1228  * Passing resuming=false will remove all inserted ones.
1229  */
static void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int rc;

    if (!breakpoints) {
        return;
    }

    for (int i = 0; i < breakpoints->used; i++) {
        struct whpx_breakpoint *bp = &breakpoints->data[i];
        WhpxBreakpointState state = bp->state;

        /*
         * Decide what to do right now based on the last known state:
         * resuming arms cleared/clear-pending breakpoints, stopping
         * disarms pending/set ones.
         */
        if (resuming) {
            if (state == WHPX_BP_CLEARED) {
                state = WHPX_BP_SET_PENDING;
            } else if (state == WHPX_BP_CLEAR_PENDING) {
                state = WHPX_BP_SET;
            }
        } else {
            if (state == WHPX_BP_SET_PENDING) {
                state = WHPX_BP_CLEARED;
            } else if (state == WHPX_BP_SET) {
                state = WHPX_BP_CLEAR_PENDING;
            }
        }

        if (state == WHPX_BP_SET_PENDING) {
            /* Remember the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                                     bp->address,
                                     &bp->original_instruction,
                                     1,
                                     false);
            if (!rc) {
                /* Write the breakpoint instruction. */
                rc = cpu_memory_rw_debug(cpu,
                                         bp->address,
                                         (void *)&whpx_breakpoint_instruction,
                                         1,
                                         true);
            }
            if (!rc) {
                state = WHPX_BP_SET;
            }
        }

        if (state == WHPX_BP_CLEAR_PENDING) {
            /* Restore the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                                     bp->address,
                                     &bp->original_instruction,
                                     1,
                                     true);
            if (!rc) {
                state = WHPX_BP_CLEARED;
            }
        }

        bp->state = state;
    }
}
1305 
1306 /*
1307  * This function is called when the a VCPU is about to start and no other
1308  * VCPUs have been started so far. Since the VCPU start order could be
1309  * arbitrary, it doesn't have to be VCPU#0.
1310  *
1311  * It is used to commit the breakpoints into memory, and configure WHPX
1312  * to intercept debug exceptions.
1313  *
1314  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1315  * more VCPUs are already running, so this is the best place to do it.
1316  */
/*
 * Commits pending breakpoints into guest memory and programs the exception
 * exit bitmap before the first VCPU starts running. Returns 0 on success,
 * 1 if the exception exit bitmap could not be updated.
 */
static int whpx_first_vcpu_starting(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    uint64_t exception_mask;
    HRESULT hr;

    g_assert(bql_locked());

    if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
            (whpx->breakpoints.breakpoints &&
             whpx->breakpoints.breakpoints->used)) {
        CPUBreakpoint *bp;
        int i = 0;
        bool update_pending = false;

        /* Compare the CPU breakpoint list against the last snapshot. */
        QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
            if (i >= whpx->breakpoints.original_address_count ||
                bp->pc != whpx->breakpoints.original_addresses[i]) {
                update_pending = true;
            }

            i++;
        }

        if (i != whpx->breakpoints.original_address_count) {
            update_pending = true;
        }

        if (update_pending) {
            /*
             * The CPU breakpoints have changed since the last call to
             * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
             * now be recomputed.
             */
            whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
        }

        /* Actually insert the breakpoints into the memory. */
        whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
    }

    if (whpx->step_pending ||
        (whpx->breakpoints.breakpoints &&
         whpx->breakpoints.breakpoints->used)) {
        /*
         * We are either attempting to single-step one or more CPUs, or
         * have one or more breakpoints enabled. Both require intercepting
         * the WHvX64ExceptionTypeBreakpointTrap exception.
         */
        exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
    } else {
        /* Let the guest handle all exceptions. */
        exception_mask = 0;
    }

    hr = whpx_set_exception_exit_bitmap(exception_mask);
    if (FAILED(hr)) {
        /* Note: space added so the message doesn't read "mask,hr=". */
        error_report("WHPX: Failed to update exception exit mask,"
                     " hr=%08lx.", hr);
        return 1;
    }

    return 0;
}
1382 
1383 /*
1384  * This function is called when the last VCPU has finished running.
1385  * It is used to remove any previously set breakpoints from memory.
1386  */
whpx_last_vcpu_stopping(CPUState * cpu)1387 static int whpx_last_vcpu_stopping(CPUState *cpu)
1388 {
1389     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1390     return 0;
1391 }
1392 
1393 /* Returns the address of the next instruction that is about to be executed. */
whpx_vcpu_get_pc(CPUState * cpu,bool exit_context_valid)1394 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1395 {
1396     if (cpu->vcpu_dirty) {
1397         /* The CPU registers have been modified by other parts of QEMU. */
1398         return cpu_env(cpu)->eip;
1399     } else if (exit_context_valid) {
1400         /*
1401          * The CPU registers have not been modified by neither other parts
1402          * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1403          * This is the most common case.
1404          */
1405         AccelCPUState *vcpu = cpu->accel;
1406         return vcpu->exit_ctx.VpContext.Rip;
1407     } else {
1408         /*
1409          * The CPU registers have been modified by a call to
1410          * WHvSetVirtualProcessorRegisters() and must be re-queried from
1411          * the target.
1412          */
1413         WHV_REGISTER_VALUE reg_value;
1414         WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1415         HRESULT hr;
1416         struct whpx_state *whpx = &whpx_global;
1417 
1418         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1419             whpx->partition,
1420             cpu->cpu_index,
1421             &reg_name,
1422             1,
1423             &reg_value);
1424 
1425         if (FAILED(hr)) {
1426             error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1427             return 0;
1428         }
1429 
1430         return reg_value.Reg64;
1431     }
1432 }
1433 
/*
 * Handles a guest HLT: halts the CPU unless an interrupt or NMI can be
 * delivered right now. Returns 1 when the CPU was halted, 0 otherwise.
 */
static int whpx_handle_halt(CPUState *cpu)
{
    bool irq_deliverable;
    bool nmi_pending;
    int ret = 0;

    bql_lock();
    irq_deliverable = (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
                      (cpu_env(cpu)->eflags & IF_MASK);
    nmi_pending = cpu->interrupt_request & CPU_INTERRUPT_NMI;
    if (!irq_deliverable && !nmi_pending) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }
    bql_unlock();

    return ret;
}
1450 
/*
 * Prepares the VCPU for the next WHvRunVirtualProcessor() call: injects a
 * pending NMI, queues a pending PIC interrupt (or an ExtInt event when the
 * APIC is in the platform), syncs the TPR into CR8, and registers for an
 * interrupt-window notification. All accumulated register updates are then
 * pushed to the virtual processor in a single WHvSetVirtualProcessorRegisters
 * call. Runs with the BQL taken only around the interrupt bookkeeping.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    /* Up to 3 registers may be queued below; reg_count tracks how many. */
    UINT32 reg_count = 0;
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    bql_lock();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            /* NMI is always delivered through vector 2. */
            new_int.InterruptionVector = 2;
        }
        /* SMI is acknowledged but not injected (no SMM support here). */
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        /* QEMU-emulated APIC/PIC: inject via PendingInterruption. */
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        /* In-platform APIC: deliver the PIC interrupt as an ExtInt event. */
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
     }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    /* Push all accumulated register updates in a single hypervisor call. */
    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }
}
1572 
/*
 * Mirrors relevant state from the exit context back into QEMU after
 * WHvRunVirtualProcessor() returns: rflags, the TPR (via CR8), and the
 * interruption/interrupt-shadow flags.
 */
static void whpx_vcpu_post_run(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    uint64_t new_tpr;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    /* Propagate a guest-modified CR8 back to the APIC, under the BQL. */
    new_tpr = vcpu->exit_ctx.VpContext.Cr8;
    if (vcpu->tpr != new_tpr) {
        vcpu->tpr = new_tpr;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}
1595 
/*
 * Handles asynchronous VCPU events (INIT, APIC poll, pending IRQ/NMI wakeup,
 * SIPI, TPR access reporting) before entering the run loop. The relative
 * order of the checks below is preserved from the original implementation.
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = cpu_env(cpu);
    AccelCPUState *vcpu = cpu->accel;

    /* A pending INIT outside of SMM resets the CPU. */
    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    /* Wake a halted CPU if an interrupt or NMI can now be delivered. */
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        cpu_reset_interrupt(cpu, CPU_INTERRUPT_SIPI);
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }
}
1633 
whpx_vcpu_run(CPUState * cpu)1634 static int whpx_vcpu_run(CPUState *cpu)
1635 {
1636     HRESULT hr;
1637     struct whpx_state *whpx = &whpx_global;
1638     AccelCPUState *vcpu = cpu->accel;
1639     struct whpx_breakpoint *stepped_over_bp = NULL;
1640     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1641     int ret;
1642 
1643     g_assert(bql_locked());
1644 
1645     if (whpx->running_cpus++ == 0) {
1646         /* Insert breakpoints into memory, update exception exit bitmap. */
1647         ret = whpx_first_vcpu_starting(cpu);
1648         if (ret != 0) {
1649             return ret;
1650         }
1651     }
1652 
1653     if (whpx->breakpoints.breakpoints &&
1654         whpx->breakpoints.breakpoints->used > 0)
1655     {
1656         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1657         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1658         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1659             stepped_over_bp = NULL;
1660         }
1661 
1662         if (stepped_over_bp) {
1663             /*
1664              * We are trying to run the instruction overwritten by an active
1665              * breakpoint. We will temporarily disable the breakpoint, suspend
1666              * other CPUs, and step over the instruction.
1667              */
1668             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1669         }
1670     }
1671 
1672     if (exclusive_step_mode == WHPX_STEP_NONE) {
1673         whpx_vcpu_process_async_events(cpu);
1674         if (cpu->halted && !whpx_apic_in_platform()) {
1675             cpu->exception_index = EXCP_HLT;
1676             qatomic_set(&cpu->exit_request, false);
1677             return 0;
1678         }
1679     }
1680 
1681     bql_unlock();
1682 
1683     if (exclusive_step_mode != WHPX_STEP_NONE) {
1684         start_exclusive();
1685         g_assert(cpu == current_cpu);
1686         g_assert(!cpu->running);
1687         cpu->running = true;
1688 
1689         hr = whpx_set_exception_exit_bitmap(
1690             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1691         if (!SUCCEEDED(hr)) {
1692             error_report("WHPX: Failed to update exception exit mask, "
1693                          "hr=%08lx.", hr);
1694             return 1;
1695         }
1696 
1697         if (stepped_over_bp) {
1698             /* Temporarily disable the triggered breakpoint. */
1699             cpu_memory_rw_debug(cpu,
1700                 stepped_over_bp->address,
1701                 &stepped_over_bp->original_instruction,
1702                 1,
1703                 true);
1704         }
1705     } else {
1706         cpu_exec_start(cpu);
1707     }
1708 
1709     do {
1710         if (cpu->vcpu_dirty) {
1711             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1712             cpu->vcpu_dirty = false;
1713         }
1714 
1715         if (exclusive_step_mode == WHPX_STEP_NONE) {
1716             whpx_vcpu_pre_run(cpu);
1717 
1718             if (qatomic_read(&cpu->exit_request)) {
1719                 whpx_vcpu_kick(cpu);
1720             }
1721         }
1722 
1723         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1724             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1725         }
1726 
1727         hr = whp_dispatch.WHvRunVirtualProcessor(
1728             whpx->partition, cpu->cpu_index,
1729             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1730 
1731         if (FAILED(hr)) {
1732             error_report("WHPX: Failed to exec a virtual processor,"
1733                          " hr=%08lx", hr);
1734             ret = -1;
1735             break;
1736         }
1737 
1738         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1739             whpx_vcpu_configure_single_stepping(cpu,
1740                 false,
1741                 &vcpu->exit_ctx.VpContext.Rflags);
1742         }
1743 
1744         whpx_vcpu_post_run(cpu);
1745 
1746         switch (vcpu->exit_ctx.ExitReason) {
1747         case WHvRunVpExitReasonMemoryAccess:
1748             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1749             break;
1750 
1751         case WHvRunVpExitReasonX64IoPortAccess:
1752             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1753             break;
1754 
1755         case WHvRunVpExitReasonX64InterruptWindow:
1756             vcpu->ready_for_pic_interrupt = 1;
1757             vcpu->window_registered = 0;
1758             ret = 0;
1759             break;
1760 
1761         case WHvRunVpExitReasonX64ApicEoi:
1762             assert(whpx_apic_in_platform());
1763             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1764             break;
1765 
1766         case WHvRunVpExitReasonX64Halt:
1767             /*
1768              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1769              * longer used.
1770              */
1771             ret = whpx_handle_halt(cpu);
1772             break;
1773 
1774         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1775             WHV_INTERRUPT_CONTROL ipi = {0};
1776             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1777             uint32_t delivery_mode =
1778                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1779             int dest_shorthand =
1780                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1781             bool broadcast = false;
1782             bool include_self = false;
1783             uint32_t i;
1784 
1785             /* We only registered for INIT and SIPI exits. */
1786             if ((delivery_mode != APIC_DM_INIT) &&
1787                 (delivery_mode != APIC_DM_SIPI)) {
1788                 error_report(
1789                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1790                 break;
1791             }
1792 
1793             if (delivery_mode == APIC_DM_INIT) {
1794                 ipi.Type = WHvX64InterruptTypeInit;
1795             } else {
1796                 ipi.Type = WHvX64InterruptTypeSipi;
1797             }
1798 
1799             ipi.DestinationMode =
1800                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1801                     WHvX64InterruptDestinationModeLogical :
1802                     WHvX64InterruptDestinationModePhysical;
1803 
1804             ipi.TriggerMode =
1805                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1806                     WHvX64InterruptTriggerModeLevel :
1807                     WHvX64InterruptTriggerModeEdge;
1808 
1809             ipi.Vector = icr & APIC_VECTOR_MASK;
1810             switch (dest_shorthand) {
1811             /* no shorthand. Bits 56-63 contain the destination. */
1812             case 0:
1813                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1814                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1815                         &ipi, sizeof(ipi));
1816                 if (FAILED(hr)) {
1817                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1818                         hr);
1819                 }
1820 
1821                 break;
1822 
1823             /* self */
1824             case 1:
1825                 include_self = true;
1826                 break;
1827 
1828             /* broadcast, including self */
1829             case 2:
1830                 broadcast = true;
1831                 include_self = true;
1832                 break;
1833 
1834             /* broadcast, excluding self */
1835             case 3:
1836                 broadcast = true;
1837                 break;
1838             }
1839 
1840             if (!broadcast && !include_self) {
1841                 break;
1842             }
1843 
1844             for (i = 0; i <= max_vcpu_index; i++) {
1845                 if (i == cpu->cpu_index && !include_self) {
1846                     continue;
1847                 }
1848 
1849                 /*
1850                  * Assuming that APIC Ids are identity mapped since
1851                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1852                  * are not handled yet and the hypervisor doesn't allow the
1853                  * guest to modify the APIC ID.
1854                  */
1855                 ipi.Destination = i;
1856                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1857                         &ipi, sizeof(ipi));
1858                 if (FAILED(hr)) {
1859                     error_report(
1860                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1861                         i, hr);
1862                 }
1863             }
1864 
1865             break;
1866         }
1867 
1868         case WHvRunVpExitReasonCanceled:
1869             if (exclusive_step_mode != WHPX_STEP_NONE) {
1870                 /*
1871                  * We are trying to step over a single instruction, and
1872                  * likely got a request to stop from another thread.
1873                  * Delay it until we are done stepping
1874                  * over.
1875                  */
1876                 ret = 0;
1877             } else {
1878                 cpu->exception_index = EXCP_INTERRUPT;
1879                 ret = 1;
1880             }
1881             break;
1882         case WHvRunVpExitReasonX64MsrAccess: {
1883             WHV_REGISTER_VALUE reg_values[3] = {0};
1884             WHV_REGISTER_NAME reg_names[3];
1885             UINT32 reg_count;
1886 
1887             reg_names[0] = WHvX64RegisterRip;
1888             reg_names[1] = WHvX64RegisterRax;
1889             reg_names[2] = WHvX64RegisterRdx;
1890 
1891             reg_values[0].Reg64 =
1892                 vcpu->exit_ctx.VpContext.Rip +
1893                 vcpu->exit_ctx.VpContext.InstructionLength;
1894 
1895             /*
1896              * For all unsupported MSR access we:
1897              *     ignore writes
1898              *     return 0 on read.
1899              */
1900             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1901                         1 : 3;
1902 
1903             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1904                 whpx->partition,
1905                 cpu->cpu_index,
1906                 reg_names, reg_count,
1907                 reg_values);
1908 
1909             if (FAILED(hr)) {
1910                 error_report("WHPX: Failed to set MsrAccess state "
1911                              " registers, hr=%08lx", hr);
1912             }
1913             ret = 0;
1914             break;
1915         }
1916         case WHvRunVpExitReasonX64Cpuid: {
1917             WHV_REGISTER_VALUE reg_values[5];
1918             WHV_REGISTER_NAME reg_names[5];
1919             UINT32 reg_count = 5;
1920             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1921             X86CPU *x86_cpu = X86_CPU(cpu);
1922             CPUX86State *env = &x86_cpu->env;
1923 
1924             memset(reg_values, 0, sizeof(reg_values));
1925 
1926             rip = vcpu->exit_ctx.VpContext.Rip +
1927                   vcpu->exit_ctx.VpContext.InstructionLength;
1928             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1929 
1930             /*
1931              * Ideally, these should be supplied to the hypervisor during VCPU
1932              * initialization and it should be able to satisfy this request.
1933              * But, currently, WHPX doesn't support setting CPUID values in the
1934              * hypervisor once the partition has been setup, which is too late
1935              * since VCPUs are realized later. For now, use the values from
1936              * QEMU to satisfy these requests, until WHPX adds support for
1937              * being able to set these values in the hypervisor at runtime.
1938              */
1939             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1940                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1941             switch (cpuid_fn) {
1942             case 0x40000000:
1943                 /* Expose the vmware cpu frequency cpuid leaf */
1944                 rax = 0x40000010;
1945                 rbx = rcx = rdx = 0;
1946                 break;
1947 
1948             case 0x40000010:
1949                 rax = env->tsc_khz;
1950                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1951                 rcx = rdx = 0;
1952                 break;
1953 
1954             case 0x80000001:
1955                 /* Remove any support of OSVW */
1956                 rcx &= ~CPUID_EXT3_OSVW;
1957                 break;
1958             }
1959 
1960             reg_names[0] = WHvX64RegisterRip;
1961             reg_names[1] = WHvX64RegisterRax;
1962             reg_names[2] = WHvX64RegisterRcx;
1963             reg_names[3] = WHvX64RegisterRdx;
1964             reg_names[4] = WHvX64RegisterRbx;
1965 
1966             reg_values[0].Reg64 = rip;
1967             reg_values[1].Reg64 = rax;
1968             reg_values[2].Reg64 = rcx;
1969             reg_values[3].Reg64 = rdx;
1970             reg_values[4].Reg64 = rbx;
1971 
1972             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1973                 whpx->partition, cpu->cpu_index,
1974                 reg_names,
1975                 reg_count,
1976                 reg_values);
1977 
1978             if (FAILED(hr)) {
1979                 error_report("WHPX: Failed to set CpuidAccess state registers,"
1980                              " hr=%08lx", hr);
1981             }
1982             ret = 0;
1983             break;
1984         }
1985         case WHvRunVpExitReasonException:
1986             whpx_get_registers(cpu);
1987 
1988             if ((vcpu->exit_ctx.VpException.ExceptionType ==
1989                  WHvX64ExceptionTypeDebugTrapOrFault) &&
1990                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
1991                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
1992                  whpx_breakpoint_instruction)) {
1993                 /* Stopped at a software breakpoint. */
1994                 cpu->exception_index = EXCP_DEBUG;
1995             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
1996                         WHvX64ExceptionTypeDebugTrapOrFault) &&
1997                        !cpu->singlestep_enabled) {
1998                 /*
1999                  * Just finished stepping over a breakpoint, but the
2000                  * gdb does not expect us to do single-stepping.
2001                  * Don't do anything special.
2002                  */
2003                 cpu->exception_index = EXCP_INTERRUPT;
2004             } else {
2005                 /* Another exception or debug event. Report it to GDB. */
2006                 cpu->exception_index = EXCP_DEBUG;
2007             }
2008 
2009             ret = 1;
2010             break;
2011         case WHvRunVpExitReasonNone:
2012         case WHvRunVpExitReasonUnrecoverableException:
2013         case WHvRunVpExitReasonInvalidVpRegisterValue:
2014         case WHvRunVpExitReasonUnsupportedFeature:
2015         default:
2016             error_report("WHPX: Unexpected VP exit code %d",
2017                          vcpu->exit_ctx.ExitReason);
2018             whpx_get_registers(cpu);
2019             bql_lock();
2020             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2021             bql_unlock();
2022             break;
2023         }
2024 
2025     } while (!ret);
2026 
2027     if (stepped_over_bp) {
2028         /* Restore the breakpoint we stepped over */
2029         cpu_memory_rw_debug(cpu,
2030             stepped_over_bp->address,
2031             (void *)&whpx_breakpoint_instruction,
2032             1,
2033             true);
2034     }
2035 
2036     if (exclusive_step_mode != WHPX_STEP_NONE) {
2037         g_assert(cpu_in_exclusive_context(cpu));
2038         cpu->running = false;
2039         end_exclusive();
2040 
2041         exclusive_step_mode = WHPX_STEP_NONE;
2042     } else {
2043         cpu_exec_end(cpu);
2044     }
2045 
2046     bql_lock();
2047     current_cpu = cpu;
2048 
2049     if (--whpx->running_cpus == 0) {
2050         whpx_last_vcpu_stopping(cpu);
2051     }
2052 
2053     qatomic_set(&cpu->exit_request, false);
2054 
2055     return ret < 0;
2056 }
2057 
/*
 * Copy the vCPU register state from the hypervisor into QEMU's CPUState,
 * unless QEMU's copy is already the authoritative one (vcpu_dirty set).
 * Executed on the vCPU thread via run_on_cpu().
 */
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (cpu->vcpu_dirty) {
        return;
    }

    whpx_get_registers(cpu);
    cpu->vcpu_dirty = true;
}
2065 
/*
 * Push QEMU's reset-relevant register state into the hypervisor after a
 * system reset; the hypervisor copy becomes authoritative again.
 * Executed on the vCPU thread via run_on_cpu().
 */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->vcpu_dirty = false;
}
2072 
/*
 * Push the complete QEMU register state into the hypervisor after machine
 * init (or loadvm); the hypervisor copy becomes authoritative again.
 * Executed on the vCPU thread via run_on_cpu().
 */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->vcpu_dirty = false;
}
2079 
/*
 * Before loadvm, mark QEMU's copy of the register state as authoritative so
 * the incoming snapshot is not clobbered by stale hypervisor state.
 * Executed on the vCPU thread via run_on_cpu().
 */
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
2085 
2086 /*
2087  * CPU support.
2088  */
2089 
/* Ensure QEMU's CPUState mirrors the hypervisor's register state. */
void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (cpu->vcpu_dirty) {
        /* QEMU's copy is already current. */
        return;
    }

    run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
2096 
/* Dispatch the post-reset register sync onto the vCPU thread. */
void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
2101 
/* Dispatch the post-init full-state register sync onto the vCPU thread. */
void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
2106 
/* Dispatch the pre-loadvm dirty-marking onto the vCPU thread. */
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
2111 
/*
 * Record whether the debugger expects single-stepping before the VM is
 * resumed; consulted by the vCPU run loop.
 */
static void whpx_pre_resume_vm(AccelState *as, bool step_pending)
{
    struct whpx_state *whpx = &whpx_global;

    whpx->step_pending = step_pending;
}
2116 
2117 /*
2118  * Vcpu support.
2119  */
2120 
2121 static Error *whpx_migration_blocker;
2122 
/*
 * VM run-state change handler: when the guest (re)starts running, any
 * cached TSC value in the CPU state is stale and must be invalidated.
 */
static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (!running) {
        return;
    }

    env->tsc_valid = false;
}
2131 
/*
 * Create and initialize the WHPX backing state for one vCPU: the
 * instruction emulator, the hypervisor virtual processor, and the TSC /
 * APIC bus frequency bookkeeping.
 *
 * Returns 0 on success, a negative errno on failure. On failure all
 * partially-created resources (emulator, virtual processor) are torn down
 * again; previously the CpuidExitList failure path leaked both.
 */
int whpx_init_vcpu(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = NULL;
    Error *local_error = NULL;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    UINT64 freq = 0;
    int ret;

    /* Add migration blockers for all unsupported features of the
     * Windows Hypervisor Platform
     */
    if (whpx_migration_blocker == NULL) {
        error_setg(&whpx_migration_blocker,
               "State blocked due to non-migratable CPUID feature support,"
               "dirty memory tracking support, and XSAVE/XRSTOR support");

        if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu = g_new0(AccelCPUState, 1);

    hr = whp_dispatch.WHvEmulatorCreateEmulator(
        &whpx_emu_callbacks,
        &vcpu->emulator);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup instruction completion support,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreateVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create a virtual processor,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error_emu;
    }

    /*
     * vcpu's TSC frequency is either specified by user, or use the value
     * provided by Hyper-V if the former is not present. In the latter case, we
     * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
     * frequency can be migrated later via this field.
     */
    if (!env->tsc_khz) {
        hr = whp_dispatch.WHvGetCapability(
            WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
                NULL);
        /* Older Windows versions report this capability as unknown. */
        if (hr != WHV_E_UNKNOWN_CAPABILITY) {
            if (FAILED(hr)) {
                printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
            } else {
                env->tsc_khz = freq / 1000; /* Hz to KHz */
            }
        }
    }

    env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
    if (hr != WHV_E_UNKNOWN_CAPABILITY) {
        if (FAILED(hr)) {
            printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
        } else {
            env->apic_bus_freq = freq;
        }
    }

    /*
     * If the vmware cpuid frequency leaf option is set, and we have a valid
     * tsc value, trap the corresponding cpuid's.
     */
    if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
        UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};

        hr = whp_dispatch.WHvSetPartitionProperty(
                whpx->partition,
                WHvPartitionPropertyCodeCpuidExitList,
                cpuidExitList,
                RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                        hr);
            ret = -EINVAL;
            /* Tear down the virtual processor and emulator created above. */
            goto error_vp;
        }
    }

    vcpu->interruptable = true;
    cpu->vcpu_dirty = true;
    cpu->accel = vcpu;
    max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
    qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);

    return 0;

    /* Staged cleanup: later labels undo earlier acquisitions. */
error_vp:
    whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
error_emu:
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
error:
    g_free(vcpu);

    return ret;
}
2244 
/*
 * Outer vCPU execution loop: keep running the virtual processor until an
 * exception/interrupt is pending for the caller to handle. A fatal error
 * from whpx_vcpu_run() aborts the process.
 *
 * Returns the pending cpu->exception_index value (and clears it).
 */
int whpx_vcpu_exec(CPUState *cpu)
{
    int ret;

    while (cpu->exception_index < EXCP_INTERRUPT) {
        if (whpx_vcpu_run(cpu)) {
            error_report("WHPX: Failed to exec a virtual processor");
            abort();
        }
    }

    /* Hand the pending event to the caller and reset the index. */
    ret = cpu->exception_index;
    cpu->exception_index = -1;

    return ret;
}
2267 
/*
 * Tear down the WHPX backing state of a vCPU: delete the hypervisor
 * virtual processor, destroy the instruction emulator, and release the
 * AccelCPUState. Clears cpu->accel so a stale pointer cannot be
 * dereferenced after the free.
 */
void whpx_destroy_vcpu(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;

    whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
    g_free(vcpu);
    cpu->accel = NULL; /* avoid a dangling pointer to freed state */
}
2277 
/*
 * Kick a vCPU out of WHvRunVirtualProcessor() so it returns to QEMU
 * (exit reason WHvRunVpExitReasonCanceled).
 */
void whpx_vcpu_kick(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;

    whp_dispatch.WHvCancelRunVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
}
2284 
2285 /*
2286  * Memory support.
2287  */
2288 
/*
 * Map (add != 0) or unmap a guest-physical range in the WHPX partition.
 * ROM ranges are mapped without write permission. Failures are reported
 * but not fatal. 'name' is used only for diagnostics.
 */
static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    if (add) {
        hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
                                         host_va,
                                         start_pa,
                                         size,
                                         (WHvMapGpaRangeFlagRead |
                                          WHvMapGpaRangeFlagExecute |
                                          (rom ? 0 : WHvMapGpaRangeFlagWrite)));
    } else {
        hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
                                           start_pa,
                                           size);
    }

    if (FAILED(hr)) {
        /* Cast through uintptr_t: ram_addr_t may be wider than a pointer. */
        error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                     " Host:%p, hr=%08lx",
                     (add ? "MAP" : "UNMAP"), name,
                     (void *)(uintptr_t)start_pa, (void *)(uintptr_t)size,
                     host_va, hr);
    }
}
2328 
/*
 * Propagate a memory region section into (add != 0) or out of the WHPX
 * partition's GPA mappings. The range is trimmed to host-page alignment:
 * the unaligned head is skipped and the length rounded down to whole
 * pages; ranges that shrink to nothing are ignored.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int head;
    uint64_t host_va;

    /* Only RAM-backed regions can be mapped into the partition. */
    if (!memory_region_is_ram(mr)) {
        return;
    }

    /* Bytes needed to round start_pa up to the next host page boundary. */
    head = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
    head &= ~qemu_real_host_page_mask();
    if (head > size) {
        return;
    }

    start_pa += head;
    size -= head;
    size &= qemu_real_host_page_mask();
    if (!size || (start_pa & ~qemu_real_host_page_mask())) {
        return;
    }

    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + head;

    whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}
2359 
/* MemoryListener hook: a region was added; reference it and map it. */
static void whpx_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}
2366 
/* MemoryListener hook: a region was removed; unmap it, then drop the ref. */
static void whpx_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}
2373 
/* MemoryListener hook: no transaction batching; mappings apply immediately. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}
2377 
/* MemoryListener hook: nothing to flush; see whpx_transaction_begin(). */
static void whpx_transaction_commit(MemoryListener *listener)
{
}
2381 
/*
 * MemoryListener log_sync hook. WHPX offers no dirty-page tracking here
 * (see the migration blocker in whpx_init_vcpu), so conservatively mark
 * the entire RAM section dirty.
 */
static void whpx_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    memory_region_set_dirty(mr, 0, int128_get64(section->size));
}
2393 
/* Listener that mirrors QEMU's memory map into the WHPX partition. */
static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
};
2403 
whpx_memory_init(void)2404 static void whpx_memory_init(void)
2405 {
2406     memory_listener_register(&whpx_memory_listener, &address_space_memory);
2407 }
2408 
2409 /*
2410  * Load the functions from the given library, using the given handle. If a
2411  * handle is provided, it is used, otherwise the library is opened. The
2412  * handle will be updated on return with the opened one.
2413  */
/*
 * Resolve one group of WinHv entry points into the whp_dispatch table.
 * If *handle is NULL the corresponding DLL is loaded first and the opened
 * handle is written back through *handle on success.
 *
 * Returns true on success; false if the DLL or any mandatory function is
 * missing (optional supplemental functions may resolve to NULL).
 *
 * NOTE(review): on failure the library handle is freed but *handle is not
 * cleared, so a caller that passed in a previously-opened handle keeps a
 * stale value — confirm callers treat failure as fatal.
 */
static bool load_whp_dispatch_fns(HMODULE *handle,
    WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

    #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
    #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
    #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \

    #define WHP_LOAD_FIELD(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \
        if (!whp_dispatch.function_name) { \
            error_report("Could not load function %s", #function_name); \
            goto error; \
        } \

    #define WHP_LOAD_LIB(lib_name, handle_lib) \
    if (!handle_lib) { \
        handle_lib = LoadLibrary(lib_name); \
        if (!handle_lib) { \
            error_report("Could not load library %s.", lib_name); \
            goto error; \
        } \
    } \

    /* Expand the per-group function list with the appropriate loader. */
    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    /* Publish the (possibly newly opened) handle to the caller. */
    *handle = hLib;
    return true;

error:
    if (hLib) {
        FreeLibrary(hLib);
    }

    return false;
}
2469 
/*
 * QOM property setter for "kernel-irqchip". Accepts on/off; "split" is
 * rejected because WHPX does not support a split irqchip.
 */
static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    struct whpx_state *whpx = &whpx_global;
    OnOffSplit mode;

    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
        return;
    }

    if (mode == ON_OFF_SPLIT_ON) {
        whpx->kernel_irqchip_allowed = true;
        whpx->kernel_irqchip_required = true;
    } else if (mode == ON_OFF_SPLIT_OFF) {
        whpx->kernel_irqchip_allowed = false;
        whpx->kernel_irqchip_required = false;
    } else if (mode == ON_OFF_SPLIT_SPLIT) {
        error_setg(errp, "WHPX: split irqchip currently not supported");
        error_append_hint(errp,
            "Try without kernel-irqchip or with kernel-irqchip=on|off");
    } else {
        /*
         * The value was checked in visit_type_OnOffSplit() above. If
         * we get here, then something is wrong in QEMU.
         */
        abort();
    }
}
2506 
/* Per-CPU init hook: base the vCPU's CPUID model on the host CPU. */
static void whpx_cpu_instance_init(CPUState *cs)
{
    host_cpu_instance_init(X86_CPU(cs));
}
2513 
/* Class init for the WHPX accel-CPU type: install the instance-init hook. */
static void whpx_cpu_accel_class_init(ObjectClass *oc, const void *data)
{
    ACCEL_CPU_CLASS(oc)->cpu_instance_init = whpx_cpu_instance_init;
}
2520 
/* QOM type registration for the WHPX accel-CPU class (abstract base). */
static const TypeInfo whpx_cpu_accel_type = {
    .name = ACCEL_CPU_NAME("whpx"),

    .parent = TYPE_ACCEL_CPU,
    .class_init = whpx_cpu_accel_class_init,
    .abstract = true,
};
2528 
2529 /*
2530  * Partition support
2531  */
2532 
/*
 * AccelClass::init_machine hook: create and configure the global WHPX
 * partition (whpx_global) that hosts this VM.
 *
 * Ordering matters: every partition property (processor count, APIC
 * emulation mode, extended VM exits, CPUID exit list) is set before
 * WHvSetupPartition() commits the configuration.
 *
 * Returns 0 on success or a negative errno value on failure; on failure
 * any partially created partition is deleted before returning.
 */
static int whpx_accel_init(AccelState *as, MachineState *ms)
{
    struct whpx_state *whpx;
    int ret;
    HRESULT hr;
    WHV_CAPABILITY whpx_cap;
    UINT32 whpx_cap_size;
    WHV_PARTITION_PROPERTY prop;
    /* CPUID leaves that should exit to QEMU (see CpuidExitList below). */
    UINT32 cpuidExitList[] = {1, 0x80000001};
    WHV_CAPABILITY_FEATURES features = {0};

    whpx = &whpx_global;

    /* Resolve the WinHvPlatform/WinHvEmulation entry points at runtime. */
    if (!init_whp_dispatch()) {
        ret = -ENOSYS;
        goto error;
    }

    whpx->mem_quota = ms->ram_size;

    /* Bail out early if the Windows Hypervisor is not actually running. */
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeHypervisorPresent, &whpx_cap,
        sizeof(whpx_cap), &whpx_cap_size);
    if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
        error_report("WHPX: No accelerator found, hr=%08lx", hr);
        ret = -ENOSPC;
        goto error;
    }

    /* Feature flags are consulted below for kernel-irqchip support. */
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Query the XSAVE capability of the partition. Any error here is not
     * considered fatal.
     */
    hr = whp_dispatch.WHvGetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorXsaveFeatures,
        &whpx_xsave_cap,
        sizeof(whpx_xsave_cap),
        &whpx_cap_size);

    /*
     * Windows versions which don't support this property return the
     * specific WHV_E_UNKNOWN_PROPERTY error code; that case is expected
     * and not reported.
     */
    if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
        error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
    }

    if (!whpx_has_xsave()) {
        printf("WHPX: Partition is not XSAVE capable\n");
    }

    /* Fix the vCPU count before the partition is set up. */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ProcessorCount = ms->smp.cpus;
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorCount,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition processor count to %u,"
                     " hr=%08lx", prop.ProcessorCount, hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Error out if WHP doesn't support apic emulation and user is requiring
     * it.
     */
    if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
            !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
        error_report("WHPX: kernel irqchip requested, but unavailable. "
            "Try without kernel-irqchip or with kernel-irqchip=off");
        ret = -EINVAL;
        goto error;
    }

    /*
     * Enable in-hypervisor APIC emulation when the platform supports it
     * and the user allows it (kernel_irqchip_allowed defaults to true,
     * see whpx_accel_instance_init()).
     */
    if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
        whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
        WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
            WHvX64LocalApicEmulationModeXApic;
        printf("WHPX: setting APIC emulation mode in the hypervisor\n");
        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeLocalApicEmulationMode,
            &mode,
            sizeof(mode));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
            if (whpx->kernel_irqchip_required) {
                error_report("WHPX: kernel irqchip requested, but unavailable");
                ret = -EINVAL;
                goto error;
            }
            /* irqchip was only allowed, not required: continue without it */
        } else {
            whpx->apic_in_platform = true;
        }
    }

    /* Register for MSR and CPUID exits */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ExtendedVmExits.X64MsrExit = 1;
    prop.ExtendedVmExits.X64CpuidExit = 1;
    prop.ExtendedVmExits.ExceptionExit = 1;
    if (whpx_apic_in_platform()) {
        /* INIT/SIPI must trap to QEMU when the APIC is in the hypervisor */
        prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeExtendedVmExits,
            &prop,
            sizeof(WHV_PARTITION_PROPERTY));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /* Limit CPUID exits to the leaves listed in cpuidExitList. */
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeCpuidExitList,
        cpuidExitList,
        RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                     hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * We do not want to intercept any exceptions from the guest,
     * until we actually start debugging with gdb.
     * NOTE(review): the bitmap is primed to -1 before clearing it —
     * presumably so whpx_set_exception_exit_bitmap(0) sees a change;
     * confirm against that helper's implementation.
     */
    whpx->exception_exit_bitmap = -1;
    hr = whpx_set_exception_exit_bitmap(0);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /* Commit the configuration above by setting up the partition. */
    hr = whp_dispatch.WHvSetupPartition(whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    whpx_memory_init();

    printf("Windows Hypervisor Platform accelerator is operational\n");
    return 0;

error:

    /* Tear down the partition if it was created before the failure. */
    if (NULL != whpx->partition) {
        whp_dispatch.WHvDeletePartition(whpx->partition);
        whpx->partition = NULL;
    }

    return ret;
}
2716 
whpx_apic_in_platform(void)2717 bool whpx_apic_in_platform(void) {
2718     return whpx_global.apic_in_platform;
2719 }
2720 
/*
 * Class-init for the "whpx" accelerator: install the AccelClass hooks
 * and expose the "kernel-irqchip" accelerator property.
 */
static void whpx_accel_class_init(ObjectClass *oc, const void *data)
{
    AccelClass *acc = ACCEL_CLASS(oc);

    acc->name = "WHPX";
    acc->init_machine = whpx_accel_init;
    acc->pre_resume_vm = whpx_pre_resume_vm;
    acc->allowed = &whpx_allowed;

    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
                              NULL, whpx_set_kernel_irqchip,
                              NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure WHPX in-kernel irqchip");
}
2735 
/*
 * Instance-init for the accelerator object: reset the global WHPX state
 * to all-zeros and enable the in-kernel irqchip by default (overridable
 * via the "kernel-irqchip" property).
 */
static void whpx_accel_instance_init(Object *obj)
{
    struct whpx_state *state = &whpx_global;

    *state = (struct whpx_state){0};
    /* Turn on kernel-irqchip, by default */
    state->kernel_irqchip_allowed = true;
}
2744 
2745 static const TypeInfo whpx_accel_type = {
2746     .name = ACCEL_CLASS_NAME("whpx"),
2747     .parent = TYPE_ACCEL,
2748     .instance_init = whpx_accel_instance_init,
2749     .class_init = whpx_accel_class_init,
2750 };
2751 
whpx_type_init(void)2752 static void whpx_type_init(void)
2753 {
2754     type_register_static(&whpx_accel_type);
2755     type_register_static(&whpx_cpu_accel_type);
2756 }
2757 
init_whp_dispatch(void)2758 bool init_whp_dispatch(void)
2759 {
2760     if (whp_dispatch_initialized) {
2761         return true;
2762     }
2763 
2764     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2765         goto error;
2766     }
2767 
2768     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2769         goto error;
2770     }
2771 
2772     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2773         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2774     whp_dispatch_initialized = true;
2775 
2776     return true;
2777 error:
2778     if (hWinHvPlatform) {
2779         FreeLibrary(hWinHvPlatform);
2780     }
2781 
2782     if (hWinHvEmulation) {
2783         FreeLibrary(hWinHvEmulation);
2784     }
2785 
2786     return false;
2787 }
2788 
/* Register the WHPX QOM types during QEMU startup. */
type_init(whpx_type_init);
2790