xref: /openbmc/qemu/target/i386/whpx/whpx-all.c (revision 083367db)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "exec/address-spaces.h"
14 #include "exec/ioport.h"
15 #include "gdbstub/helpers.h"
16 #include "qemu/accel.h"
17 #include "sysemu/whpx.h"
18 #include "sysemu/cpus.h"
19 #include "sysemu/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/intc/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include <winerror.h>
30 
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
33 
34 #include <winhvplatform.h>
35 #include <winhvemulation.h>
36 
/*
 * 200000000, presumably expressed in Hz (i.e. 200 MHz) -- TODO confirm the
 * unit at the point of use, which is not visible in this part of the file.
 */
#define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
38 
/*
 * Ordered list of the registers that are transferred in bulk between QEMU and
 * the hypervisor.
 *
 * whpx_set_registers() and whpx_get_registers() walk this table with a running
 * index and assert on the entry they expect at each step, and
 * struct whpx_register_set sizes its value array from it. Any entry added,
 * removed or reordered here therefore has to be mirrored in both functions.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};
146 
/*
 * Value buffer for a bulk register transfer: values[i] holds the value of the
 * register named by whpx_register_names[i].
 */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
150 
151 /*
152  * The current implementation of instruction stepping sets the TF flag
153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
155  *
156  * This approach has a few limitations:
157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158  *        along with the other flags, possibly restoring it later. It would
159  *        result in another INT1 when the flags are restored, triggering
160  *        a stop in gdb that could be cleared by doing another step.
161  *
 *        Stepping over a POPF/LAHF instruction will let it overwrite the
 *        TF flag, ending the stepping mode.
164  *
165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166  *        or anything that could result in a page fault) will save the flags
167  *        to the stack, clear the TF flag, and let the guest execute the
168  *        handler. Normally, the guest will restore the original flags,
169  *        that will continue single-stepping.
170  *
171  *     3. Debuggers running on the guest may wish to set TF to do instruction
172  *        stepping. INT1 events generated by it would be intercepted by us,
173  *        as long as the gdb is connected to QEMU.
174  *
175  * In practice this means that:
176  *     1. Stepping through flags-modifying instructions may cause gdb to
177  *        continue or stop in unexpected places. This will be fully recoverable
178  *        and will not crash the target.
179  *
180  *     2. Stepping over an instruction that triggers an exception will step
181  *        over the exception handler, not into it.
182  *
183  *     3. Debugging the guest via gdb, while running debugger on the guest
184  *        at the same time may lead to unexpected effects. Removing all
185  *        breakpoints set via QEMU will prevent any further interference
186  *        with the guest-level debuggers.
187  *
188  * The limitations can be addressed as shown below:
189  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
190  *        stepping through them. The exact semantics of the instructions is
191  *        defined in the "Combined Volume Set of Intel 64 and IA-32
192  *        Architectures Software Developer's Manuals", however it involves a
193  *        fair amount of corner cases due to compatibility with real mode,
194  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
195  *
196  *     2. We could step into the guest's exception handlers using the following
197  *        sequence:
198  *          a. Temporarily enable catching of all exception types via
199  *             whpx_set_exception_exit_bitmap().
200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
201  *             the original handler.
202  *          c. Patch the original handler, injecting an INT3 at the beginning.
203  *          d. Update the exception exit bitmap to only catch the
204  *             WHvX64ExceptionTypeBreakpointTrap exception.
205  *          e. Let the affected CPU run in the exclusive mode.
206  *          f. Restore the original handler and the exception exit bitmap.
207  *        Note that handling all corner cases related to IDT/GDT is harder
208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
209  *        rough idea.
210  *
211  *     3. In order to properly support guest-level debugging in parallel with
212  *        the QEMU-level debugging, we would need to be able to pass some INT1
213  *        events to the guest. This could be done via the following methods:
214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215  *             it seems to only work for interrupts and not software
216  *             exceptions.
217  *          b. Locating and patching the original handler by parsing IDT/GDT.
218  *             This involves relatively complex logic outlined in the previous
219  *             paragraph.
220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
221  *             RFLAGS, and pushing the old values to stack). This is even more
222  *             complicated than the previous option, since it involves checking
223  *             CPL, gate attributes, and doing various adjustments depending
224  *             on the current CPU mode, whether the CPL is changing, etc.
225  */
/*
 * Instruction-stepping modes. The TF-based stepping approach and its
 * limitations are described in the comment preceding this definition.
 */
typedef enum WhpxStepMode {
    /* Zero value; presumably "not stepping" -- usage is not visible here */
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
231 
/* Per-vCPU accelerator state, reached through cpu->accel. */
struct AccelCPUState {
    /* Instruction emulator handle passed to the WHvEmulatorTry*Emulation calls */
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    /* Task priority in CR8 encoding, as last synchronised with the APIC TPR */
    uint64_t tpr;
    /* APIC base value as last synchronised with the virtual processor */
    uint64_t apic_base;
    bool interruption_pending;
    /*
     * Cleared by whpx_emu_setreg_callback() after the emulator writes
     * registers, to avoid a second write when the VP resumes.
     */
    bool dirty;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
245 
static bool whpx_allowed;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
/* XSAVE capability bits; whpx_has_xsave() reports the XsaveSupport member. */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

/*
 * Global accelerator state. Its partition handle is what the register and
 * address-translation calls in this file operate on.
 */
struct whpx_state whpx_global;
/* Table through which the WHv* platform and emulator entry points are called. */
struct WHPDispatch whp_dispatch;
254 
whpx_has_xsave(void)255 static bool whpx_has_xsave(void)
256 {
257     return whpx_xsave_cap.XsaveSupport;
258 }
259 
whpx_seg_q2h(const SegmentCache * qs,int v86,int r86)260 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
261                                              int r86)
262 {
263     WHV_X64_SEGMENT_REGISTER hs;
264     unsigned flags = qs->flags;
265 
266     hs.Base = qs->base;
267     hs.Limit = qs->limit;
268     hs.Selector = qs->selector;
269 
270     if (v86) {
271         hs.Attributes = 0;
272         hs.SegmentType = 3;
273         hs.Present = 1;
274         hs.DescriptorPrivilegeLevel = 3;
275         hs.NonSystemSegment = 1;
276 
277     } else {
278         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
279 
280         if (r86) {
281             /* hs.Base &= 0xfffff; */
282         }
283     }
284 
285     return hs;
286 }
287 
whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER * hs)288 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
289 {
290     SegmentCache qs;
291 
292     qs.base = hs->Base;
293     qs.limit = hs->Limit;
294     qs.selector = hs->Selector;
295 
296     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
297 
298     return qs;
299 }
300 
301 /* X64 Extended Control Registers */
whpx_set_xcrs(CPUState * cpu)302 static void whpx_set_xcrs(CPUState *cpu)
303 {
304     HRESULT hr;
305     struct whpx_state *whpx = &whpx_global;
306     WHV_REGISTER_VALUE xcr0;
307     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
308 
309     if (!whpx_has_xsave()) {
310         return;
311     }
312 
313     /* Only xcr0 is supported by the hypervisor currently */
314     xcr0.Reg64 = cpu_env(cpu)->xcr0;
315     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
316         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
317     if (FAILED(hr)) {
318         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
319     }
320 }
321 
/*
 * Write the TSC value held in QEMU's CPU state to the virtual processor.
 *
 * Returns 0 on success and -1 if the register write fails. A failure to
 * suspend partition time beforehand only produces a warning.
 */
static int whpx_set_tsc(CPUState *cpu)
{
    struct whpx_state *state = &whpx_global;
    WHV_REGISTER_NAME reg_name = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE reg_value;
    HRESULT hr;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     *
     * The entry point is optional, so it is only called when present.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {
        hr = whp_dispatch.WHvSuspendPartitionTime(state->partition);

        /*
         * Unable to suspend partition while setting TSC is not a fatal
         * error. It just increases the likelihood of TSC variance between
         * vCPUs and some guest OS are able to handle that just fine.
         */
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    reg_value.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        state->partition, cpu->cpu_index, &reg_name, 1, &reg_value);
    if (!FAILED(hr)) {
        return 0;
    }

    error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
    return -1;
}
357 
358 /*
359  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
360  * however, they use a slightly different encoding. Specifically:
361  *
362  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
363  *
364  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
365  * and IA-32 Architectures Software Developer's Manual.
366  *
367  * The functions below translate the value of CR8 to TPR and vice versa.
368  */
369 
/* Translate an APIC TPR value to its CR8 encoding by dropping the low 4 bits. */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    const unsigned priority_class_shift = 4;
    const uint64_t cr8 = tpr >> priority_class_shift;

    return cr8;
}
374 
/* Translate a CR8 value to its APIC TPR encoding by shifting it up 4 bits. */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    const unsigned priority_class_shift = 4;
    const uint64_t tpr = cr8 << priority_class_shift;

    return tpr;
}
379 
/*
 * Copy QEMU's view of the CPU state into the WHPX virtual processor.
 *
 * @cpu:   the vCPU to update; it must be stopped or be the calling thread's
 *         own CPU.
 * @level: state-update level; the TSC is only written when it is at least
 *         WHPX_SET_RESET_STATE.
 *
 * The values are staged in a struct whpx_register_set whose slots parallel
 * whpx_register_names[], then written with a single
 * WHvSetVirtualProcessorRegisters() call. The TSC and XCR0 go through their
 * own calls. A failure of the bulk write is reported via error_report() and
 * is not propagated to the caller.
 *
 * Every step asserts that the table entry at the running index is the
 * register about to be filled in, so the table and this function cannot
 * silently drift apart.
 */
static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    /* Refresh the cached TPR (in CR8 encoding) and APIC base from the APIC */
    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    /*
     * Indexes for first 16 registers match between HV and QEMU definitions.
     * Only CPU_NB_REGS of them are filled in; the running index then skips
     * to slot 16 regardless.
     */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches  */
    assert(whpx_register_names[idx] == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(whpx_register_names[idx] == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(whpx_register_names[idx] == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(whpx_register_names[idx] == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(whpx_register_names[idx] == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /*
     * 16 XMM registers
     *
     * NOTE(review): the loop is bounded by the element count of
     * env->xmm_regs, while exactly 16 slots are reserved via idx_next. If
     * the array holds more than 16 elements, the extra iterations store into
     * the slots that follow, which are filled in further down -- confirm
     * that this is intended before changing the bound.
     */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers; only the low 64 bits are taken from env->fpregs */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    /* Bits 13:11 of the status word are replaced by env->fpstt */
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    /* One tag bit per register, set when env->fptags[i] is zero */
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* Every table entry must have been visited exactly once */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}
555 
/*
 * Read the virtual processor's TSC into QEMU's CPU state.
 *
 * Returns 0 on success. On failure the error is reported, the stored TSC is
 * left untouched and -1 is returned.
 */
static int whpx_get_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME reg_name = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE reg_value;
    HRESULT hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx_global.partition, cpu->cpu_index, &reg_name, 1, &reg_value);

    if (!FAILED(hr)) {
        cpu_env(cpu)->tsc = reg_value.Reg64;
        return 0;
    }

    error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
    return -1;
}
573 
574 /* X64 Extended Control Registers */
whpx_get_xcrs(CPUState * cpu)575 static void whpx_get_xcrs(CPUState *cpu)
576 {
577     HRESULT hr;
578     struct whpx_state *whpx = &whpx_global;
579     WHV_REGISTER_VALUE xcr0;
580     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
581 
582     if (!whpx_has_xsave()) {
583         return;
584     }
585 
586     /* Only xcr0 is supported by the hypervisor currently */
587     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
588         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
589     if (FAILED(hr)) {
590         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
591         return;
592     }
593 
594     cpu_env(cpu)->xcr0 = xcr0.Reg64;
595 }
596 
/*
 * Copy the WHPX virtual processor's state into QEMU's view of the CPU.
 *
 * @cpu: the vCPU to read; it must be stopped or be the calling thread's own
 *       CPU.
 *
 * All registers listed in whpx_register_names[] are fetched with a single
 * WHvGetVirtualProcessorRegisters() call and then unpacked in table order.
 * XCR0 and the TSC are fetched through their own calls. If the bulk read
 * fails, the error is reported and the function returns without touching the
 * remaining CPU state, because the value buffer was never filled in.
 *
 * Every step asserts that the table entry at the running index is the
 * register about to be consumed, so the table and this function cannot
 * silently drift apart.
 */
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Re-read the TSC only while the stored value is not marked valid, and
     * mark it valid afterwards only if the VM is not running.
     */
    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
        /*
         * vcxt is uninitialized at this point; unpacking it would load
         * indeterminate values into the CPU state.
         */
        return;
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    /*
     * Indexes for first 16 registers match between HV and QEMU definitions.
     * Only CPU_NB_REGS of them are consumed; the running index then skips
     * to slot 16 regardless.
     */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches  */
    assert(whpx_register_names[idx] == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(whpx_register_names[idx] == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(whpx_register_names[idx] == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(whpx_register_names[idx] == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(whpx_register_names[idx] == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    /* Push CR8 to the APIC only when it differs from the cached TPR */
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /*
     * 16 XMM registers
     *
     * NOTE(review): the loop is bounded by the element count of
     * env->xmm_regs, while exactly 16 slots are reserved via idx_next. If
     * the array holds more than 16 elements, the extra iterations read the
     * slots that follow the XMM registers -- confirm that this is intended
     * before changing the bound.
     */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers; only the low 64 bits are stored into env->fpregs */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    /* Bits 13:11 of the status word go to env->fpstt, the rest to env->fpus */
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    /* env->fptags[i] is the inverse of tag bit i */
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    /* Push the APIC base to the APIC only when it differs from the cache */
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* Every table entry must have been visited exactly once */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);
}
777 
whpx_emu_ioport_callback(void * ctx,WHV_EMULATOR_IO_ACCESS_INFO * IoAccess)778 static HRESULT CALLBACK whpx_emu_ioport_callback(
779     void *ctx,
780     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
781 {
782     MemTxAttrs attrs = { 0 };
783     address_space_rw(&address_space_io, IoAccess->Port, attrs,
784                      &IoAccess->Data, IoAccess->AccessSize,
785                      IoAccess->Direction);
786     return S_OK;
787 }
788 
whpx_emu_mmio_callback(void * ctx,WHV_EMULATOR_MEMORY_ACCESS_INFO * ma)789 static HRESULT CALLBACK whpx_emu_mmio_callback(
790     void *ctx,
791     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
792 {
793     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
794                            ma->Direction);
795     return S_OK;
796 }
797 
whpx_emu_getreg_callback(void * ctx,const WHV_REGISTER_NAME * RegisterNames,UINT32 RegisterCount,WHV_REGISTER_VALUE * RegisterValues)798 static HRESULT CALLBACK whpx_emu_getreg_callback(
799     void *ctx,
800     const WHV_REGISTER_NAME *RegisterNames,
801     UINT32 RegisterCount,
802     WHV_REGISTER_VALUE *RegisterValues)
803 {
804     HRESULT hr;
805     struct whpx_state *whpx = &whpx_global;
806     CPUState *cpu = (CPUState *)ctx;
807 
808     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
809         whpx->partition, cpu->cpu_index,
810         RegisterNames, RegisterCount,
811         RegisterValues);
812     if (FAILED(hr)) {
813         error_report("WHPX: Failed to get virtual processor registers,"
814                      " hr=%08lx", hr);
815     }
816 
817     return hr;
818 }
819 
whpx_emu_setreg_callback(void * ctx,const WHV_REGISTER_NAME * RegisterNames,UINT32 RegisterCount,const WHV_REGISTER_VALUE * RegisterValues)820 static HRESULT CALLBACK whpx_emu_setreg_callback(
821     void *ctx,
822     const WHV_REGISTER_NAME *RegisterNames,
823     UINT32 RegisterCount,
824     const WHV_REGISTER_VALUE *RegisterValues)
825 {
826     HRESULT hr;
827     struct whpx_state *whpx = &whpx_global;
828     CPUState *cpu = (CPUState *)ctx;
829 
830     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
831         whpx->partition, cpu->cpu_index,
832         RegisterNames, RegisterCount,
833         RegisterValues);
834     if (FAILED(hr)) {
835         error_report("WHPX: Failed to set virtual processor registers,"
836                      " hr=%08lx", hr);
837     }
838 
839     /*
840      * The emulator just successfully wrote the register state. We clear the
841      * dirty state so we avoid the double write on resume of the VP.
842      */
843     cpu->accel->dirty = false;
844 
845     return hr;
846 }
847 
whpx_emu_translate_callback(void * ctx,WHV_GUEST_VIRTUAL_ADDRESS Gva,WHV_TRANSLATE_GVA_FLAGS TranslateFlags,WHV_TRANSLATE_GVA_RESULT_CODE * TranslationResult,WHV_GUEST_PHYSICAL_ADDRESS * Gpa)848 static HRESULT CALLBACK whpx_emu_translate_callback(
849     void *ctx,
850     WHV_GUEST_VIRTUAL_ADDRESS Gva,
851     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
852     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
853     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
854 {
855     HRESULT hr;
856     struct whpx_state *whpx = &whpx_global;
857     CPUState *cpu = (CPUState *)ctx;
858     WHV_TRANSLATE_GVA_RESULT res;
859 
860     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
861                                       Gva, TranslateFlags, &res, Gpa);
862     if (FAILED(hr)) {
863         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
864     } else {
865         *TranslationResult = res.ResultCode;
866     }
867 
868     return hr;
869 }
870 
871 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
872     .Size = sizeof(WHV_EMULATOR_CALLBACKS),
873     .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
874     .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
875     .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
876     .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
877     .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
878 };
879 
/*
 * Handles a memory-access exit by asking the instruction emulator to emulate
 * the faulting MMIO access. The CPUState is passed to the emulator as its
 * callback context.
 *
 * Returns 0 when the access was emulated successfully, -1 (after logging)
 * if the emulator call failed or reported an unsuccessful emulation.
 */
static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    AccelCPUState *accel = cpu->accel;
    WHV_EMULATOR_STATUS status;
    HRESULT result = whp_dispatch.WHvEmulatorTryMmioEmulation(
        accel->emulator, cpu,
        &accel->exit_ctx.VpContext, ctx,
        &status);

    if (FAILED(result)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", result);
        return -1;
    }

    if (status.EmulationSuccessful) {
        return 0;
    }

    error_report("WHPX: Failed to emulate MMIO access with"
                 " EmulatorReturnStatus: %u", status.AsUINT32);
    return -1;
}
903 
/*
 * Handles an I/O port access exit by asking the instruction emulator to
 * emulate the access. The CPUState is passed to the emulator as its callback
 * context.
 *
 * Returns 0 when the access was emulated successfully, -1 (after logging)
 * if the emulator call failed or reported an unsuccessful emulation.
 */
static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    AccelCPUState *accel = cpu->accel;
    WHV_EMULATOR_STATUS status;
    HRESULT result = whp_dispatch.WHvEmulatorTryIoEmulation(
        accel->emulator, cpu,
        &accel->exit_ctx.VpContext, ctx,
        &status);

    if (FAILED(result)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", result);
        return -1;
    }

    if (status.EmulationSuccessful) {
        return 0;
    }

    error_report("WHPX: Failed to emulate PortIO access with"
                 " EmulatorReturnStatus: %u", status.AsUINT32);
    return -1;
}
928 
929 /*
930  * Controls whether we should intercept various exceptions on the guest,
931  * namely breakpoint/single-step events.
932  *
933  * The 'exceptions' argument accepts a bitmask, e.g:
934  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
935  */
whpx_set_exception_exit_bitmap(UINT64 exceptions)936 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
937 {
938     struct whpx_state *whpx = &whpx_global;
939     WHV_PARTITION_PROPERTY prop = { 0, };
940     HRESULT hr;
941 
942     if (exceptions == whpx->exception_exit_bitmap) {
943         return S_OK;
944     }
945 
946     prop.ExceptionExitBitmap = exceptions;
947 
948     hr = whp_dispatch.WHvSetPartitionProperty(
949         whpx->partition,
950         WHvPartitionPropertyCodeExceptionExitBitmap,
951         &prop,
952         sizeof(WHV_PARTITION_PROPERTY));
953 
954     if (SUCCEEDED(hr)) {
955         whpx->exception_exit_bitmap = exceptions;
956     }
957 
958     return hr;
959 }
960 
961 
962 /*
963  * This function is called before/after stepping over a single instruction.
964  * It will update the CPU registers to arm/disarm the instruction stepping
965  * accordingly.
966  */
whpx_vcpu_configure_single_stepping(CPUState * cpu,bool set,uint64_t * exit_context_rflags)967 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
968     bool set,
969     uint64_t *exit_context_rflags)
970 {
971     WHV_REGISTER_NAME reg_name;
972     WHV_REGISTER_VALUE reg_value;
973     HRESULT hr;
974     struct whpx_state *whpx = &whpx_global;
975 
976     /*
977      * If we are trying to step over a single instruction, we need to set the
978      * TF bit in rflags. Otherwise, clear it.
979      */
980     reg_name = WHvX64RegisterRflags;
981     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
982         whpx->partition,
983         cpu->cpu_index,
984         &reg_name,
985         1,
986         &reg_value);
987 
988     if (FAILED(hr)) {
989         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
990         return hr;
991     }
992 
993     if (exit_context_rflags) {
994         assert(*exit_context_rflags == reg_value.Reg64);
995     }
996 
997     if (set) {
998         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
999         reg_value.Reg64 |= TF_MASK;
1000     } else {
1001         reg_value.Reg64 &= ~TF_MASK;
1002     }
1003 
1004     if (exit_context_rflags) {
1005         *exit_context_rflags = reg_value.Reg64;
1006     }
1007 
1008     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1009         whpx->partition,
1010         cpu->cpu_index,
1011         &reg_name,
1012         1,
1013         &reg_value);
1014 
1015     if (FAILED(hr)) {
1016         error_report("WHPX: Failed to set rflags,"
1017             " hr=%08lx",
1018             hr);
1019         return hr;
1020     }
1021 
1022     reg_name = WHvRegisterInterruptState;
1023     reg_value.Reg64 = 0;
1024 
1025     /* Suspend delivery of hardware interrupts during single-stepping. */
1026     reg_value.InterruptState.InterruptShadow = set != 0;
1027 
1028     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1029     whpx->partition,
1030         cpu->cpu_index,
1031         &reg_name,
1032         1,
1033         &reg_value);
1034 
1035     if (FAILED(hr)) {
1036         error_report("WHPX: Failed to set InterruptState,"
1037             " hr=%08lx",
1038             hr);
1039         return hr;
1040     }
1041 
1042     if (!set) {
1043         /*
1044          * We have just finished stepping over a single instruction,
1045          * and intercepted the INT1 generated by it.
1046          * We need to now hide the INT1 from the guest,
1047          * as it would not be expecting it.
1048          */
1049 
1050         reg_name = WHvX64RegisterPendingDebugException;
1051         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1052         whpx->partition,
1053             cpu->cpu_index,
1054             &reg_name,
1055             1,
1056             &reg_value);
1057 
1058         if (FAILED(hr)) {
1059             error_report("WHPX: Failed to get pending debug exceptions,"
1060                          "hr=%08lx", hr);
1061             return hr;
1062         }
1063 
1064         if (reg_value.PendingDebugException.SingleStep) {
1065             reg_value.PendingDebugException.SingleStep = 0;
1066 
1067             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1068                 whpx->partition,
1069                 cpu->cpu_index,
1070                 &reg_name,
1071                 1,
1072                 &reg_value);
1073 
1074             if (FAILED(hr)) {
1075                 error_report("WHPX: Failed to clear pending debug exceptions,"
1076                              "hr=%08lx", hr);
1077              return hr;
1078             }
1079         }
1080 
1081     }
1082 
1083     return S_OK;
1084 }
1085 
1086 /* Tries to find a breakpoint at the specified address. */
whpx_lookup_breakpoint_by_addr(uint64_t address)1087 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1088 {
1089     struct whpx_state *whpx = &whpx_global;
1090     int i;
1091 
1092     if (whpx->breakpoints.breakpoints) {
1093         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1094             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1095                 return &whpx->breakpoints.breakpoints->data[i];
1096             }
1097         }
1098     }
1099 
1100     return NULL;
1101 }
1102 
/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 *
 * The value below is the single byte written over the guest instruction at
 * a breakpoint address (0xF1, the INT1 opcode referred to above).
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;
1110 
1111 /*
1112  * The WHPX QEMU backend implements breakpoints by writing the INT1
1113  * instruction into memory (ignoring the DRx registers). This raises a few
1114  * issues that need to be carefully handled:
1115  *
1116  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1117  *    at the same location, and later remove them in arbitrary order.
1118  *    This should not cause memory corruption, and should only remove the
1119  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1120  *
1121  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1122  *    physical location. Hence, physically adding/removing a breakpoint can
1123  *    theoretically fail at any time. We need to keep track of it.
1124  *
1125  * The function below rebuilds a list of low-level breakpoints (one per
1126  * address, tracking the original instruction and any errors) from the list of
1127  * high-level breakpoints (set via cpu_breakpoint_insert()).
1128  *
1129  * In order to optimize performance, this function stores the list of
1130  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1131  * low-level ones, so that it won't be re-invoked until these breakpoints
1132  * change.
1133  *
1134  * Note that this function decides which breakpoints should be inserted into,
1135  * memory, but doesn't actually do it. The memory accessing is done in
1136  * whpx_apply_breakpoints().
1137  */
whpx_translate_cpu_breakpoints(struct whpx_breakpoints * breakpoints,CPUState * cpu,int cpu_breakpoint_count)1138 static void whpx_translate_cpu_breakpoints(
1139     struct whpx_breakpoints *breakpoints,
1140     CPUState *cpu,
1141     int cpu_breakpoint_count)
1142 {
1143     CPUBreakpoint *bp;
1144     int cpu_bp_index = 0;
1145 
1146     breakpoints->original_addresses =
1147         g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
1148 
1149     breakpoints->original_address_count = cpu_breakpoint_count;
1150 
1151     int max_breakpoints = cpu_breakpoint_count +
1152         (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
1153 
1154     struct whpx_breakpoint_collection *new_breakpoints =
1155         g_malloc0(sizeof(struct whpx_breakpoint_collection)
1156                   + max_breakpoints * sizeof(struct whpx_breakpoint));
1157 
1158     new_breakpoints->allocated = max_breakpoints;
1159     new_breakpoints->used = 0;
1160 
1161     /*
1162      * 1. Preserve all old breakpoints that could not be automatically
1163      * cleared when the CPU got stopped.
1164      */
1165     if (breakpoints->breakpoints) {
1166         int i;
1167         for (i = 0; i < breakpoints->breakpoints->used; i++) {
1168             if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
1169                 new_breakpoints->data[new_breakpoints->used++] =
1170                     breakpoints->breakpoints->data[i];
1171             }
1172         }
1173     }
1174 
1175     /* 2. Map all CPU breakpoints to WHPX breakpoints */
1176     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1177         int i;
1178         bool found = false;
1179 
1180         /* This will be used to detect changed CPU breakpoints later. */
1181         breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
1182 
1183         for (i = 0; i < new_breakpoints->used; i++) {
1184             /*
1185              * WARNING: This loop has O(N^2) complexity, where N is the
1186              * number of breakpoints. It should not be a bottleneck in
1187              * real-world scenarios, since it only needs to run once after
1188              * the breakpoints have been modified.
1189              * If this ever becomes a concern, it can be optimized by storing
1190              * high-level breakpoint objects in a tree or hash map.
1191              */
1192 
1193             if (new_breakpoints->data[i].address == bp->pc) {
1194                 /* There was already a breakpoint at this address. */
1195                 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
1196                     new_breakpoints->data[i].state = WHPX_BP_SET;
1197                 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
1198                     new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
1199                 }
1200 
1201                 found = true;
1202                 break;
1203             }
1204         }
1205 
1206         if (!found && new_breakpoints->used < new_breakpoints->allocated) {
1207             /* No WHPX breakpoint at this address. Create one. */
1208             new_breakpoints->data[new_breakpoints->used].address = bp->pc;
1209             new_breakpoints->data[new_breakpoints->used].state =
1210                 WHPX_BP_SET_PENDING;
1211             new_breakpoints->used++;
1212         }
1213     }
1214 
1215     /*
1216      * Free the previous breakpoint list. This can be optimized by keeping
1217      * it as shadow buffer for the next computation instead of freeing
1218      * it immediately.
1219      */
1220     g_free(breakpoints->breakpoints);
1221 
1222     breakpoints->breakpoints = new_breakpoints;
1223 }
1224 
1225 /*
1226  * Physically inserts/removes the breakpoints by reading and writing the
1227  * physical memory, keeping a track of the failed attempts.
1228  *
1229  * Passing resuming=true  will try to set all previously unset breakpoints.
1230  * Passing resuming=false will remove all inserted ones.
1231  */
whpx_apply_breakpoints(struct whpx_breakpoint_collection * breakpoints,CPUState * cpu,bool resuming)1232 static void whpx_apply_breakpoints(
1233     struct whpx_breakpoint_collection *breakpoints,
1234     CPUState *cpu,
1235     bool resuming)
1236 {
1237     int i, rc;
1238     if (!breakpoints) {
1239         return;
1240     }
1241 
1242     for (i = 0; i < breakpoints->used; i++) {
1243         /* Decide what to do right now based on the last known state. */
1244         WhpxBreakpointState state = breakpoints->data[i].state;
1245         switch (state) {
1246         case WHPX_BP_CLEARED:
1247             if (resuming) {
1248                 state = WHPX_BP_SET_PENDING;
1249             }
1250             break;
1251         case WHPX_BP_SET_PENDING:
1252             if (!resuming) {
1253                 state = WHPX_BP_CLEARED;
1254             }
1255             break;
1256         case WHPX_BP_SET:
1257             if (!resuming) {
1258                 state = WHPX_BP_CLEAR_PENDING;
1259             }
1260             break;
1261         case WHPX_BP_CLEAR_PENDING:
1262             if (resuming) {
1263                 state = WHPX_BP_SET;
1264             }
1265             break;
1266         }
1267 
1268         if (state == WHPX_BP_SET_PENDING) {
1269             /* Remember the original instruction. */
1270             rc = cpu_memory_rw_debug(cpu,
1271                 breakpoints->data[i].address,
1272                 &breakpoints->data[i].original_instruction,
1273                 1,
1274                 false);
1275 
1276             if (!rc) {
1277                 /* Write the breakpoint instruction. */
1278                 rc = cpu_memory_rw_debug(cpu,
1279                     breakpoints->data[i].address,
1280                     (void *)&whpx_breakpoint_instruction,
1281                     1,
1282                     true);
1283             }
1284 
1285             if (!rc) {
1286                 state = WHPX_BP_SET;
1287             }
1288 
1289         }
1290 
1291         if (state == WHPX_BP_CLEAR_PENDING) {
1292             /* Restore the original instruction. */
1293             rc = cpu_memory_rw_debug(cpu,
1294                 breakpoints->data[i].address,
1295                 &breakpoints->data[i].original_instruction,
1296                 1,
1297                 true);
1298 
1299             if (!rc) {
1300                 state = WHPX_BP_CLEARED;
1301             }
1302         }
1303 
1304         breakpoints->data[i].state = state;
1305     }
1306 }
1307 
1308 /*
1309  * This function is called when the a VCPU is about to start and no other
1310  * VCPUs have been started so far. Since the VCPU start order could be
1311  * arbitrary, it doesn't have to be VCPU#0.
1312  *
1313  * It is used to commit the breakpoints into memory, and configure WHPX
1314  * to intercept debug exceptions.
1315  *
1316  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1317  * more VCPUs are already running, so this is the best place to do it.
1318  */
whpx_first_vcpu_starting(CPUState * cpu)1319 static int whpx_first_vcpu_starting(CPUState *cpu)
1320 {
1321     struct whpx_state *whpx = &whpx_global;
1322     HRESULT hr;
1323 
1324     g_assert(bql_locked());
1325 
1326     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1327             (whpx->breakpoints.breakpoints &&
1328              whpx->breakpoints.breakpoints->used)) {
1329         CPUBreakpoint *bp;
1330         int i = 0;
1331         bool update_pending = false;
1332 
1333         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1334             if (i >= whpx->breakpoints.original_address_count ||
1335                 bp->pc != whpx->breakpoints.original_addresses[i]) {
1336                 update_pending = true;
1337             }
1338 
1339             i++;
1340         }
1341 
1342         if (i != whpx->breakpoints.original_address_count) {
1343             update_pending = true;
1344         }
1345 
1346         if (update_pending) {
1347             /*
1348              * The CPU breakpoints have changed since the last call to
1349              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1350              * now be recomputed.
1351              */
1352             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1353         }
1354 
1355         /* Actually insert the breakpoints into the memory. */
1356         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1357     }
1358 
1359     uint64_t exception_mask;
1360     if (whpx->step_pending ||
1361         (whpx->breakpoints.breakpoints &&
1362          whpx->breakpoints.breakpoints->used)) {
1363         /*
1364          * We are either attempting to single-step one or more CPUs, or
1365          * have one or more breakpoints enabled. Both require intercepting
1366          * the WHvX64ExceptionTypeBreakpointTrap exception.
1367          */
1368 
1369         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1370     } else {
1371         /* Let the guest handle all exceptions. */
1372         exception_mask = 0;
1373     }
1374 
1375     hr = whpx_set_exception_exit_bitmap(exception_mask);
1376     if (!SUCCEEDED(hr)) {
1377         error_report("WHPX: Failed to update exception exit mask,"
1378                      "hr=%08lx.", hr);
1379         return 1;
1380     }
1381 
1382     return 0;
1383 }
1384 
1385 /*
1386  * This function is called when the last VCPU has finished running.
1387  * It is used to remove any previously set breakpoints from memory.
1388  */
whpx_last_vcpu_stopping(CPUState * cpu)1389 static int whpx_last_vcpu_stopping(CPUState *cpu)
1390 {
1391     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1392     return 0;
1393 }
1394 
1395 /* Returns the address of the next instruction that is about to be executed. */
whpx_vcpu_get_pc(CPUState * cpu,bool exit_context_valid)1396 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1397 {
1398     if (cpu->accel->dirty) {
1399         /* The CPU registers have been modified by other parts of QEMU. */
1400         return cpu_env(cpu)->eip;
1401     } else if (exit_context_valid) {
1402         /*
1403          * The CPU registers have not been modified by neither other parts
1404          * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1405          * This is the most common case.
1406          */
1407         AccelCPUState *vcpu = cpu->accel;
1408         return vcpu->exit_ctx.VpContext.Rip;
1409     } else {
1410         /*
1411          * The CPU registers have been modified by a call to
1412          * WHvSetVirtualProcessorRegisters() and must be re-queried from
1413          * the target.
1414          */
1415         WHV_REGISTER_VALUE reg_value;
1416         WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1417         HRESULT hr;
1418         struct whpx_state *whpx = &whpx_global;
1419 
1420         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1421             whpx->partition,
1422             cpu->cpu_index,
1423             &reg_name,
1424             1,
1425             &reg_value);
1426 
1427         if (FAILED(hr)) {
1428             error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1429             return 0;
1430         }
1431 
1432         return reg_value.Reg64;
1433     }
1434 }
1435 
/*
 * Handles a halt exit. Under the BQL, checks whether something is pending
 * that should keep the CPU running: a hard interrupt request while
 * EFLAGS.IF is set, or an NMI request. If neither is pending, the CPU is
 * marked halted and its exception index is set to EXCP_HLT.
 *
 * Returns 1 if the CPU was marked halted, 0 otherwise.
 */
static int whpx_handle_halt(CPUState *cpu)
{
    bool irq_deliverable;
    bool nmi_pending;
    int halted = 0;

    bql_lock();

    irq_deliverable = (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
                      (cpu_env(cpu)->eflags & IF_MASK);
    nmi_pending = (cpu->interrupt_request & CPU_INTERRUPT_NMI) != 0;

    if (!irq_deliverable && !nmi_pending) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        halted = 1;
    }

    bql_unlock();

    return halted;
}
1452 
/*
 * Prepares a VCPU for the next run: under the BQL it turns QEMU's pending
 * interrupt requests into up to three VP register updates (a pending
 * interruption or external-interrupt event, CR8, and the interrupt
 * deliverability notification), then writes them to the VP in one call.
 *
 * It may also set cpu->exit_request to force the VCPU out of its inner loop.
 * A failure to write the registers is reported but not propagated.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    WHV_X64_PENDING_INTERRUPTION_REGISTER pending;
    /* At most one entry is added by each of the three sections below. */
    WHV_REGISTER_NAME names[3];
    WHV_REGISTER_VALUE values[3];
    UINT32 nregs = 0;
    uint8_t cr8;
    int vector;
    HRESULT hr;

    memset(&pending, 0, sizeof(pending));
    memset(values, 0, sizeof(values));

    bql_lock();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI))) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            pending.InterruptionType = WHvX64PendingNmi;
            pending.InterruptionPending = 1;
            pending.InterruptionVector = 2;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            /* The SMI request is only acknowledged, nothing is injected. */
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests
     * (unless in SMM) or commit pending TPR access.
     */
    if (((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
         !(env->hflags & HF_SMM_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_TPR)) {
        cpu->exit_request = 1;
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!pending.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                vector = cpu_get_pic_interrupt(env);
                if (vector >= 0) {
                    pending.InterruptionType = WHvX64PendingInterrupt;
                    pending.InterruptionPending = 1;
                    pending.InterruptionVector = vector;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (pending.InterruptionPending) {
            names[nregs] = WHvRegisterPendingInterruption;
            values[nregs].PendingInterruption = pending;
            nregs += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        vector = cpu_get_pic_interrupt(env);
        if (vector >= 0) {
            names[nregs] = WHvRegisterPendingEvent;
            values[nregs].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT) {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = vector,
            };
            nregs += 1;
        }
    }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    cr8 = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (cr8 != vcpu->tpr) {
        vcpu->tpr = cr8;
        cpu->exit_request = 1;
        names[nregs] = WHvX64RegisterCr8;
        values[nregs].Reg64 = cr8;
        nregs += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        names[nregs] = WHvX64RegisterDeliverabilityNotifications;
        values[nregs].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        nregs += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    if (nregs) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            names, nregs, values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }
}
1576 
/*
 * Pulls state out of the VCPU's exit context after a run: copies RFLAGS into
 * env->eflags, propagates a changed CR8 to the APIC TPR (under the BQL), and
 * records whether an interruption is pending and whether the VCPU is
 * currently interruptable (i.e. not in an interrupt shadow).
 */
static void whpx_vcpu_post_run(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    uint64_t exit_cr8 = vcpu->exit_ctx.VpContext.Cr8;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    if (exit_cr8 != vcpu->tpr) {
        /* The guest changed CR8 while running; mirror it into the APIC. */
        vcpu->tpr = exit_cr8;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;

    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}
1601 
/*
 * Processes asynchronous requests recorded in cpu->interrupt_request, in
 * this order: INIT (skipped while in SMM), APIC poll, wake-up from halt,
 * SIPI, and TPR access reporting.
 *
 * cpu->interrupt_request is re-read for every step on purpose, since the
 * handlers called in between may change it.
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;
    bool wake;

    if (!(env->hflags & HF_SMM_MASK) &&
        (cpu->interrupt_request & CPU_INTERRUPT_INIT)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    /* A deliverable hard interrupt or an NMI takes the CPU out of halt. */
    wake = (cpu->interrupt_request & CPU_INTERRUPT_NMI) ||
           ((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
            (env->eflags & IF_MASK));
    if (wake) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }
}
1640 
whpx_vcpu_run(CPUState * cpu)1641 static int whpx_vcpu_run(CPUState *cpu)
1642 {
1643     HRESULT hr;
1644     struct whpx_state *whpx = &whpx_global;
1645     AccelCPUState *vcpu = cpu->accel;
1646     struct whpx_breakpoint *stepped_over_bp = NULL;
1647     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1648     int ret;
1649 
1650     g_assert(bql_locked());
1651 
1652     if (whpx->running_cpus++ == 0) {
1653         /* Insert breakpoints into memory, update exception exit bitmap. */
1654         ret = whpx_first_vcpu_starting(cpu);
1655         if (ret != 0) {
1656             return ret;
1657         }
1658     }
1659 
1660     if (whpx->breakpoints.breakpoints &&
1661         whpx->breakpoints.breakpoints->used > 0)
1662     {
1663         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1664         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1665         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1666             stepped_over_bp = NULL;
1667         }
1668 
1669         if (stepped_over_bp) {
1670             /*
1671              * We are trying to run the instruction overwritten by an active
1672              * breakpoint. We will temporarily disable the breakpoint, suspend
1673              * other CPUs, and step over the instruction.
1674              */
1675             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1676         }
1677     }
1678 
1679     if (exclusive_step_mode == WHPX_STEP_NONE) {
1680         whpx_vcpu_process_async_events(cpu);
1681         if (cpu->halted && !whpx_apic_in_platform()) {
1682             cpu->exception_index = EXCP_HLT;
1683             qatomic_set(&cpu->exit_request, false);
1684             return 0;
1685         }
1686     }
1687 
1688     bql_unlock();
1689 
1690     if (exclusive_step_mode != WHPX_STEP_NONE) {
1691         start_exclusive();
1692         g_assert(cpu == current_cpu);
1693         g_assert(!cpu->running);
1694         cpu->running = true;
1695 
1696         hr = whpx_set_exception_exit_bitmap(
1697             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1698         if (!SUCCEEDED(hr)) {
1699             error_report("WHPX: Failed to update exception exit mask, "
1700                          "hr=%08lx.", hr);
1701             return 1;
1702         }
1703 
1704         if (stepped_over_bp) {
1705             /* Temporarily disable the triggered breakpoint. */
1706             cpu_memory_rw_debug(cpu,
1707                 stepped_over_bp->address,
1708                 &stepped_over_bp->original_instruction,
1709                 1,
1710                 true);
1711         }
1712     } else {
1713         cpu_exec_start(cpu);
1714     }
1715 
1716     do {
1717         if (cpu->accel->dirty) {
1718             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1719             cpu->accel->dirty = false;
1720         }
1721 
1722         if (exclusive_step_mode == WHPX_STEP_NONE) {
1723             whpx_vcpu_pre_run(cpu);
1724 
1725             if (qatomic_read(&cpu->exit_request)) {
1726                 whpx_vcpu_kick(cpu);
1727             }
1728         }
1729 
1730         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1731             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1732         }
1733 
1734         hr = whp_dispatch.WHvRunVirtualProcessor(
1735             whpx->partition, cpu->cpu_index,
1736             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1737 
1738         if (FAILED(hr)) {
1739             error_report("WHPX: Failed to exec a virtual processor,"
1740                          " hr=%08lx", hr);
1741             ret = -1;
1742             break;
1743         }
1744 
1745         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1746             whpx_vcpu_configure_single_stepping(cpu,
1747                 false,
1748                 &vcpu->exit_ctx.VpContext.Rflags);
1749         }
1750 
1751         whpx_vcpu_post_run(cpu);
1752 
1753         switch (vcpu->exit_ctx.ExitReason) {
1754         case WHvRunVpExitReasonMemoryAccess:
1755             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1756             break;
1757 
1758         case WHvRunVpExitReasonX64IoPortAccess:
1759             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1760             break;
1761 
1762         case WHvRunVpExitReasonX64InterruptWindow:
1763             vcpu->ready_for_pic_interrupt = 1;
1764             vcpu->window_registered = 0;
1765             ret = 0;
1766             break;
1767 
1768         case WHvRunVpExitReasonX64ApicEoi:
1769             assert(whpx_apic_in_platform());
1770             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1771             break;
1772 
1773         case WHvRunVpExitReasonX64Halt:
1774             /*
1775              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1776              * longer used.
1777              */
1778             ret = whpx_handle_halt(cpu);
1779             break;
1780 
1781         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1782             WHV_INTERRUPT_CONTROL ipi = {0};
1783             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1784             uint32_t delivery_mode =
1785                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1786             int dest_shorthand =
1787                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1788             bool broadcast = false;
1789             bool include_self = false;
1790             uint32_t i;
1791 
1792             /* We only registered for INIT and SIPI exits. */
1793             if ((delivery_mode != APIC_DM_INIT) &&
1794                 (delivery_mode != APIC_DM_SIPI)) {
1795                 error_report(
1796                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1797                 break;
1798             }
1799 
1800             if (delivery_mode == APIC_DM_INIT) {
1801                 ipi.Type = WHvX64InterruptTypeInit;
1802             } else {
1803                 ipi.Type = WHvX64InterruptTypeSipi;
1804             }
1805 
1806             ipi.DestinationMode =
1807                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1808                     WHvX64InterruptDestinationModeLogical :
1809                     WHvX64InterruptDestinationModePhysical;
1810 
1811             ipi.TriggerMode =
1812                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1813                     WHvX64InterruptTriggerModeLevel :
1814                     WHvX64InterruptTriggerModeEdge;
1815 
1816             ipi.Vector = icr & APIC_VECTOR_MASK;
1817             switch (dest_shorthand) {
1818             /* no shorthand. Bits 56-63 contain the destination. */
1819             case 0:
1820                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1821                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1822                         &ipi, sizeof(ipi));
1823                 if (FAILED(hr)) {
1824                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1825                         hr);
1826                 }
1827 
1828                 break;
1829 
1830             /* self */
1831             case 1:
1832                 include_self = true;
1833                 break;
1834 
1835             /* broadcast, including self */
1836             case 2:
1837                 broadcast = true;
1838                 include_self = true;
1839                 break;
1840 
1841             /* broadcast, excluding self */
1842             case 3:
1843                 broadcast = true;
1844                 break;
1845             }
1846 
1847             if (!broadcast && !include_self) {
1848                 break;
1849             }
1850 
1851             for (i = 0; i <= max_vcpu_index; i++) {
1852                 if (i == cpu->cpu_index && !include_self) {
1853                     continue;
1854                 }
1855 
1856                 /*
1857                  * Assuming that APIC Ids are identity mapped since
1858                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1859                  * are not handled yet and the hypervisor doesn't allow the
1860                  * guest to modify the APIC ID.
1861                  */
1862                 ipi.Destination = i;
1863                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1864                         &ipi, sizeof(ipi));
1865                 if (FAILED(hr)) {
1866                     error_report(
1867                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1868                         i, hr);
1869                 }
1870             }
1871 
1872             break;
1873         }
1874 
1875         case WHvRunVpExitReasonCanceled:
1876             if (exclusive_step_mode != WHPX_STEP_NONE) {
1877                 /*
1878                  * We are trying to step over a single instruction, and
1879                  * likely got a request to stop from another thread.
1880                  * Delay it until we are done stepping
1881                  * over.
1882                  */
1883                 ret = 0;
1884             } else {
1885                 cpu->exception_index = EXCP_INTERRUPT;
1886                 ret = 1;
1887             }
1888             break;
1889         case WHvRunVpExitReasonX64MsrAccess: {
1890             WHV_REGISTER_VALUE reg_values[3] = {0};
1891             WHV_REGISTER_NAME reg_names[3];
1892             UINT32 reg_count;
1893 
1894             reg_names[0] = WHvX64RegisterRip;
1895             reg_names[1] = WHvX64RegisterRax;
1896             reg_names[2] = WHvX64RegisterRdx;
1897 
1898             reg_values[0].Reg64 =
1899                 vcpu->exit_ctx.VpContext.Rip +
1900                 vcpu->exit_ctx.VpContext.InstructionLength;
1901 
1902             /*
1903              * For all unsupported MSR access we:
1904              *     ignore writes
1905              *     return 0 on read.
1906              */
1907             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1908                         1 : 3;
1909 
1910             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1911                 whpx->partition,
1912                 cpu->cpu_index,
1913                 reg_names, reg_count,
1914                 reg_values);
1915 
1916             if (FAILED(hr)) {
1917                 error_report("WHPX: Failed to set MsrAccess state "
1918                              " registers, hr=%08lx", hr);
1919             }
1920             ret = 0;
1921             break;
1922         }
1923         case WHvRunVpExitReasonX64Cpuid: {
1924             WHV_REGISTER_VALUE reg_values[5];
1925             WHV_REGISTER_NAME reg_names[5];
1926             UINT32 reg_count = 5;
1927             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1928             X86CPU *x86_cpu = X86_CPU(cpu);
1929             CPUX86State *env = &x86_cpu->env;
1930 
1931             memset(reg_values, 0, sizeof(reg_values));
1932 
1933             rip = vcpu->exit_ctx.VpContext.Rip +
1934                   vcpu->exit_ctx.VpContext.InstructionLength;
1935             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1936 
1937             /*
1938              * Ideally, these should be supplied to the hypervisor during VCPU
1939              * initialization and it should be able to satisfy this request.
1940              * But, currently, WHPX doesn't support setting CPUID values in the
1941              * hypervisor once the partition has been setup, which is too late
1942              * since VCPUs are realized later. For now, use the values from
1943              * QEMU to satisfy these requests, until WHPX adds support for
1944              * being able to set these values in the hypervisor at runtime.
1945              */
1946             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1947                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1948             switch (cpuid_fn) {
1949             case 0x40000000:
1950                 /* Expose the vmware cpu frequency cpuid leaf */
1951                 rax = 0x40000010;
1952                 rbx = rcx = rdx = 0;
1953                 break;
1954 
1955             case 0x40000010:
1956                 rax = env->tsc_khz;
1957                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1958                 rcx = rdx = 0;
1959                 break;
1960 
1961             case 0x80000001:
1962                 /* Remove any support of OSVW */
1963                 rcx &= ~CPUID_EXT3_OSVW;
1964                 break;
1965             }
1966 
1967             reg_names[0] = WHvX64RegisterRip;
1968             reg_names[1] = WHvX64RegisterRax;
1969             reg_names[2] = WHvX64RegisterRcx;
1970             reg_names[3] = WHvX64RegisterRdx;
1971             reg_names[4] = WHvX64RegisterRbx;
1972 
1973             reg_values[0].Reg64 = rip;
1974             reg_values[1].Reg64 = rax;
1975             reg_values[2].Reg64 = rcx;
1976             reg_values[3].Reg64 = rdx;
1977             reg_values[4].Reg64 = rbx;
1978 
1979             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1980                 whpx->partition, cpu->cpu_index,
1981                 reg_names,
1982                 reg_count,
1983                 reg_values);
1984 
1985             if (FAILED(hr)) {
1986                 error_report("WHPX: Failed to set CpuidAccess state registers,"
1987                              " hr=%08lx", hr);
1988             }
1989             ret = 0;
1990             break;
1991         }
1992         case WHvRunVpExitReasonException:
1993             whpx_get_registers(cpu);
1994 
1995             if ((vcpu->exit_ctx.VpException.ExceptionType ==
1996                  WHvX64ExceptionTypeDebugTrapOrFault) &&
1997                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
1998                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
1999                  whpx_breakpoint_instruction)) {
2000                 /* Stopped at a software breakpoint. */
2001                 cpu->exception_index = EXCP_DEBUG;
2002             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
2003                         WHvX64ExceptionTypeDebugTrapOrFault) &&
2004                        !cpu->singlestep_enabled) {
2005                 /*
2006                  * Just finished stepping over a breakpoint, but the
2007                  * gdb does not expect us to do single-stepping.
2008                  * Don't do anything special.
2009                  */
2010                 cpu->exception_index = EXCP_INTERRUPT;
2011             } else {
2012                 /* Another exception or debug event. Report it to GDB. */
2013                 cpu->exception_index = EXCP_DEBUG;
2014             }
2015 
2016             ret = 1;
2017             break;
2018         case WHvRunVpExitReasonNone:
2019         case WHvRunVpExitReasonUnrecoverableException:
2020         case WHvRunVpExitReasonInvalidVpRegisterValue:
2021         case WHvRunVpExitReasonUnsupportedFeature:
2022         default:
2023             error_report("WHPX: Unexpected VP exit code %d",
2024                          vcpu->exit_ctx.ExitReason);
2025             whpx_get_registers(cpu);
2026             bql_lock();
2027             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2028             bql_unlock();
2029             break;
2030         }
2031 
2032     } while (!ret);
2033 
2034     if (stepped_over_bp) {
2035         /* Restore the breakpoint we stepped over */
2036         cpu_memory_rw_debug(cpu,
2037             stepped_over_bp->address,
2038             (void *)&whpx_breakpoint_instruction,
2039             1,
2040             true);
2041     }
2042 
2043     if (exclusive_step_mode != WHPX_STEP_NONE) {
2044         g_assert(cpu_in_exclusive_context(cpu));
2045         cpu->running = false;
2046         end_exclusive();
2047 
2048         exclusive_step_mode = WHPX_STEP_NONE;
2049     } else {
2050         cpu_exec_end(cpu);
2051     }
2052 
2053     bql_lock();
2054     current_cpu = cpu;
2055 
2056     if (--whpx->running_cpus == 0) {
2057         whpx_last_vcpu_stopping(cpu);
2058     }
2059 
2060     qatomic_set(&cpu->exit_request, false);
2061 
2062     return ret < 0;
2063 }
2064 
/*
 * Worker handed to run_on_cpu() by whpx_cpu_synchronize_state().
 *
 * Unless the vCPU state is already flagged dirty, fetch the registers with
 * whpx_get_registers() and set the dirty flag. @arg is not used.
 */
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (cpu->accel->dirty) {
        /* Already flagged dirty: nothing to fetch. */
        return;
    }

    whpx_get_registers(cpu);
    cpu->accel->dirty = true;
}
2072 
/*
 * Worker handed to run_on_cpu() by whpx_cpu_synchronize_post_reset().
 *
 * Writes the registers with whpx_set_registers() at the
 * WHPX_SET_RESET_STATE level, then clears the dirty flag. @arg is not used.
 */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->accel->dirty = false;
}
2079 
/*
 * Worker handed to run_on_cpu() by whpx_cpu_synchronize_post_init().
 *
 * Writes the registers with whpx_set_registers() at the
 * WHPX_SET_FULL_STATE level, then clears the dirty flag. @arg is not used.
 */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->accel->dirty = false;
}
2086 
/*
 * Worker handed to run_on_cpu() by whpx_cpu_synchronize_pre_loadvm().
 *
 * Only sets the dirty flag; whpx_vcpu_run() writes the registers with
 * whpx_set_registers() before running a vCPU whose dirty flag is set.
 * @arg is not used.
 */
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->accel->dirty = true;
}
2092 
2093 /*
2094  * CPU support.
2095  */
2096 
/*
 * Run do_whpx_cpu_synchronize_state() on @cpu via run_on_cpu(), unless the
 * vCPU state is already flagged dirty, in which case nothing is done.
 */
void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (cpu->accel->dirty) {
        return;
    }

    run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
2103 
/*
 * Run do_whpx_cpu_synchronize_post_reset() on @cpu via run_on_cpu(); the
 * worker writes the WHPX_SET_RESET_STATE register set and clears the dirty
 * flag.
 */
void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
2108 
/*
 * Run do_whpx_cpu_synchronize_post_init() on @cpu via run_on_cpu(); the
 * worker writes the WHPX_SET_FULL_STATE register set and clears the dirty
 * flag.
 */
void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
2113 
/*
 * Run do_whpx_cpu_synchronize_pre_loadvm() on @cpu via run_on_cpu(); the
 * worker sets the vCPU's dirty flag.
 */
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
2118 
/*
 * Record in the global WHPX state whether a single step is pending
 * (@step_pending) before the guest is resumed.
 */
void whpx_cpu_synchronize_pre_resume(bool step_pending)
{
    struct whpx_state *whpx = &whpx_global;

    whpx->step_pending = step_pending;
}
2123 
2124 /*
2125  * Vcpu support.
2126  */
2127 
/*
 * Migration blocker shared by all vCPUs; whpx_init_vcpu() creates and
 * registers it while this pointer is still NULL.
 */
static Error *whpx_migration_blocker;
2129 
/*
 * VM change state handler, registered by whpx_init_vcpu() with the vCPU's
 * CPUX86State as @opaque.
 *
 * When the VM transitions to running, clear env->tsc_valid. @state is not
 * used.
 */
static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (!running) {
        return;
    }

    env->tsc_valid = false;
}
2138 
/*
 * Create the WHPX per-vCPU state for @cpu.
 *
 * Steps, in order:
 *  - register the (global, one-time) migration blocker;
 *  - create the instruction emulator and the hypervisor virtual processor;
 *  - record the TSC and APIC bus frequencies in the CPU environment,
 *    querying the hypervisor where needed (query failures are not fatal);
 *  - optionally register the CPUID exit list used for the VMware
 *    frequency leaf;
 *  - publish the state in cpu->accel and install the VM change state
 *    handler.
 *
 * Returns 0 on success and -EINVAL on failure. On failure every resource
 * created by this call (virtual processor, emulator, vCPU state) is
 * released again and cpu->accel is left untouched.
 */
int whpx_init_vcpu(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = NULL;
    Error *local_error = NULL;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    UINT64 freq = 0;
    int ret;

    /*
     * Add migration blockers for all unsupported features of the
     * Windows Hypervisor Platform
     */
    if (whpx_migration_blocker == NULL) {
        error_setg(&whpx_migration_blocker,
               "State blocked due to non-migratable CPUID feature support, "
               "dirty memory tracking support, and XSAVE/XRSTOR support");

        if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu = g_new0(AccelCPUState, 1);

    hr = whp_dispatch.WHvEmulatorCreateEmulator(
        &whpx_emu_callbacks,
        &vcpu->emulator);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup instruction completion support,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreateVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create a virtual processor,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error_destroy_emulator;
    }

    /*
     * vcpu's TSC frequency is either specified by user, or use the value
     * provided by Hyper-V if the former is not present. In the latter case, we
     * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
     * frequency can be migrated later via this field.
     */
    if (!env->tsc_khz) {
        hr = whp_dispatch.WHvGetCapability(
            WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
                NULL);
        if (hr != WHV_E_UNKNOWN_CAPABILITY) {
            if (FAILED(hr)) {
                /* Not fatal: env->tsc_khz simply stays unset. */
                error_report("WHPX: Failed to query tsc frequency, "
                             "hr=0x%08lx", hr);
            } else {
                env->tsc_khz = freq / 1000; /* Hz to KHz */
            }
        }
    }

    /* Default bus frequency, overridden below if the query succeeds. */
    env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
    if (hr != WHV_E_UNKNOWN_CAPABILITY) {
        if (FAILED(hr)) {
            /* Not fatal: keep the default set above. */
            error_report("WHPX: Failed to query apic bus frequency "
                         "hr=0x%08lx", hr);
        } else {
            env->apic_bus_freq = freq;
        }
    }

    /*
     * If the vmware cpuid frequency leaf option is set, and we have a valid
     * tsc value, trap the corresponding cpuid's.
     */
    if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
        UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};

        hr = whp_dispatch.WHvSetPartitionProperty(
                whpx->partition,
                WHvPartitionPropertyCodeCpuidExitList,
                cpuidExitList,
                RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                        hr);
            ret = -EINVAL;
            /* Both the virtual processor and the emulator exist by now. */
            goto error_delete_vp;
        }
    }

    vcpu->interruptable = true;
    vcpu->dirty = true;
    cpu->accel = vcpu;
    max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
    qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);

    return 0;

    /* Staged cleanup: undo in reverse order of creation. */
error_delete_vp:
    whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
error_destroy_emulator:
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
error:
    g_free(vcpu);

    return ret;
}
2251 
/*
 * Run @cpu until an exception index of at least EXCP_INTERRUPT is pending.
 *
 * whpx_vcpu_run() is called repeatedly; a non-zero (fatal) result from it
 * aborts QEMU. The pending exception index is returned to the caller and
 * reset to -1 in @cpu.
 */
int whpx_vcpu_exec(CPUState *cpu)
{
    int pending;

    while (cpu->exception_index < EXCP_INTERRUPT) {
        if (whpx_vcpu_run(cpu)) {
            error_report("WHPX: Failed to exec a virtual processor");
            abort();
        }
    }

    /* Hand the pending exception to the caller and consume it. */
    pending = cpu->exception_index;
    cpu->exception_index = -1;

    return pending;
}
2274 
/*
 * Tear down the WHPX state of @cpu: delete the hypervisor virtual
 * processor, destroy the instruction emulator and free the per-vCPU state.
 */
void whpx_destroy_vcpu(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;

    whp_dispatch.WHvDeleteVirtualProcessor(whpx_global.partition,
                                           cpu->cpu_index);
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
    g_free(vcpu);
}
2285 
/*
 * Kick @cpu by calling WHvCancelRunVirtualProcessor() for its virtual
 * processor; whpx_vcpu_run() handles the resulting
 * WHvRunVpExitReasonCanceled exit.
 */
void whpx_vcpu_kick(CPUState *cpu)
{
    whp_dispatch.WHvCancelRunVirtualProcessor(whpx_global.partition,
                                              cpu->cpu_index, 0);
}
2292 
2293 /*
2294  * Memory support.
2295  */
2296 
/*
 * Map or unmap a guest physical address range in the WHPX partition.
 *
 * @start_pa: guest physical start address of the range
 * @size:     length of the range in bytes
 * @host_va:  host address backing the range (used for mapping and for the
 *            error message)
 * @add:      non-zero to map the range, zero to unmap it
 * @rom:      when mapping, non-zero omits WHvMapGpaRangeFlagWrite; read and
 *            execute access are always requested
 * @name:     region name, used only in the error message
 *
 * A failure is reported with error_report() and otherwise ignored.
 */
static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    if (!add) {
        hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
                                           start_pa,
                                           size);
    } else {
        hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
                                         host_va,
                                         start_pa,
                                         size,
                                         (WHvMapGpaRangeFlagRead |
                                          WHvMapGpaRangeFlagExecute |
                                          (rom ? 0 : WHvMapGpaRangeFlagWrite)));
    }

    if (!FAILED(hr)) {
        return;
    }

    error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                 " Host:%p, hr=%08lx",
                 (add ? "MAP" : "UNMAP"), name,
                 (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
}
2336 
/*
 * Map (@add non-zero) or unmap (@add zero) the RAM behind @section.
 *
 * Sections whose memory region is not RAM are ignored. The start address is
 * rounded up to the next host page boundary and the length is rounded down
 * to a whole number of host pages; a section that does not contain a full
 * host page after that adjustment is skipped.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr gpa = section->offset_within_address_space;
    ram_addr_t len = int128_get64(section->size);
    unsigned int head_pad;
    uint64_t hva;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    /* Bytes to skip so that the start lands on a host page boundary. */
    head_pad = qemu_real_host_page_size() -
               (gpa & ~qemu_real_host_page_mask());
    head_pad &= ~qemu_real_host_page_mask();
    if (head_pad > len) {
        return;
    }

    gpa += head_pad;
    /* Drop the skipped head, then trim the tail to whole host pages. */
    len = (len - head_pad) & qemu_real_host_page_mask();
    if (!len || (gpa & ~qemu_real_host_page_mask())) {
        return;
    }

    hva = (uintptr_t)memory_region_get_ram_ptr(mr)
        + section->offset_within_region + head_pad;

    whpx_update_mapping(gpa, len, (void *)(uintptr_t)hva, add,
                        memory_region_is_rom(mr), mr->name);
}
2367 
/*
 * MemoryListener region_add callback: take a reference on the section's
 * memory region, then map the section. @listener is not used.
 */
static void whpx_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    memory_region_ref(mr);
    whpx_process_section(section, 1);
}
2374 
/*
 * MemoryListener region_del callback: unmap the section, then drop the
 * reference on its memory region. @listener is not used.
 */
static void whpx_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    whpx_process_section(section, 0);
    memory_region_unref(mr);
}
2381 
/*
 * MemoryListener begin callback. Intentionally empty: nothing is done at
 * the start of a memory transaction.
 */
static void whpx_transaction_begin(MemoryListener *listener)
{
}
2385 
/*
 * MemoryListener commit callback. Intentionally empty: nothing is done when
 * a memory transaction is committed.
 */
static void whpx_transaction_commit(MemoryListener *listener)
{
}
2389 
/*
 * MemoryListener log_sync callback.
 *
 * For a RAM-backed section, mark the memory region dirty from offset 0 for
 * the length of the section. Non-RAM sections are ignored. @listener is not
 * used.
 */
static void whpx_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (memory_region_is_ram(mr)) {
        memory_region_set_dirty(mr, 0, int128_get64(section->size));
    }
}
2401 
/*
 * Memory listener for the WHPX accelerator; whpx_memory_init() registers it
 * on address_space_memory.
 */
static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    /* Transaction hooks are no-ops. */
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    /* Map / unmap RAM sections in the partition. */
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    /* Marks RAM regions dirty on every sync. */
    .log_sync = whpx_log_sync,
    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
};
2411 
/* Register whpx_memory_listener on the system memory address space. */
static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}
2416 
2417 /*
2418  * Load the functions from the given library, using the given handle. If a
2419  * handle is provided, it is used, otherwise the library is opened. The
2420  * handle will be updated on return with the opened one.
2421  */
load_whp_dispatch_fns(HMODULE * handle,WHPFunctionList function_list)2422 static bool load_whp_dispatch_fns(HMODULE *handle,
2423     WHPFunctionList function_list)
2424 {
2425     HMODULE hLib = *handle;
2426 
2427     #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
2428     #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
2429     #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
2430         whp_dispatch.function_name = \
2431             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2432 
2433     #define WHP_LOAD_FIELD(return_type, function_name, signature) \
2434         whp_dispatch.function_name = \
2435             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2436         if (!whp_dispatch.function_name) { \
2437             error_report("Could not load function %s", #function_name); \
2438             goto error; \
2439         } \
2440 
2441     #define WHP_LOAD_LIB(lib_name, handle_lib) \
2442     if (!handle_lib) { \
2443         handle_lib = LoadLibrary(lib_name); \
2444         if (!handle_lib) { \
2445             error_report("Could not load library %s.", lib_name); \
2446             goto error; \
2447         } \
2448     } \
2449 
2450     switch (function_list) {
2451     case WINHV_PLATFORM_FNS_DEFAULT:
2452         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2453         LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
2454         break;
2455 
2456     case WINHV_EMULATION_FNS_DEFAULT:
2457         WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
2458         LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
2459         break;
2460 
2461     case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
2462         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2463         LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
2464         break;
2465     }
2466 
2467     *handle = hLib;
2468     return true;
2469 
2470 error:
2471     if (hLib) {
2472         FreeLibrary(hLib);
2473     }
2474 
2475     return false;
2476 }
2477 
/*
 * Property setter for the accelerator's kernel-irqchip option.
 *
 * Parses an OnOffSplit value from visitor @v and updates the global WHPX
 * state:
 *   on    - kernel irqchip allowed and required
 *   off   - kernel irqchip neither allowed nor required
 *   split - rejected; an error with a usage hint is stored in @errp
 *
 * @obj and @opaque are not used. If the value cannot be parsed, the error
 * set by visit_type_OnOffSplit() is left in @errp and nothing is changed.
 */
static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    /* Required because error_append_hint() is applied to @errp below. */
    ERRP_GUARD();
    struct whpx_state *whpx = &whpx_global;
    OnOffSplit mode;

    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
        return;
    }

    switch (mode) {
    case ON_OFF_SPLIT_ON:
        whpx->kernel_irqchip_allowed = true;
        whpx->kernel_irqchip_required = true;
        break;

    case ON_OFF_SPLIT_OFF:
        whpx->kernel_irqchip_allowed = false;
        whpx->kernel_irqchip_required = false;
        break;

    case ON_OFF_SPLIT_SPLIT:
        error_setg(errp, "WHPX: split irqchip currently not supported");
        /* Hints are printed verbatim, so they must end with a newline. */
        error_append_hint(errp,
            "Try without kernel-irqchip or with kernel-irqchip=on|off\n");
        break;

    default:
        /*
         * The value was checked in visit_type_OnOffSplit() above. If
         * we get here, then something is wrong in QEMU.
         */
        abort();
    }
}
2514 
2515 /*
2516  * Partition support
2517  */
2518 
whpx_accel_init(MachineState * ms)2519 static int whpx_accel_init(MachineState *ms)
2520 {
2521     struct whpx_state *whpx;
2522     int ret;
2523     HRESULT hr;
2524     WHV_CAPABILITY whpx_cap;
2525     UINT32 whpx_cap_size;
2526     WHV_PARTITION_PROPERTY prop;
2527     UINT32 cpuidExitList[] = {1, 0x80000001};
2528     WHV_CAPABILITY_FEATURES features = {0};
2529 
2530     whpx = &whpx_global;
2531 
2532     if (!init_whp_dispatch()) {
2533         ret = -ENOSYS;
2534         goto error;
2535     }
2536 
2537     whpx->mem_quota = ms->ram_size;
2538 
2539     hr = whp_dispatch.WHvGetCapability(
2540         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2541         sizeof(whpx_cap), &whpx_cap_size);
2542     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2543         error_report("WHPX: No accelerator found, hr=%08lx", hr);
2544         ret = -ENOSPC;
2545         goto error;
2546     }
2547 
2548     hr = whp_dispatch.WHvGetCapability(
2549         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2550     if (FAILED(hr)) {
2551         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2552         ret = -EINVAL;
2553         goto error;
2554     }
2555 
2556     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2557     if (FAILED(hr)) {
2558         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2559         ret = -EINVAL;
2560         goto error;
2561     }
2562 
2563     /*
2564      * Query the XSAVE capability of the partition. Any error here is not
2565      * considered fatal.
2566      */
2567     hr = whp_dispatch.WHvGetPartitionProperty(
2568         whpx->partition,
2569         WHvPartitionPropertyCodeProcessorXsaveFeatures,
2570         &whpx_xsave_cap,
2571         sizeof(whpx_xsave_cap),
2572         &whpx_cap_size);
2573 
2574     /*
2575      * Windows version which don't support this property will return with the
2576      * specific error code.
2577      */
2578     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2579         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2580     }
2581 
2582     if (!whpx_has_xsave()) {
2583         printf("WHPX: Partition is not XSAVE capable\n");
2584     }
2585 
2586     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2587     prop.ProcessorCount = ms->smp.cpus;
2588     hr = whp_dispatch.WHvSetPartitionProperty(
2589         whpx->partition,
2590         WHvPartitionPropertyCodeProcessorCount,
2591         &prop,
2592         sizeof(WHV_PARTITION_PROPERTY));
2593 
2594     if (FAILED(hr)) {
2595         error_report("WHPX: Failed to set partition processor count to %u,"
2596                      " hr=%08lx", prop.ProcessorCount, hr);
2597         ret = -EINVAL;
2598         goto error;
2599     }
2600 
2601     /*
2602      * Error out if WHP doesn't support apic emulation and user is requiring
2603      * it.
2604      */
2605     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2606             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2607         error_report("WHPX: kernel irqchip requested, but unavailable. "
2608             "Try without kernel-irqchip or with kernel-irqchip=off");
2609         ret = -EINVAL;
2610         goto error;
2611     }
2612 
2613     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2614         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2615         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2616             WHvX64LocalApicEmulationModeXApic;
2617         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2618         hr = whp_dispatch.WHvSetPartitionProperty(
2619             whpx->partition,
2620             WHvPartitionPropertyCodeLocalApicEmulationMode,
2621             &mode,
2622             sizeof(mode));
2623         if (FAILED(hr)) {
2624             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2625             if (whpx->kernel_irqchip_required) {
2626                 error_report("WHPX: kernel irqchip requested, but unavailable");
2627                 ret = -EINVAL;
2628                 goto error;
2629             }
2630         } else {
2631             whpx->apic_in_platform = true;
2632         }
2633     }
2634 
2635     /* Register for MSR and CPUID exits */
2636     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2637     prop.ExtendedVmExits.X64MsrExit = 1;
2638     prop.ExtendedVmExits.X64CpuidExit = 1;
2639     prop.ExtendedVmExits.ExceptionExit = 1;
2640     if (whpx_apic_in_platform()) {
2641         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2642     }
2643 
2644     hr = whp_dispatch.WHvSetPartitionProperty(
2645             whpx->partition,
2646             WHvPartitionPropertyCodeExtendedVmExits,
2647             &prop,
2648             sizeof(WHV_PARTITION_PROPERTY));
2649     if (FAILED(hr)) {
2650         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2651         ret = -EINVAL;
2652         goto error;
2653     }
2654 
2655     hr = whp_dispatch.WHvSetPartitionProperty(
2656         whpx->partition,
2657         WHvPartitionPropertyCodeCpuidExitList,
2658         cpuidExitList,
2659         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2660 
2661     if (FAILED(hr)) {
2662         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2663                      hr);
2664         ret = -EINVAL;
2665         goto error;
2666     }
2667 
2668     /*
2669      * We do not want to intercept any exceptions from the guest,
2670      * until we actually start debugging with gdb.
2671      */
2672     whpx->exception_exit_bitmap = -1;
2673     hr = whpx_set_exception_exit_bitmap(0);
2674 
2675     if (FAILED(hr)) {
2676         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2677         ret = -EINVAL;
2678         goto error;
2679     }
2680 
2681     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2682     if (FAILED(hr)) {
2683         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2684         ret = -EINVAL;
2685         goto error;
2686     }
2687 
2688     whpx_memory_init();
2689 
2690     printf("Windows Hypervisor Platform accelerator is operational\n");
2691     return 0;
2692 
2693 error:
2694 
2695     if (NULL != whpx->partition) {
2696         whp_dispatch.WHvDeletePartition(whpx->partition);
2697         whpx->partition = NULL;
2698     }
2699 
2700     return ret;
2701 }
2702 
whpx_enabled(void)2703 int whpx_enabled(void)
2704 {
2705     return whpx_allowed;
2706 }
2707 
whpx_apic_in_platform(void)2708 bool whpx_apic_in_platform(void) {
2709     return whpx_global.apic_in_platform;
2710 }
2711 
/*
 * Class initializer for the WHPX accelerator type.
 *
 * Fills in the accelerator class name, its init_machine hook and the
 * pointer to the whpx_allowed flag, then adds the "kernel-irqchip" class
 * property (whpx_set_kernel_irqchip is the only callback supplied) and
 * attaches a description to it.
 *
 * @oc:   class being initialized
 * @data: unused
 */
static void whpx_accel_class_init(ObjectClass *oc, void *data)
{
    const char *irqchip_prop = "kernel-irqchip";
    AccelClass *accel_class = ACCEL_CLASS(oc);

    accel_class->allowed = &whpx_allowed;
    accel_class->init_machine = whpx_accel_init;
    accel_class->name = "WHPX";

    /* The property must exist before a description can be attached to it. */
    object_class_property_add(oc, irqchip_prop, "on|off|split",
                              NULL, whpx_set_kernel_irqchip,
                              NULL, NULL);
    object_class_property_set_description(oc, irqchip_prop,
                                          "Configure WHPX in-kernel irqchip");
}
2725 
/*
 * Instance initializer for the WHPX accelerator object.
 *
 * Clears the global WHPX state to all zeroes and then sets
 * kernel_irqchip_allowed, so kernel-irqchip is turned on by default.
 *
 * @obj: unused
 */
static void whpx_accel_instance_init(Object *obj)
{
    memset(&whpx_global, 0, sizeof(whpx_global));

    /* kernel-irqchip defaults to on */
    whpx_global.kernel_irqchip_allowed = true;
}
2734 
2735 static const TypeInfo whpx_accel_type = {
2736     .name = ACCEL_CLASS_NAME("whpx"),
2737     .parent = TYPE_ACCEL,
2738     .instance_init = whpx_accel_instance_init,
2739     .class_init = whpx_accel_class_init,
2740 };
2741 
whpx_type_init(void)2742 static void whpx_type_init(void)
2743 {
2744     type_register_static(&whpx_accel_type);
2745 }
2746 
init_whp_dispatch(void)2747 bool init_whp_dispatch(void)
2748 {
2749     if (whp_dispatch_initialized) {
2750         return true;
2751     }
2752 
2753     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2754         goto error;
2755     }
2756 
2757     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2758         goto error;
2759     }
2760 
2761     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2762         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2763     whp_dispatch_initialized = true;
2764 
2765     return true;
2766 error:
2767     if (hWinHvPlatform) {
2768         FreeLibrary(hWinHvPlatform);
2769     }
2770 
2771     if (hWinHvEmulation) {
2772         FreeLibrary(hWinHvEmulation);
2773     }
2774 
2775     return false;
2776 }
2777 
2778 type_init(whpx_type_init);
2779