/*
 * QEMU Windows Hypervisor Platform accelerator (WHPX)
 *
 * Copyright Microsoft Corp. 2017
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/address-spaces.h"
#include "exec/ioport.h"
#include "gdbstub/helpers.h"
#include "qemu/accel.h"
#include "sysemu/whpx.h"
#include "sysemu/cpus.h"
#include "sysemu/runstate.h"
#include "qemu/main-loop.h"
#include "hw/boards.h"
#include "hw/intc/ioapic.h"
#include "hw/i386/apic_internal.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "migration/blocker.h"
#include <winerror.h>

#include "whpx-internal.h"
#include "whpx-accel-ops.h"

#include <winhvplatform.h>
#include <winhvemulation.h>

#define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)

static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};

struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};

/*
 * The current implementation of instruction stepping sets the TF flag
 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
 *
 * This approach has a few limitations:
 *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
 *        along with the other flags, possibly restoring it later. It would
 *        result in another INT1 when the flags are restored, triggering
 *        a stop in gdb that could be cleared by doing another step.
 *
 *        Stepping over a POPF/LAHF instruction will let it overwrite the
 *        TF flag, ending the stepping mode.
 *
 *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
 *        or anything that could result in a page fault) will save the flags
 *        to the stack, clear the TF flag, and let the guest execute the
 *        handler. Normally, the guest will restore the original flags,
 *        which will resume single-stepping.
 *
 *     3. Debuggers running on the guest may wish to set TF to do instruction
 *        stepping. INT1 events generated this way would be intercepted by
 *        us for as long as gdb is connected to QEMU.
 *
 * In practice this means that:
 *     1. Stepping through flags-modifying instructions may cause gdb to
 *        continue or stop in unexpected places. This will be fully
 *        recoverable and will not crash the target.
 *
 *     2. Stepping over an instruction that triggers an exception will step
 *        over the exception handler, not into it.
 *
 *     3. Debugging the guest via gdb while running a debugger on the guest
 *        at the same time may lead to unexpected effects. Removing all
 *        breakpoints set via QEMU will prevent any further interference
 *        with the guest-level debuggers.
 *
 * The limitations can be addressed as shown below:
 *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
 *        stepping through them. The exact semantics of the instructions are
 *        defined in the "Combined Volume Set of Intel 64 and IA-32
 *        Architectures Software Developer's Manuals", however it involves a
 *        fair amount of corner cases due to compatibility with real mode,
 *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
 *
 *     2. We could step into the guest's exception handlers using the
 *        following sequence:
 *          a. Temporarily enable catching of all exception types via
 *             whpx_set_exception_exit_bitmap().
 *          b. Once an exception is intercepted, read the IDT/GDT and locate
 *             the original handler.
 *          c. Patch the original handler, injecting an INT3 at the beginning.
 *          d. Update the exception exit bitmap to only catch the
 *             WHvX64ExceptionTypeBreakpointTrap exception.
 *          e. Let the affected CPU run in the exclusive mode.
 *          f. Restore the original handler and the exception exit bitmap.
 *        Note that handling all corner cases related to IDT/GDT is harder
 *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
 *        rough idea.
 *
 *     3. In order to properly support guest-level debugging in parallel with
 *        the QEMU-level debugging, we would need to be able to pass some INT1
 *        events to the guest. This could be done via the following methods:
 *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
 *             it seems to only work for interrupts and not software
 *             exceptions.
 *          b. Locating and patching the original handler by parsing IDT/GDT.
 *             This involves relatively complex logic outlined in the previous
 *             paragraph.
 *          c. Emulating the exception invocation (i.e. manually updating RIP,
 *             RFLAGS, and pushing the old values to the stack). This is even
 *             more complicated than the previous option, since it involves
 *             checking CPL, gate attributes, and doing various adjustments
 *             depending on the current CPU mode, whether the CPL is changing,
 *             etc.
 */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
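
/*
 * Illustrative sketch of limitation 1 above (an example under stated
 * assumptions, not built as part of QEMU; the variable names are
 * hypothetical): modeling RFLAGS as a plain integer shows how a guest
 * PUSHF/POPF pair can re-arm the trap flag behind the stepper's back.
 */
#if 0
    uint64_t rflags = 0x202;      /* guest RFLAGS, IF set, TF clear */
    rflags |= TF_MASK;            /* stepper arms single-stepping */
    uint64_t pushed = rflags;     /* guest executes PUSHF: TF=1 saved */
    rflags &= ~TF_MASK;           /* stepper disarms after the step */
    rflags = pushed;              /* guest executes POPF: TF=1 again, so
                                   * the next instruction raises a stray
                                   * INT1 that gdb reports as a stop. */
#endif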

struct AccelCPUState {
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    uint64_t tpr;
    uint64_t apic_base;
    bool interruption_pending;
    bool dirty;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};

static bool whpx_allowed;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;

static bool whpx_has_xsave(void)
{
    return whpx_xsave_cap.XsaveSupport;
}

static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER hs;
    unsigned flags = qs->flags;

    hs.Base = qs->base;
    hs.Limit = qs->limit;
    hs.Selector = qs->selector;

    if (v86) {
        hs.Attributes = 0;
        hs.SegmentType = 3;
        hs.Present = 1;
        hs.DescriptorPrivilegeLevel = 3;
        hs.NonSystemSegment = 1;

    } else {
        hs.Attributes = (flags >> DESC_TYPE_SHIFT);

        if (r86) {
            /* hs.Base &= 0xfffff; */
        }
    }

    return hs;
}

static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
{
    SegmentCache qs;

    qs.base = hs->Base;
    qs.limit = hs->Limit;
    qs.selector = hs->Selector;

    qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;

    return qs;
}
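
/*
 * Sketch of the relationship between the two helpers above (illustration
 * only, not built): outside of virtual-8086 mode they are inverses for the
 * selector, base, limit, and attribute bits, since the attributes are
 * simply shifted by DESC_TYPE_SHIFT in both directions.
 */
#if 0
static void whpx_seg_roundtrip_example(const SegmentCache *qs)
{
    WHV_X64_SEGMENT_REGISTER hs = whpx_seg_q2h(qs, 0, 0);
    SegmentCache back = whpx_seg_h2q(&hs);

    assert(back.selector == qs->selector);
    assert(back.base == qs->base);
    assert(back.limit == qs->limit);
    /* The 16 attribute bits occupy bits 8..23 of the QEMU flags word. */
    assert((back.flags & 0xffff00) == (qs->flags & 0xffff00));
}
#endif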

/* X64 Extended Control Registers */
static void whpx_set_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    xcr0.Reg64 = cpu_env(cpu)->xcr0;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
    }
}

static int whpx_set_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {

        /*
         * Failing to suspend the partition before setting the TSC is not a
         * fatal error. It just increases the likelihood of TSC variance
         * between vCPUs, which some guest OSes can handle just fine.
         */
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    tsc_val.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
        return -1;
    }

    return 0;
}

/*
 * The CR8 register in the CPU is mapped to the TPR register of the APIC;
 * however, they use a slightly different encoding. Specifically:
 *
 *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
 *
 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
 * and IA-32 Architectures Software Developer's Manual.
 *
 * The functions below translate the value of CR8 to TPR and vice versa.
 */

static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr >> 4;
}

static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 << 4;
}
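
/*
 * Worked example of the mapping above (a sketch, not built): a TPR of 0xB0
 * (priority class 11, sub-class 0) corresponds to CR8 == 0xB, and
 * converting back restores the original value. The TPR sub-class in bits
 * 3:0 has no CR8 equivalent and is dropped by whpx_apic_tpr_to_cr8().
 */
#if 0
    assert(whpx_apic_tpr_to_cr8(0xB0) == 0xB);
    assert(whpx_cr8_to_apic_tpr(0xB) == 0xB0);
    assert(whpx_apic_tpr_to_cr8(0xB5) == 0xB); /* sub-class bits lost */
#endif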

static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * The following MSRs have side effects on the guest or are too heavy
     * for runtime. Limit them to full state updates.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers need to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }

    return;
}

static int whpx_get_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
        return -1;
    }

    cpu_env(cpu)->tsc = tsc_val.Reg64;
    return 0;
}

/* X64 Extended Control Registers */
static void whpx_get_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
        return;
    }

    cpu_env(cpu)->xcr0 = xcr0.Reg64;
}

static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers need to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);

    return;
}

static HRESULT CALLBACK whpx_emu_ioport_callback(
    void *ctx,
    WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
    MemTxAttrs attrs = { 0 };
    address_space_rw(&address_space_io, IoAccess->Port, attrs,
                     &IoAccess->Data, IoAccess->AccessSize,
                     IoAccess->Direction);
    return S_OK;
}

static HRESULT CALLBACK whpx_emu_mmio_callback(
    void *ctx,
    WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
    cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
                           ma->Direction);
    return S_OK;
}

static HRESULT CALLBACK whpx_emu_getreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor registers,"
                     " hr=%08lx", hr);
    }

    return hr;
}

static HRESULT CALLBACK whpx_emu_setreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    const WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor registers,"
                     " hr=%08lx", hr);
    }

    /*
     * The emulator just successfully wrote the register state. We clear the
     * dirty state so we avoid the double write on resume of the VP.
     */
    cpu->accel->dirty = false;

    return hr;
}

static HRESULT CALLBACK whpx_emu_translate_callback(
    void *ctx,
    WHV_GUEST_VIRTUAL_ADDRESS Gva,
    WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
    WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
    WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;
    WHV_TRANSLATE_GVA_RESULT res;

    hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
                                      Gva, TranslateFlags, &res, Gpa);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
    } else {
        *TranslationResult = res.ResultCode;
    }

    return hr;
}

static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};

static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;

    hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate MMIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}

static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;

    hr = whp_dispatch.WHvEmulatorTryIoEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate PortIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}

/*
 * Controls whether we should intercept various exceptions on the guest,
 * namely breakpoint/single-step events.
 *
 * The 'exceptions' argument accepts a bitmask, e.g.:
 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
 */
static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_PARTITION_PROPERTY prop = { 0, };
    HRESULT hr;

    if (exceptions == whpx->exception_exit_bitmap) {
        return S_OK;
    }

    prop.ExceptionExitBitmap = exceptions;

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExceptionExitBitmap,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (SUCCEEDED(hr)) {
        whpx->exception_exit_bitmap = exceptions;
    }

    return hr;
}
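
/*
 * Usage sketch (illustration only, not built): to intercept both
 * single-step traps and INT3 breakpoints, OR the per-exception-type bits
 * together. The partition property can only be changed while no vCPU is
 * running; see whpx_first_vcpu_starting() below.
 */
#if 0
    hr = whpx_set_exception_exit_bitmap(
        (1UL << WHvX64ExceptionTypeDebugTrapOrFault) |
        (1UL << WHvX64ExceptionTypeBreakpointTrap));
#endif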


/*
 * This function is called before/after stepping over a single instruction.
 * It will update the CPU registers to arm/disarm the instruction stepping
 * accordingly.
 */
static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
    bool set,
    uint64_t *exit_context_rflags)
{
    WHV_REGISTER_NAME reg_name;
    WHV_REGISTER_VALUE reg_value;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * If we are trying to step over a single instruction, we need to set the
     * TF bit in rflags. Otherwise, clear it.
     */
    reg_name = WHvX64RegisterRflags;
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
        return hr;
    }

    if (exit_context_rflags) {
        assert(*exit_context_rflags == reg_value.Reg64);
    }

    if (set) {
        /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
        reg_value.Reg64 |= TF_MASK;
    } else {
        reg_value.Reg64 &= ~TF_MASK;
    }

    if (exit_context_rflags) {
        *exit_context_rflags = reg_value.Reg64;
    }

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set rflags, hr=%08lx", hr);
        return hr;
    }

    reg_name = WHvRegisterInterruptState;
    reg_value.Reg64 = 0;

    /* Suspend delivery of hardware interrupts during single-stepping. */
    reg_value.InterruptState.InterruptShadow = set != 0;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set InterruptState, hr=%08lx", hr);
        return hr;
    }

    if (!set) {
        /*
         * We have just finished stepping over a single instruction,
         * and intercepted the INT1 generated by it.
         * We need to now hide the INT1 from the guest,
         * as it would not be expecting it.
         */

        reg_name = WHvX64RegisterPendingDebugException;
        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get pending debug exceptions,"
                         " hr=%08lx", hr);
            return hr;
        }

        if (reg_value.PendingDebugException.SingleStep) {
            reg_value.PendingDebugException.SingleStep = 0;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                &reg_name,
                1,
                &reg_value);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to clear pending debug exceptions,"
                             " hr=%08lx", hr);
                return hr;
            }
        }

    }

    return S_OK;
}
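
/*
 * Usage sketch (illustration only, not built): the arm/run/disarm sequence
 * that whpx_vcpu_run() below performs around each single-stepped
 * instruction.
 */
#if 0
    whpx_vcpu_configure_single_stepping(cpu, true, NULL);
    hr = whp_dispatch.WHvRunVirtualProcessor(
        whpx->partition, cpu->cpu_index,
        &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
    whpx_vcpu_configure_single_stepping(cpu, false,
                                        &vcpu->exit_ctx.VpContext.Rflags);
#endif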

/* Tries to find a breakpoint at the specified address. */
static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
{
    struct whpx_state *whpx = &whpx_global;
    int i;

    if (whpx->breakpoints.breakpoints) {
        for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
            if (address == whpx->breakpoints.breakpoints->data[i].address) {
                return &whpx->breakpoints.breakpoints->data[i];
            }
        }
    }

    return NULL;
}

/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;

/*
 * The WHPX QEMU backend implements breakpoints by writing the INT1
 * instruction into memory (ignoring the DRx registers). This raises a few
 * issues that need to be carefully handled:
 *
 * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
 *    at the same location, and later remove them in arbitrary order.
 *    This should not cause memory corruption, and should only remove the
 *    physical breakpoint instruction when the last QEMU breakpoint is gone.
 *
 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
 *    physical location. Hence, physically adding/removing a breakpoint can
 *    theoretically fail at any time. We need to keep track of it.
 *
 * The function below rebuilds a list of low-level breakpoints (one per
 * address, tracking the original instruction and any errors) from the list
 * of high-level breakpoints (set via cpu_breakpoint_insert()).
 *
 * In order to optimize performance, this function stores the list of
 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
 * low-level ones, so that it won't be re-invoked until these breakpoints
 * change.
 *
 * Note that this function decides which breakpoints should be inserted into
 * memory, but doesn't actually do it. The memory accessing is done in
 * whpx_apply_breakpoints().
 */
static void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *bp;
    int cpu_bp_index = 0;

    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);

    breakpoints->original_address_count = cpu_breakpoint_count;

    int max_breakpoints = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    struct whpx_breakpoint_collection *new_breakpoints =
        g_malloc0(sizeof(struct whpx_breakpoint_collection)
                  + max_breakpoints * sizeof(struct whpx_breakpoint));

    new_breakpoints->allocated = max_breakpoints;
    new_breakpoints->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        int i;
        for (i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                new_breakpoints->data[new_breakpoints->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        int i;
        bool found = false;

        /* This will be used to detect changed CPU breakpoints later. */
        breakpoints->original_addresses[cpu_bp_index++] = bp->pc;

        for (i = 0; i < new_breakpoints->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */

            if (new_breakpoints->data[i].address == bp->pc) {
                /* There was already a breakpoint at this address. */
                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    new_breakpoints->data[i].state = WHPX_BP_SET;
                } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
                    new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
                }

                found = true;
                break;
            }
        }

        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
            new_breakpoints->data[new_breakpoints->used].state =
                WHPX_BP_SET_PENDING;
            new_breakpoints->used++;
        }
    }

    /*
     * Free the previous breakpoint list. This can be optimized by keeping
     * it as a shadow buffer for the next computation instead of freeing
     * it immediately.
     */
    g_free(breakpoints->breakpoints);

    breakpoints->breakpoints = new_breakpoints;
}

/*
 * Physically inserts/removes the breakpoints by reading and writing the
 * physical memory, keeping track of the failed attempts.
 *
 * Passing resuming=true will try to set all previously unset breakpoints.
 * Passing resuming=false will remove all inserted ones.
 */
static void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int i, rc;
    if (!breakpoints) {
        return;
    }

    for (i = 0; i < breakpoints->used; i++) {
        /* Decide what to do right now based on the last known state. */
        WhpxBreakpointState state = breakpoints->data[i].state;
        switch (state) {
        case WHPX_BP_CLEARED:
            if (resuming) {
                state = WHPX_BP_SET_PENDING;
            }
            break;
        case WHPX_BP_SET_PENDING:
            if (!resuming) {
                state = WHPX_BP_CLEARED;
            }
            break;
        case WHPX_BP_SET:
            if (!resuming) {
                state = WHPX_BP_CLEAR_PENDING;
            }
            break;
        case WHPX_BP_CLEAR_PENDING:
            if (resuming) {
                state = WHPX_BP_SET;
            }
            break;
        }

        if (state == WHPX_BP_SET_PENDING) {
            /* Remember the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                false);

            if (!rc) {
                /* Write the breakpoint instruction. */
                rc = cpu_memory_rw_debug(cpu,
                    breakpoints->data[i].address,
                    (void *)&whpx_breakpoint_instruction,
                    1,
                    true);
            }

            if (!rc) {
                state = WHPX_BP_SET;
            }

        }

        if (state == WHPX_BP_CLEAR_PENDING) {
            /* Restore the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                true);

            if (!rc) {
                state = WHPX_BP_CLEARED;
            }
        }

        breakpoints->data[i].state = state;
    }
}
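
/*
 * Lifecycle sketch for the two helpers above (illustration only, not
 * built; `count` stands in for the current number of CPU breakpoints):
 * recompute the low-level list, commit it before any vCPU runs, and
 * withdraw it once all vCPUs have stopped. This is what
 * whpx_first_vcpu_starting() and whpx_last_vcpu_stopping() below do.
 */
#if 0
    whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, count);
    whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
    /* ... vCPUs run ... */
    whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, false);
#endif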

/*
 * This function is called when a VCPU is about to start and no other
 * VCPUs have been started so far. Since the VCPU start order could be
 * arbitrary, it doesn't have to be VCPU#0.
 *
 * It is used to commit the breakpoints into memory, and configure WHPX
 * to intercept debug exceptions.
 *
 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
 * more VCPUs are already running, so this is the best place to do it.
 */
static int whpx_first_vcpu_starting(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    g_assert(bql_locked());

    if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
            (whpx->breakpoints.breakpoints &&
             whpx->breakpoints.breakpoints->used)) {
        CPUBreakpoint *bp;
        int i = 0;
        bool update_pending = false;

        QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
            if (i >= whpx->breakpoints.original_address_count ||
                bp->pc != whpx->breakpoints.original_addresses[i]) {
                update_pending = true;
            }

            i++;
        }

        if (i != whpx->breakpoints.original_address_count) {
            update_pending = true;
        }

        if (update_pending) {
            /*
             * The CPU breakpoints have changed since the last call to
             * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
             * now be recomputed.
             */
            whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
        }

        /* Actually insert the breakpoints into the memory. */
        whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
    }

    uint64_t exception_mask;
    if (whpx->step_pending ||
        (whpx->breakpoints.breakpoints &&
         whpx->breakpoints.breakpoints->used)) {
        /*
         * We are either attempting to single-step one or more CPUs, or
         * have one or more breakpoints enabled. Both require intercepting
         * the WHvX64ExceptionTypeDebugTrapOrFault exception.
         */

        exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
    } else {
        /* Let the guest handle all exceptions. */
        exception_mask = 0;
    }

    hr = whpx_set_exception_exit_bitmap(exception_mask);
    if (!SUCCEEDED(hr)) {
        error_report("WHPX: Failed to update exception exit mask,"
                     " hr=%08lx.", hr);
        return 1;
    }

    return 0;
}

/*
 * This function is called when the last VCPU has finished running.
 * It is used to remove any previously set breakpoints from memory.
 */
static int whpx_last_vcpu_stopping(CPUState *cpu)
{
    whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
    return 0;
}

/* Returns the address of the next instruction that is about to be executed. */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
    if (cpu->accel->dirty) {
        /* The CPU registers have been modified by other parts of QEMU. */
        return cpu_env(cpu)->eip;
    } else if (exit_context_valid) {
        /*
         * The CPU registers have been modified neither by other parts of
         * QEMU, nor by this port via WHvSetVirtualProcessorRegisters().
         * This is the most common case.
         */
        AccelCPUState *vcpu = cpu->accel;
        return vcpu->exit_ctx.VpContext.Rip;
    } else {
        /*
         * The CPU registers have been modified by a call to
         * WHvSetVirtualProcessorRegisters() and must be re-queried from
         * the target.
         */
        WHV_REGISTER_VALUE reg_value;
        WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
        HRESULT hr;
        struct whpx_state *whpx = &whpx_global;

        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get PC, hr=%08lx", hr);
            return 0;
        }

        return reg_value.Reg64;
    }
}

static int whpx_handle_halt(CPUState *cpu)
{
    int ret = 0;

    bql_lock();
    if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
          (cpu_env(cpu)->eflags & IF_MASK)) &&
        !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }
    bql_unlock();

    return ret;
}

static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    bql_lock();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Set up the interrupt state if a new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
    }

    /* Sync the TPR to the CR8 if it was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }

    return;
}

static void whpx_vcpu_post_run(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
    if (vcpu->tpr != tpr) {
        vcpu->tpr = tpr;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;

    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;

    return;
}

static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;

    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }

    return;
}
1640
whpx_vcpu_run(CPUState * cpu)1641 static int whpx_vcpu_run(CPUState *cpu)
1642 {
1643 HRESULT hr;
1644 struct whpx_state *whpx = &whpx_global;
1645 AccelCPUState *vcpu = cpu->accel;
1646 struct whpx_breakpoint *stepped_over_bp = NULL;
1647 WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1648 int ret;
1649
1650 g_assert(bql_locked());
1651
1652 if (whpx->running_cpus++ == 0) {
1653 /* Insert breakpoints into memory, update exception exit bitmap. */
1654 ret = whpx_first_vcpu_starting(cpu);
1655 if (ret != 0) {
1656 return ret;
1657 }
1658 }
1659
1660 if (whpx->breakpoints.breakpoints &&
1661 whpx->breakpoints.breakpoints->used > 0)
1662 {
1663 uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1664 stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1665 if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1666 stepped_over_bp = NULL;
1667 }
1668
1669 if (stepped_over_bp) {
1670 /*
1671 * We are trying to run the instruction overwritten by an active
1672 * breakpoint. We will temporarily disable the breakpoint, suspend
1673 * other CPUs, and step over the instruction.
1674 */
1675 exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1676 }
1677 }
1678
1679 if (exclusive_step_mode == WHPX_STEP_NONE) {
1680 whpx_vcpu_process_async_events(cpu);
1681 if (cpu->halted && !whpx_apic_in_platform()) {
1682 cpu->exception_index = EXCP_HLT;
1683 qatomic_set(&cpu->exit_request, false);
1684 return 0;
1685 }
1686 }
1687
1688 bql_unlock();
1689
1690 if (exclusive_step_mode != WHPX_STEP_NONE) {
1691 start_exclusive();
1692 g_assert(cpu == current_cpu);
1693 g_assert(!cpu->running);
1694 cpu->running = true;
1695
1696 hr = whpx_set_exception_exit_bitmap(
1697 1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1698 if (!SUCCEEDED(hr)) {
1699 error_report("WHPX: Failed to update exception exit mask, "
1700 "hr=%08lx.", hr);
1701 return 1;
1702 }
1703
1704 if (stepped_over_bp) {
1705 /* Temporarily disable the triggered breakpoint. */
1706 cpu_memory_rw_debug(cpu,
1707 stepped_over_bp->address,
1708 &stepped_over_bp->original_instruction,
1709 1,
1710 true);
1711 }
1712 } else {
1713 cpu_exec_start(cpu);
1714 }
1715
1716 do {
1717 if (cpu->accel->dirty) {
1718 whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1719 cpu->accel->dirty = false;
1720 }
1721
1722 if (exclusive_step_mode == WHPX_STEP_NONE) {
1723 whpx_vcpu_pre_run(cpu);
1724
1725 if (qatomic_read(&cpu->exit_request)) {
1726 whpx_vcpu_kick(cpu);
1727 }
1728 }
1729
1730 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1731 whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1732 }
1733
1734 hr = whp_dispatch.WHvRunVirtualProcessor(
1735 whpx->partition, cpu->cpu_index,
1736 &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1737
1738 if (FAILED(hr)) {
1739 error_report("WHPX: Failed to exec a virtual processor,"
1740 " hr=%08lx", hr);
1741 ret = -1;
1742 break;
1743 }
1744
1745 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1746 whpx_vcpu_configure_single_stepping(cpu,
1747 false,
1748 &vcpu->exit_ctx.VpContext.Rflags);
1749 }
1750
1751 whpx_vcpu_post_run(cpu);
1752
1753 switch (vcpu->exit_ctx.ExitReason) {
1754 case WHvRunVpExitReasonMemoryAccess:
1755 ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1756 break;
1757
1758 case WHvRunVpExitReasonX64IoPortAccess:
1759 ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1760 break;
1761
1762 case WHvRunVpExitReasonX64InterruptWindow:
1763 vcpu->ready_for_pic_interrupt = 1;
1764 vcpu->window_registered = 0;
1765 ret = 0;
1766 break;
1767
1768 case WHvRunVpExitReasonX64ApicEoi:
1769 assert(whpx_apic_in_platform());
1770 ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1771 break;
1772
1773 case WHvRunVpExitReasonX64Halt:
1774 /*
1775 * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1776 * longer used.
1777 */
1778 ret = whpx_handle_halt(cpu);
1779 break;
1780
1781 case WHvRunVpExitReasonX64ApicInitSipiTrap: {
            WHV_INTERRUPT_CONTROL ipi = {0};
            uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
            uint32_t delivery_mode =
                (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
            int dest_shorthand =
                (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
            bool broadcast = false;
            bool include_self = false;
            uint32_t i;

            /* We only registered for INIT and SIPI exits. */
            if ((delivery_mode != APIC_DM_INIT) &&
                (delivery_mode != APIC_DM_SIPI)) {
                error_report(
                    "WHPX: Unexpected APIC exit that is not an INIT or SIPI");
                break;
            }

            if (delivery_mode == APIC_DM_INIT) {
                ipi.Type = WHvX64InterruptTypeInit;
            } else {
                ipi.Type = WHvX64InterruptTypeSipi;
            }

            ipi.DestinationMode =
                ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
                    WHvX64InterruptDestinationModeLogical :
                    WHvX64InterruptDestinationModePhysical;

            ipi.TriggerMode =
                ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
                    WHvX64InterruptTriggerModeLevel :
                    WHvX64InterruptTriggerModeEdge;

            ipi.Vector = icr & APIC_VECTOR_MASK;
            switch (dest_shorthand) {
            /* no shorthand. Bits 56-63 contain the destination. */
            case 0:
                ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                                                      &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report("WHPX: Failed to request interrupt hr=%08lx",
                                 hr);
                }

                break;

            /* self */
            case 1:
                include_self = true;
                break;

            /* broadcast, including self */
            case 2:
                broadcast = true;
                include_self = true;
                break;

            /* broadcast, excluding self */
            case 3:
                broadcast = true;
                break;
            }

            if (!broadcast && !include_self) {
                break;
            }

            for (i = 0; i <= max_vcpu_index; i++) {
                if (i == cpu->cpu_index && !include_self) {
                    continue;
                }

                /*
                 * Assuming that APIC IDs are identity mapped since the
                 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
                 * are not handled yet and the hypervisor doesn't allow the
                 * guest to modify the APIC ID.
                 */
                ipi.Destination = i;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                                                      &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report(
                        "WHPX: Failed to request SIPI for %d, hr=%08lx",
                        i, hr);
                }
            }

            break;
        }

        case WHvRunVpExitReasonCanceled:
            if (exclusive_step_mode != WHPX_STEP_NONE) {
                /*
                 * We are trying to step over a single instruction, and
                 * likely got a request to stop from another thread.
                 * Delay it until we are done stepping over.
                 */
                ret = 0;
            } else {
                cpu->exception_index = EXCP_INTERRUPT;
                ret = 1;
            }
            break;
        case WHvRunVpExitReasonX64MsrAccess: {
            WHV_REGISTER_VALUE reg_values[3] = {0};
            WHV_REGISTER_NAME reg_names[3];
            UINT32 reg_count;

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRdx;

            reg_values[0].Reg64 =
                vcpu->exit_ctx.VpContext.Rip +
                vcpu->exit_ctx.VpContext.InstructionLength;

            /*
             * For all unsupported MSR accesses we ignore writes and return 0
             * on reads.
             */
            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
                        1 : 3;

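            /*
             * reg_values[1] and reg_values[2] stay zero-initialized, so an
             * unsupported RDMSR reads back 0 in EDX:EAX; for WRMSR only RIP
             * is advanced past the instruction (reg_count == 1).
             */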
            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                reg_names, reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set MsrAccess state "
                             "registers, hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonX64Cpuid: {
            WHV_REGISTER_VALUE reg_values[5];
            WHV_REGISTER_NAME reg_names[5];
            UINT32 reg_count = 5;
            UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
            X86CPU *x86_cpu = X86_CPU(cpu);
            CPUX86State *env = &x86_cpu->env;

            memset(reg_values, 0, sizeof(reg_values));

            rip = vcpu->exit_ctx.VpContext.Rip +
                  vcpu->exit_ctx.VpContext.InstructionLength;
            cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;

            /*
             * Ideally, these should be supplied to the hypervisor during VCPU
             * initialization and it should be able to satisfy this request.
             * But, currently, WHPX doesn't support setting CPUID values in the
             * hypervisor once the partition has been set up, which is too late
             * since VCPUs are realized later. For now, use the values from
             * QEMU to satisfy these requests, until WHPX adds support for
             * being able to set these values in the hypervisor at runtime.
             */
            cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
                          (UINT32 *)&rcx, (UINT32 *)&rdx);
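
            /*
             * Leaves 0x40000000 and 0x40000010 implement the VMware-style
             * timing interface: 0x40000000 advertises 0x40000010 as the
             * highest hypervisor leaf, and 0x40000010 returns the TSC
             * frequency in kHz in EAX and the APIC bus frequency in kHz
             * in EBX.
             */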
            switch (cpuid_fn) {
            case 0x40000000:
                /* Expose the vmware cpu frequency cpuid leaf */
                rax = 0x40000010;
                rbx = rcx = rdx = 0;
                break;

            case 0x40000010:
                rax = env->tsc_khz;
                rbx = env->apic_bus_freq / 1000; /* Hz to kHz */
                rcx = rdx = 0;
                break;

            case 0x80000001:
                /* Remove any support of OSVW */
                rcx &= ~CPUID_EXT3_OSVW;
                break;
            }

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRcx;
            reg_names[3] = WHvX64RegisterRdx;
            reg_names[4] = WHvX64RegisterRbx;

            reg_values[0].Reg64 = rip;
            reg_values[1].Reg64 = rax;
            reg_values[2].Reg64 = rcx;
            reg_values[3].Reg64 = rdx;
            reg_values[4].Reg64 = rbx;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition, cpu->cpu_index,
                reg_names,
                reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set CpuidAccess state registers,"
                             " hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonException:
            whpx_get_registers(cpu);

            if ((vcpu->exit_ctx.VpException.ExceptionType ==
                 WHvX64ExceptionTypeDebugTrapOrFault) &&
                (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
                (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
                 whpx_breakpoint_instruction)) {
                /* Stopped at a software breakpoint. */
                cpu->exception_index = EXCP_DEBUG;
            } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
                        WHvX64ExceptionTypeDebugTrapOrFault) &&
                       !cpu->singlestep_enabled) {
                /*
                 * We just finished stepping over a breakpoint, but gdb does
                 * not expect us to be single-stepping, so don't do anything
                 * special.
                 */
                cpu->exception_index = EXCP_INTERRUPT;
            } else {
                /* Another exception or debug event. Report it to GDB. */
                cpu->exception_index = EXCP_DEBUG;
            }

            ret = 1;
            break;
        case WHvRunVpExitReasonNone:
        case WHvRunVpExitReasonUnrecoverableException:
        case WHvRunVpExitReasonInvalidVpRegisterValue:
        case WHvRunVpExitReasonUnsupportedFeature:
        default:
            error_report("WHPX: Unexpected VP exit code %d",
                         vcpu->exit_ctx.ExitReason);
            whpx_get_registers(cpu);
            bql_lock();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            bql_unlock();
            break;
        }

    } while (!ret);

    if (stepped_over_bp) {
        /* Restore the breakpoint we stepped over */
        cpu_memory_rw_debug(cpu,
                            stepped_over_bp->address,
                            (void *)&whpx_breakpoint_instruction,
                            1,
                            true);
    }

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        g_assert(cpu_in_exclusive_context(cpu));
        cpu->running = false;
        end_exclusive();

        exclusive_step_mode = WHPX_STEP_NONE;
    } else {
        cpu_exec_end(cpu);
    }

    bql_lock();
    current_cpu = cpu;

    if (--whpx->running_cpus == 0) {
        whpx_last_vcpu_stopping(cpu);
    }

    qatomic_set(&cpu->exit_request, false);

    return ret < 0;
}

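/*
 * The accel->dirty flag tracks which copy of the vCPU state is
 * authoritative. When true, QEMU's CPUState holds the latest state and must
 * be pushed to the hypervisor before the vCPU runs; when false, the
 * hypervisor holds it and it must be fetched before QEMU may read or modify
 * it.
 */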
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (!cpu->accel->dirty) {
        whpx_get_registers(cpu);
        cpu->accel->dirty = true;
    }
}

static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->accel->dirty = false;
}

static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->accel->dirty = false;
}

static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->accel->dirty = true;
}

/*
 * CPU support.
 */

void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (!cpu->accel->dirty) {
        run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
    }
}

void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}

void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}

void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}

void whpx_cpu_synchronize_pre_resume(bool step_pending)
{
    whpx_global.step_pending = step_pending;
}

/*
 * Vcpu support.
 */

static Error *whpx_migration_blocker;

static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (running) {
        env->tsc_valid = false;
    }
}

int whpx_init_vcpu(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = NULL;
    Error *local_error = NULL;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    UINT64 freq = 0;
    int ret;

    /*
     * Add a migration blocker for all unsupported features of the
     * Windows Hypervisor Platform.
     */
    if (whpx_migration_blocker == NULL) {
        error_setg(&whpx_migration_blocker,
                   "State blocked due to non-migratable CPUID feature "
                   "support, dirty memory tracking support, and XSAVE/XRSTOR "
                   "support");

        if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu = g_new0(AccelCPUState, 1);

    hr = whp_dispatch.WHvEmulatorCreateEmulator(
        &whpx_emu_callbacks,
        &vcpu->emulator);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set up instruction completion support,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreateVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create a virtual processor,"
                     " hr=%08lx", hr);
        whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
        ret = -EINVAL;
        goto error;
    }

    /*
     * The vCPU's TSC frequency is either specified by the user or, failing
     * that, taken from Hyper-V. In the latter case, we query it from Hyper-V
     * and record it in env->tsc_khz, so that the vCPU's TSC frequency can be
     * migrated later via this field.
     */
    if (!env->tsc_khz) {
        hr = whp_dispatch.WHvGetCapability(
            WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
            NULL);
        if (hr != WHV_E_UNKNOWN_CAPABILITY) {
            if (FAILED(hr)) {
                printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
            } else {
                env->tsc_khz = freq / 1000; /* Hz to kHz */
            }
        }
    }

    env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
    if (hr != WHV_E_UNKNOWN_CAPABILITY) {
        if (FAILED(hr)) {
            printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
        } else {
            env->apic_bus_freq = freq;
        }
    }

    /*
     * If the vmware cpuid frequency leaf option is set, and we have a valid
     * tsc value, trap the corresponding cpuid leaves.
     */
    if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
        UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};

        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeCpuidExitList,
            cpuidExitList,
            RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                         hr);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu->interruptable = true;
    vcpu->dirty = true;
    cpu->accel = vcpu;
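    /*
     * Remember the highest vCPU index created so far, so that INIT/SIPI
     * broadcasts in whpx_vcpu_run() can iterate over every vCPU.
     */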
    max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
    qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);

    return 0;

error:
    g_free(vcpu);

    return ret;
}

int whpx_vcpu_exec(CPUState *cpu)
{
    int ret;
    int fatal;

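    /*
     * Re-enter the vCPU until whpx_vcpu_run() leaves an exit for the caller
     * to handle (cpu->exception_index >= EXCP_INTERRUPT). A fatal failure to
     * run the virtual processor aborts QEMU.
     */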
    for (;;) {
        if (cpu->exception_index >= EXCP_INTERRUPT) {
            ret = cpu->exception_index;
            cpu->exception_index = -1;
            break;
        }

        fatal = whpx_vcpu_run(cpu);

        if (fatal) {
            error_report("WHPX: Failed to exec a virtual processor");
            abort();
        }
    }

    return ret;
}

void whpx_destroy_vcpu(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;

    whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
    g_free(cpu->accel);
}

void whpx_vcpu_kick(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    whp_dispatch.WHvCancelRunVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
}

/*
 * Memory support.
 */

static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    /*
    if (add) {
        printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
               (void*)start_pa, (void*)size, host_va,
               (rom ? "ROM" : "RAM"), name);
    } else {
        printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n",
               (void*)start_pa, (void*)size, host_va, name);
    }
    */

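    /*
     * ROM regions are mapped without the write flag, so guest writes to them
     * fault back to QEMU as memory-access exits instead of modifying the
     * backing RAM.
     */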
    if (add) {
        hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
                                         host_va,
                                         start_pa,
                                         size,
                                         (WHvMapGpaRangeFlagRead |
                                          WHvMapGpaRangeFlagExecute |
                                          (rom ? 0 : WHvMapGpaRangeFlagWrite)));
    } else {
        hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
                                           start_pa,
                                           size);
    }

    if (FAILED(hr)) {
        error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                     " Host:%p, hr=%08lx",
                     (add ? "MAP" : "UNMAP"), name,
                     (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
    }
}

static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int delta;
    uint64_t host_va;

    if (!memory_region_is_ram(mr)) {
        return;
    }

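    /*
     * GPA mappings are host-page granular: round the start of the section up
     * to a host page boundary, trim the size down to one, and skip the
     * section if nothing page-aligned remains.
     */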
    delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
    delta &= ~qemu_real_host_page_mask();
    if (delta > size) {
        return;
    }
    start_pa += delta;
    size -= delta;
    size &= qemu_real_host_page_mask();
    if (!size || (start_pa & ~qemu_real_host_page_mask())) {
        return;
    }

    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + delta;

    whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}

static void whpx_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}

static void whpx_region_del(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}

static void whpx_transaction_begin(MemoryListener *listener)
{
}

static void whpx_transaction_commit(MemoryListener *listener)
{
}

static void whpx_log_sync(MemoryListener *listener,
                          MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!memory_region_is_ram(mr)) {
        return;
    }

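    /*
     * WHPX does not expose dirty page tracking, so conservatively mark the
     * entire section as dirty on every sync.
     */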
    memory_region_set_dirty(mr, 0, int128_get64(section->size));
}

static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
};

static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}

/*
 * Load the given function list from the WHPX libraries. If *handle is
 * non-NULL, that module handle is used; otherwise the library is opened
 * and *handle is updated with the new handle on success.
 */
static bool load_whp_dispatch_fns(HMODULE *handle,
                                  WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

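    /*
     * X-macro loader: each LIST_* macro below expands its argument once per
     * WHPX API, resolving the entry point at runtime with GetProcAddress().
     * This keeps QEMU loadable on hosts without the WHPX libraries.
     */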
#define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
#define WINHV_EMULATION_DLL "WinHvEmulation.dll"
#define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
    whp_dispatch.function_name = \
        (function_name ## _t)GetProcAddress(hLib, #function_name); \

#define WHP_LOAD_FIELD(return_type, function_name, signature) \
    whp_dispatch.function_name = \
        (function_name ## _t)GetProcAddress(hLib, #function_name); \
    if (!whp_dispatch.function_name) { \
        error_report("Could not load function %s", #function_name); \
        goto error; \
    } \

#define WHP_LOAD_LIB(lib_name, handle_lib) \
    if (!handle_lib) { \
        handle_lib = LoadLibrary(lib_name); \
        if (!handle_lib) { \
            error_report("Could not load library %s.", lib_name); \
            goto error; \
        } \
    } \

    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    if (hLib) {
        FreeLibrary(hLib);
    }

    return false;
}

static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
                                    const char *name, void *opaque,
                                    Error **errp)
{
    struct whpx_state *whpx = &whpx_global;
    OnOffSplit mode;

    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
        return;
    }

    switch (mode) {
    case ON_OFF_SPLIT_ON:
        whpx->kernel_irqchip_allowed = true;
        whpx->kernel_irqchip_required = true;
        break;

    case ON_OFF_SPLIT_OFF:
        whpx->kernel_irqchip_allowed = false;
        whpx->kernel_irqchip_required = false;
        break;

    case ON_OFF_SPLIT_SPLIT:
        error_setg(errp, "WHPX: split irqchip currently not supported");
        error_append_hint(errp,
            "Try without kernel-irqchip or with kernel-irqchip=on|off");
        break;

    default:
        /*
         * The value was checked in visit_type_OnOffSplit() above. If
         * we get here, then something is wrong in QEMU.
         */
        abort();
    }
}

/*
 * Partition support
 */

static int whpx_accel_init(MachineState *ms)
{
    struct whpx_state *whpx;
    int ret;
    HRESULT hr;
    WHV_CAPABILITY whpx_cap;
    UINT32 whpx_cap_size;
    WHV_PARTITION_PROPERTY prop;
    UINT32 cpuidExitList[] = {1, 0x80000001};
    WHV_CAPABILITY_FEATURES features = {0};

    whpx = &whpx_global;

    if (!init_whp_dispatch()) {
        ret = -ENOSYS;
        goto error;
    }

    whpx->mem_quota = ms->ram_size;

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeHypervisorPresent, &whpx_cap,
        sizeof(whpx_cap), &whpx_cap_size);
    if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
        error_report("WHPX: No accelerator found, hr=%08lx", hr);
        ret = -ENOSPC;
        goto error;
    }

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Query the XSAVE capability of the partition. Any error here is not
     * considered fatal.
     */
    hr = whp_dispatch.WHvGetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorXsaveFeatures,
        &whpx_xsave_cap,
        sizeof(whpx_xsave_cap),
        &whpx_cap_size);

    /*
     * Windows versions that don't support this property return the specific
     * error code WHV_E_UNKNOWN_PROPERTY.
     */
    if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
        error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
    }

    if (!whpx_has_xsave()) {
        printf("WHPX: Partition is not XSAVE capable\n");
    }

    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ProcessorCount = ms->smp.cpus;
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorCount,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition processor count to %u,"
                     " hr=%08lx", prop.ProcessorCount, hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Error out if WHP doesn't support APIC emulation and the user requires
     * it.
     */
    if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
            !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
        error_report("WHPX: kernel irqchip requested, but unavailable. "
                     "Try without kernel-irqchip or with kernel-irqchip=off");
        ret = -EINVAL;
        goto error;
    }

    if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
        whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
        WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
            WHvX64LocalApicEmulationModeXApic;
        printf("WHPX: setting APIC emulation mode in the hypervisor\n");
        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeLocalApicEmulationMode,
            &mode,
            sizeof(mode));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
            if (whpx->kernel_irqchip_required) {
                error_report("WHPX: kernel irqchip requested, but unavailable");
                ret = -EINVAL;
                goto error;
            }
        } else {
            whpx->apic_in_platform = true;
        }
    }

    /* Register for MSR and CPUID exits */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ExtendedVmExits.X64MsrExit = 1;
    prop.ExtendedVmExits.X64CpuidExit = 1;
    prop.ExtendedVmExits.ExceptionExit = 1;
    if (whpx_apic_in_platform()) {
        prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExtendedVmExits,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to enable MSR & CPUID exit, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeCpuidExitList,
        cpuidExitList,
        RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                     hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * We do not want to intercept any exceptions from the guest until we
     * actually start debugging with gdb.
     */
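    /*
     * exception_exit_bitmap starts out as -1, a sentinel "never set" value,
     * so that this first whpx_set_exception_exit_bitmap(0) call is not
     * skipped as a no-op (the helper is assumed to cache the last value it
     * programmed).
     */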
    whpx->exception_exit_bitmap = -1;
    hr = whpx_set_exception_exit_bitmap(0);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetupPartition(whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    whpx_memory_init();

    printf("Windows Hypervisor Platform accelerator is operational\n");
    return 0;

error:

    if (NULL != whpx->partition) {
        whp_dispatch.WHvDeletePartition(whpx->partition);
        whpx->partition = NULL;
    }

    return ret;
}

int whpx_enabled(void)
{
    return whpx_allowed;
}

bool whpx_apic_in_platform(void)
{
    return whpx_global.apic_in_platform;
}

static void whpx_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "WHPX";
    ac->init_machine = whpx_accel_init;
    ac->allowed = &whpx_allowed;

    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
        NULL, whpx_set_kernel_irqchip,
        NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure WHPX in-kernel irqchip");
}

static void whpx_accel_instance_init(Object *obj)
{
    struct whpx_state *whpx = &whpx_global;

    memset(whpx, 0, sizeof(struct whpx_state));
    /* Turn on kernel-irqchip by default */
    whpx->kernel_irqchip_allowed = true;
}

static const TypeInfo whpx_accel_type = {
    .name = ACCEL_CLASS_NAME("whpx"),
    .parent = TYPE_ACCEL,
    .instance_init = whpx_accel_instance_init,
    .class_init = whpx_accel_class_init,
};

static void whpx_type_init(void)
{
    type_register_static(&whpx_accel_type);
}

bool init_whp_dispatch(void)
{
    if (whp_dispatch_initialized) {
        return true;
    }

    if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
        goto error;
    }

    if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
        goto error;
    }

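    /*
     * Supplemental functions are resolved with WHP_LOAD_FIELD_OPTIONAL and
     * may legitimately be absent; this call can only fail if the library
     * itself cannot be loaded, which the calls above have already ruled out.
     */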
    assert(load_whp_dispatch_fns(&hWinHvPlatform,
                                 WINHV_PLATFORM_FNS_SUPPLEMENTAL));
    whp_dispatch_initialized = true;

    return true;
error:
    if (hWinHvPlatform) {
        FreeLibrary(hWinHvPlatform);
    }

    if (hWinHvEmulation) {
        FreeLibrary(hWinHvEmulation);
    }

    return false;
}

type_init(whpx_type_init);