1 /*
2 * QEMU Windows Hypervisor Platform accelerator (WHPX)
3 *
4 * Copyright Microsoft Corp. 2017
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2 or later.
7 * See the COPYING file in the top-level directory.
8 *
9 */
10
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "system/address-spaces.h"
14 #include "system/ioport.h"
15 #include "gdbstub/helpers.h"
16 #include "qemu/accel.h"
17 #include "accel/accel-ops.h"
18 #include "system/whpx.h"
19 #include "system/cpus.h"
20 #include "system/runstate.h"
21 #include "qemu/main-loop.h"
22 #include "hw/boards.h"
23 #include "hw/intc/ioapic.h"
24 #include "hw/i386/apic_internal.h"
25 #include "qemu/error-report.h"
26 #include "qapi/error.h"
27 #include "qapi/qapi-types-common.h"
28 #include "qapi/qapi-visit-common.h"
29 #include "migration/blocker.h"
30 #include "host-cpu.h"
31 #include "accel/accel-cpu-target.h"
32 #include <winerror.h>
33
34 #include "whpx-internal.h"
35 #include "whpx-accel-ops.h"
36
37 #include <winhvplatform.h>
38 #include <winhvemulation.h>
39
40 #define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)
41
/*
 * Registers synchronized between QEMU's CPUX86State and the hypervisor's
 * virtual processor, in a single Get/SetVirtualProcessorRegisters call.
 *
 * NOTE: the order of this array is a hard contract: whpx_set_registers()
 * and whpx_get_registers() walk it with a running index and assert() that
 * each position holds the expected WHV_REGISTER_NAME. Any addition or
 * reordering here must be mirrored in both functions.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers - intentionally not synchronized */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers - intentionally not synchronized */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};
149
/*
 * One value slot per entry of whpx_register_names[]; used as the payload
 * buffer for bulk register get/set calls.
 */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
153
154 /*
155 * The current implementation of instruction stepping sets the TF flag
156 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
157 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
158 *
159 * This approach has a few limitations:
160 * 1. Stepping over a PUSHF/SAHF instruction will save the TF flag
161 * along with the other flags, possibly restoring it later. It would
162 * result in another INT1 when the flags are restored, triggering
163 * a stop in gdb that could be cleared by doing another step.
164 *
165 * Stepping over a POPF/LAHF instruction will let it overwrite the
166 * TF flags, ending the stepping mode.
167 *
168 * 2. Stepping over an instruction raising an exception (e.g. INT, DIV,
169 * or anything that could result in a page fault) will save the flags
170 * to the stack, clear the TF flag, and let the guest execute the
171 * handler. Normally, the guest will restore the original flags,
172 * that will continue single-stepping.
173 *
174 * 3. Debuggers running on the guest may wish to set TF to do instruction
175 * stepping. INT1 events generated by it would be intercepted by us,
176 * as long as the gdb is connected to QEMU.
177 *
178 * In practice this means that:
179 * 1. Stepping through flags-modifying instructions may cause gdb to
180 * continue or stop in unexpected places. This will be fully recoverable
181 * and will not crash the target.
182 *
183 * 2. Stepping over an instruction that triggers an exception will step
184 * over the exception handler, not into it.
185 *
186 * 3. Debugging the guest via gdb, while running debugger on the guest
187 * at the same time may lead to unexpected effects. Removing all
188 * breakpoints set via QEMU will prevent any further interference
189 * with the guest-level debuggers.
190 *
191 * The limitations can be addressed as shown below:
192 * 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
193 * stepping through them. The exact semantics of the instructions is
194 * defined in the "Combined Volume Set of Intel 64 and IA-32
195 * Architectures Software Developer's Manuals", however it involves a
196 * fair amount of corner cases due to compatibility with real mode,
197 * virtual 8086 mode, and differences between 64-bit and 32-bit modes.
198 *
199 * 2. We could step into the guest's exception handlers using the following
200 * sequence:
201 * a. Temporarily enable catching of all exception types via
202 * whpx_set_exception_exit_bitmap().
203 * b. Once an exception is intercepted, read the IDT/GDT and locate
204 * the original handler.
205 * c. Patch the original handler, injecting an INT3 at the beginning.
206 * d. Update the exception exit bitmap to only catch the
207 * WHvX64ExceptionTypeBreakpointTrap exception.
208 * e. Let the affected CPU run in the exclusive mode.
209 * f. Restore the original handler and the exception exit bitmap.
210 * Note that handling all corner cases related to IDT/GDT is harder
211 * than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
212 * rough idea.
213 *
214 * 3. In order to properly support guest-level debugging in parallel with
215 * the QEMU-level debugging, we would need to be able to pass some INT1
216 * events to the guest. This could be done via the following methods:
217 * a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
218 * it seems to only work for interrupts and not software
219 * exceptions.
220 * b. Locating and patching the original handler by parsing IDT/GDT.
221 * This involves relatively complex logic outlined in the previous
222 * paragraph.
223 * c. Emulating the exception invocation (i.e. manually updating RIP,
224 * RFLAGS, and pushing the old values to stack). This is even more
225 * complicated than the previous option, since it involves checking
226 * CPL, gate attributes, and doing various adjustments depending
227 * on the current CPU mode, whether the CPL is changing, etc.
228 */
/* How a single-step request should be carried out (see the note above). */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,     /* no stepping requested */
    /* Halt other VCPUs while the stepped VCPU runs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
234
/* Per-vCPU accelerator state, reachable via CPUState::accel. */
struct AccelCPUState {
    WHV_EMULATOR_HANDLE emulator;     /* WinHvEmulation instance for MMIO/PIO */
    /* NOTE(review): presumably set once an interrupt-window exit has been
     * requested from the hypervisor - confirm against the vcpu run loop. */
    bool window_registered;
    /* NOTE(review): appears to track whether interrupts can currently be
     * injected - confirm against the vcpu run loop. */
    bool interruptable;
    bool ready_for_pic_interrupt;
    uint64_t tpr;          /* cached TPR, in CR8 encoding (whpx_apic_tpr_to_cr8) */
    uint64_t apic_base;    /* cached APIC base MSR value */
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
247
/* True when the WHPX accelerator has been selected/initialized. */
bool whpx_allowed;
/* One-time guard for resolving the WinHv* DLL entry points. */
static bool whp_dispatch_initialized;
/* Module handles for the WinHvPlatform / WinHvEmulation DLLs
 * (loaded elsewhere in this file). */
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
/* XSAVE capabilities reported by the hypervisor; see whpx_has_xsave(). */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

/* Global partition state shared by all vCPUs. */
struct whpx_state whpx_global;
/* Dynamically resolved WinHv* function pointers. */
struct WHPDispatch whp_dispatch;
256
whpx_has_xsave(void)257 static bool whpx_has_xsave(void)
258 {
259 return whpx_xsave_cap.XsaveSupport;
260 }
261
whpx_seg_q2h(const SegmentCache * qs,int v86,int r86)262 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
263 int r86)
264 {
265 WHV_X64_SEGMENT_REGISTER hs;
266 unsigned flags = qs->flags;
267
268 hs.Base = qs->base;
269 hs.Limit = qs->limit;
270 hs.Selector = qs->selector;
271
272 if (v86) {
273 hs.Attributes = 0;
274 hs.SegmentType = 3;
275 hs.Present = 1;
276 hs.DescriptorPrivilegeLevel = 3;
277 hs.NonSystemSegment = 1;
278
279 } else {
280 hs.Attributes = (flags >> DESC_TYPE_SHIFT);
281
282 if (r86) {
283 /* hs.Base &= 0xfffff; */
284 }
285 }
286
287 return hs;
288 }
289
whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER * hs)290 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
291 {
292 SegmentCache qs;
293
294 qs.base = hs->Base;
295 qs.limit = hs->Limit;
296 qs.selector = hs->Selector;
297
298 qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
299
300 return qs;
301 }
302
303 /* X64 Extended Control Registers */
whpx_set_xcrs(CPUState * cpu)304 static void whpx_set_xcrs(CPUState *cpu)
305 {
306 HRESULT hr;
307 struct whpx_state *whpx = &whpx_global;
308 WHV_REGISTER_VALUE xcr0;
309 WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
310
311 if (!whpx_has_xsave()) {
312 return;
313 }
314
315 /* Only xcr0 is supported by the hypervisor currently */
316 xcr0.Reg64 = cpu_env(cpu)->xcr0;
317 hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
318 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
319 if (FAILED(hr)) {
320 error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
321 }
322 }
323
/*
 * Write QEMU's TSC value into the virtual processor.
 * Returns 0 on success, -1 if the hypervisor rejected the write.
 */
static int whpx_set_tsc(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME name = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE value;
    HRESULT hr;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            /*
             * Failing to suspend is not fatal: it merely increases the
             * likelihood of TSC skew between vCPUs, which many guest OSes
             * handle just fine.
             */
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    value.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
        return -1;
    }

    return 0;
}
359
360 /*
361 * The CR8 register in the CPU is mapped to the TPR register of the APIC,
362 * however, they use a slightly different encoding. Specifically:
363 *
364 * APIC.TPR[bits 7:4] = CR8[bits 3:0]
365 *
366 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
367 * and IA-32 Architectures Software Developer's Manual.
368 *
369 * The functions below translate the value of CR8 to TPR and vice versa.
370 */
371
/* Translate an APIC TPR value to its CR8 encoding: CR8[3:0] = TPR[7:4]. */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr / 16;
}
376
/* Translate a CR8 value to its APIC TPR encoding: TPR[7:4] = CR8[3:0]. */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 * 16;
}
381
/*
 * Push the QEMU CPU state (GPRs, segments, tables, control registers,
 * FPU/XMM state and MSRs) into the hypervisor's virtual processor.
 *
 * 'level' selects the depth of the update: the TSC, which is expensive to
 * set and has guest-visible side effects, is only written for full state
 * updates (level >= WHPX_SET_RESET_STATE).
 *
 * The index walk through vcxt.values[] must stay in lockstep with
 * whpx_register_names[]; the asserts below verify each position.
 */
static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    /* Only safe while the vCPU is stopped or from its own thread. */
    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    /* Segment translation depends on virtual-8086 / real mode. */
    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    /* Refresh the cached TPR (CR8 encoding) and APIC base from the APIC. */
    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    /* CR8 carries the TPR value cached above. */
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    /* Re-insert the FPU stack-top (fpstt) into status word bits 13:11. */
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        /* QEMU fptags uses inverted polarity vs. the abridged tag byte. */
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* Every slot of whpx_register_names[] must have been filled in. */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}
555
/*
 * Read the virtual processor's TSC into QEMU's CPU state.
 * Returns 0 on success, -1 if the hypervisor call failed.
 */
static int whpx_get_tsc(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME name = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE value;
    HRESULT hr;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
        return -1;
    }

    cpu_env(cpu)->tsc = value.Reg64;
    return 0;
}
573
574 /* X64 Extended Control Registers */
/*
 * X64 Extended Control Registers: fetch xcr0 from the hypervisor into
 * QEMU's CPU state. No-op when the hypervisor does not support XSAVE.
 */
static void whpx_get_xcrs(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME name = WHvX64RegisterXCr0;
    WHV_REGISTER_VALUE value;
    HRESULT hr;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
        return;
    }

    cpu_env(cpu)->xcr0 = value.Reg64;
}
596
/*
 * Pull the virtual processor state (GPRs, segments, tables, control
 * registers, FPU/XMM state and MSRs) from the hypervisor into QEMU's
 * CPUX86State, then refresh the derived hflags.
 *
 * The index walk through vcxt.values[] must stay in lockstep with
 * whpx_register_names[]; the asserts below verify each position.
 */
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    /* Only safe while the vCPU is stopped or from its own thread. */
    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /* Re-read the TSC only when the cached value has been invalidated. */
    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    /* Propagate a CR8 change to the APIC as a new TPR value. */
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    /* Split the status word: stack-top (bits 13:11) goes to fpstt. */
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    for (i = 0; i < 8; ++i) {
        /* QEMU fptags uses inverted polarity vs. the abridged tag byte. */
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    /* Propagate an APIC base change to the emulated APIC. */
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* Every slot of whpx_register_names[] must have been consumed. */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    /* Recompute hflags, which are derived from the state loaded above. */
    x86_update_hflags(env);
}
775
whpx_emu_ioport_callback(void * ctx,WHV_EMULATOR_IO_ACCESS_INFO * IoAccess)776 static HRESULT CALLBACK whpx_emu_ioport_callback(
777 void *ctx,
778 WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
779 {
780 MemTxAttrs attrs = { 0 };
781 address_space_rw(&address_space_io, IoAccess->Port, attrs,
782 &IoAccess->Data, IoAccess->AccessSize,
783 IoAccess->Direction);
784 return S_OK;
785 }
786
whpx_emu_mmio_callback(void * ctx,WHV_EMULATOR_MEMORY_ACCESS_INFO * ma)787 static HRESULT CALLBACK whpx_emu_mmio_callback(
788 void *ctx,
789 WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
790 {
791 cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
792 ma->Direction);
793 return S_OK;
794 }
795
whpx_emu_getreg_callback(void * ctx,const WHV_REGISTER_NAME * RegisterNames,UINT32 RegisterCount,WHV_REGISTER_VALUE * RegisterValues)796 static HRESULT CALLBACK whpx_emu_getreg_callback(
797 void *ctx,
798 const WHV_REGISTER_NAME *RegisterNames,
799 UINT32 RegisterCount,
800 WHV_REGISTER_VALUE *RegisterValues)
801 {
802 HRESULT hr;
803 struct whpx_state *whpx = &whpx_global;
804 CPUState *cpu = (CPUState *)ctx;
805
806 hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
807 whpx->partition, cpu->cpu_index,
808 RegisterNames, RegisterCount,
809 RegisterValues);
810 if (FAILED(hr)) {
811 error_report("WHPX: Failed to get virtual processor registers,"
812 " hr=%08lx", hr);
813 }
814
815 return hr;
816 }
817
whpx_emu_setreg_callback(void * ctx,const WHV_REGISTER_NAME * RegisterNames,UINT32 RegisterCount,const WHV_REGISTER_VALUE * RegisterValues)818 static HRESULT CALLBACK whpx_emu_setreg_callback(
819 void *ctx,
820 const WHV_REGISTER_NAME *RegisterNames,
821 UINT32 RegisterCount,
822 const WHV_REGISTER_VALUE *RegisterValues)
823 {
824 HRESULT hr;
825 struct whpx_state *whpx = &whpx_global;
826 CPUState *cpu = (CPUState *)ctx;
827
828 hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
829 whpx->partition, cpu->cpu_index,
830 RegisterNames, RegisterCount,
831 RegisterValues);
832 if (FAILED(hr)) {
833 error_report("WHPX: Failed to set virtual processor registers,"
834 " hr=%08lx", hr);
835 }
836
837 /*
838 * The emulator just successfully wrote the register state. We clear the
839 * dirty state so we avoid the double write on resume of the VP.
840 */
841 cpu->vcpu_dirty = false;
842
843 return hr;
844 }
845
whpx_emu_translate_callback(void * ctx,WHV_GUEST_VIRTUAL_ADDRESS Gva,WHV_TRANSLATE_GVA_FLAGS TranslateFlags,WHV_TRANSLATE_GVA_RESULT_CODE * TranslationResult,WHV_GUEST_PHYSICAL_ADDRESS * Gpa)846 static HRESULT CALLBACK whpx_emu_translate_callback(
847 void *ctx,
848 WHV_GUEST_VIRTUAL_ADDRESS Gva,
849 WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
850 WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
851 WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
852 {
853 HRESULT hr;
854 struct whpx_state *whpx = &whpx_global;
855 CPUState *cpu = (CPUState *)ctx;
856 WHV_TRANSLATE_GVA_RESULT res;
857
858 hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
859 Gva, TranslateFlags, &res, Gpa);
860 if (FAILED(hr)) {
861 error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
862 } else {
863 *TranslationResult = res.ResultCode;
864 }
865
866 return hr;
867 }
868
/*
 * Callback table handed to the WinHvEmulation instruction emulator. It
 * routes the emulator's port I/O, MMIO, register access and GVA
 * translation requests back into QEMU; the 'ctx' argument each callback
 * receives is the CPUState of the vCPU being emulated.
 */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
877
/*
 * Handle an MMIO exit by running the faulting instruction through the
 * WinHvEmulation emulator. Returns 0 on success, -1 if parsing or
 * emulation failed.
 */
static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS status;
    HRESULT hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &status);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }

    if (!status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate MMIO access with"
                     " EmulatorReturnStatus: %u", status.AsUINT32);
        return -1;
    }

    return 0;
}
901
/*
 * Handle a port I/O exit by running the faulting instruction through the
 * WinHvEmulation emulator. Returns 0 on success, -1 if parsing or
 * emulation failed.
 */
static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS status;
    HRESULT hr = whp_dispatch.WHvEmulatorTryIoEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &status);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }

    if (!status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate PortIO access with"
                     " EmulatorReturnStatus: %u", status.AsUINT32);
        return -1;
    }

    return 0;
}
926
927 /*
928 * Controls whether we should intercept various exceptions on the guest,
929 * namely breakpoint/single-step events.
930 *
931 * The 'exceptions' argument accepts a bitmask, e.g:
932 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
933 */
whpx_set_exception_exit_bitmap(UINT64 exceptions)934 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
935 {
936 struct whpx_state *whpx = &whpx_global;
937 WHV_PARTITION_PROPERTY prop = { 0, };
938 HRESULT hr;
939
940 if (exceptions == whpx->exception_exit_bitmap) {
941 return S_OK;
942 }
943
944 prop.ExceptionExitBitmap = exceptions;
945
946 hr = whp_dispatch.WHvSetPartitionProperty(
947 whpx->partition,
948 WHvPartitionPropertyCodeExceptionExitBitmap,
949 &prop,
950 sizeof(WHV_PARTITION_PROPERTY));
951
952 if (SUCCEEDED(hr)) {
953 whpx->exception_exit_bitmap = exceptions;
954 }
955
956 return hr;
957 }
958
959
960 /*
961 * This function is called before/after stepping over a single instruction.
962 * It will update the CPU registers to arm/disarm the instruction stepping
963 * accordingly.
964 */
whpx_vcpu_configure_single_stepping(CPUState * cpu,bool set,uint64_t * exit_context_rflags)965 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
966 bool set,
967 uint64_t *exit_context_rflags)
968 {
969 WHV_REGISTER_NAME reg_name;
970 WHV_REGISTER_VALUE reg_value;
971 HRESULT hr;
972 struct whpx_state *whpx = &whpx_global;
973
974 /*
975 * If we are trying to step over a single instruction, we need to set the
976 * TF bit in rflags. Otherwise, clear it.
977 */
978 reg_name = WHvX64RegisterRflags;
979 hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
980 whpx->partition,
981 cpu->cpu_index,
982 ®_name,
983 1,
984 ®_value);
985
986 if (FAILED(hr)) {
987 error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
988 return hr;
989 }
990
991 if (exit_context_rflags) {
992 assert(*exit_context_rflags == reg_value.Reg64);
993 }
994
995 if (set) {
996 /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
997 reg_value.Reg64 |= TF_MASK;
998 } else {
999 reg_value.Reg64 &= ~TF_MASK;
1000 }
1001
1002 if (exit_context_rflags) {
1003 *exit_context_rflags = reg_value.Reg64;
1004 }
1005
1006 hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1007 whpx->partition,
1008 cpu->cpu_index,
1009 ®_name,
1010 1,
1011 ®_value);
1012
1013 if (FAILED(hr)) {
1014 error_report("WHPX: Failed to set rflags,"
1015 " hr=%08lx",
1016 hr);
1017 return hr;
1018 }
1019
1020 reg_name = WHvRegisterInterruptState;
1021 reg_value.Reg64 = 0;
1022
1023 /* Suspend delivery of hardware interrupts during single-stepping. */
1024 reg_value.InterruptState.InterruptShadow = set != 0;
1025
1026 hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1027 whpx->partition,
1028 cpu->cpu_index,
1029 ®_name,
1030 1,
1031 ®_value);
1032
1033 if (FAILED(hr)) {
1034 error_report("WHPX: Failed to set InterruptState,"
1035 " hr=%08lx",
1036 hr);
1037 return hr;
1038 }
1039
1040 if (!set) {
1041 /*
1042 * We have just finished stepping over a single instruction,
1043 * and intercepted the INT1 generated by it.
1044 * We need to now hide the INT1 from the guest,
1045 * as it would not be expecting it.
1046 */
1047
1048 reg_name = WHvX64RegisterPendingDebugException;
1049 hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1050 whpx->partition,
1051 cpu->cpu_index,
1052 ®_name,
1053 1,
1054 ®_value);
1055
1056 if (FAILED(hr)) {
1057 error_report("WHPX: Failed to get pending debug exceptions,"
1058 "hr=%08lx", hr);
1059 return hr;
1060 }
1061
1062 if (reg_value.PendingDebugException.SingleStep) {
1063 reg_value.PendingDebugException.SingleStep = 0;
1064
1065 hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1066 whpx->partition,
1067 cpu->cpu_index,
1068 ®_name,
1069 1,
1070 ®_value);
1071
1072 if (FAILED(hr)) {
1073 error_report("WHPX: Failed to clear pending debug exceptions,"
1074 "hr=%08lx", hr);
1075 return hr;
1076 }
1077 }
1078
1079 }
1080
1081 return S_OK;
1082 }
1083
1084 /* Tries to find a breakpoint at the specified address. */
/*
 * Tries to find a breakpoint at the specified address.
 * Returns a pointer into the global low-level breakpoint list, or NULL if
 * no breakpoint is tracked at 'address'. Linear scan; the list is small.
 */
static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
{
    struct whpx_state *whpx = &whpx_global;
    int i;

    if (whpx->breakpoints.breakpoints) {
        for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
            if (address == whpx->breakpoints.breakpoints->data[i].address) {
                return &whpx->breakpoints.breakpoints->data[i];
            }
        }
    }

    return NULL;
}
1100
/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 *
 * 0xF1 is the single-byte encoding of the INT1 (ICEBP) instruction.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;
1108
1109 /*
1110 * The WHPX QEMU backend implements breakpoints by writing the INT1
1111 * instruction into memory (ignoring the DRx registers). This raises a few
1112 * issues that need to be carefully handled:
1113 *
1114 * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1115 * at the same location, and later remove them in arbitrary order.
1116 * This should not cause memory corruption, and should only remove the
1117 * physical breakpoint instruction when the last QEMU breakpoint is gone.
1118 *
1119 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1120 * physical location. Hence, physically adding/removing a breakpoint can
1121 * theoretically fail at any time. We need to keep track of it.
1122 *
1123 * The function below rebuilds a list of low-level breakpoints (one per
1124 * address, tracking the original instruction and any errors) from the list of
1125 * high-level breakpoints (set via cpu_breakpoint_insert()).
1126 *
1127 * In order to optimize performance, this function stores the list of
1128 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1129 * low-level ones, so that it won't be re-invoked until these breakpoints
1130 * change.
1131 *
 * Note that this function decides which breakpoints should be inserted into
 * memory, but doesn't actually do it. The memory accessing is done in
1134 * whpx_apply_breakpoints().
1135 */
/*
 * Rebuilds the low-level (WHPX) breakpoint list from the high-level (CPU)
 * breakpoints currently attached to 'cpu'. See the large comment above for
 * the rationale. The new list replaces 'breakpoints->breakpoints'; entries
 * that were physically set but are no longer wanted are kept in state
 * WHPX_BP_CLEAR_PENDING so whpx_apply_breakpoints() can remove them later.
 */
static void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *bp;
    int cpu_bp_index = 0;

    /* Snapshot of CPU breakpoint addresses, used to detect later changes. */
    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);

    breakpoints->original_address_count = cpu_breakpoint_count;

    /* Worst case: every old entry survives and every CPU bp is new. */
    int max_breakpoints = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    struct whpx_breakpoint_collection *new_breakpoints =
        g_malloc0(sizeof(struct whpx_breakpoint_collection)
                  + max_breakpoints * sizeof(struct whpx_breakpoint));

    new_breakpoints->allocated = max_breakpoints;
    new_breakpoints->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        int i;
        for (i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                new_breakpoints->data[new_breakpoints->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        int i;
        bool found = false;

        /* This will be used to detect changed CPU breakpoints later. */
        breakpoints->original_addresses[cpu_bp_index++] = bp->pc;

        for (i = 0; i < new_breakpoints->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */

            if (new_breakpoints->data[i].address == bp->pc) {
                /* There was already a breakpoint at this address. */
                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    /* Still physically set; no memory write needed. */
                    new_breakpoints->data[i].state = WHPX_BP_SET;
                } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
                    /* Re-request setting so a failed write is retried. */
                    new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
                }

                found = true;
                break;
            }
        }

        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
            new_breakpoints->data[new_breakpoints->used].state =
                WHPX_BP_SET_PENDING;
            new_breakpoints->used++;
        }
    }

    /*
     * Free the previous breakpoint list. This can be optimized by keeping
     * it as shadow buffer for the next computation instead of freeing
     * it immediately.
     */
    g_free(breakpoints->breakpoints);

    breakpoints->breakpoints = new_breakpoints;
}
1222
1223 /*
1224 * Physically inserts/removes the breakpoints by reading and writing the
1225 * physical memory, keeping a track of the failed attempts.
1226 *
1227 * Passing resuming=true will try to set all previously unset breakpoints.
1228 * Passing resuming=false will remove all inserted ones.
1229 */
/*
 * Physically inserts/removes the breakpoints by reading and writing guest
 * memory through cpu_memory_rw_debug(), keeping track of failed attempts
 * (a *_PENDING state that could not be committed stays pending).
 *
 * Passing resuming=true will try to set all previously unset breakpoints.
 * Passing resuming=false will remove all inserted ones.
 */
static void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int i, rc;
    if (!breakpoints) {
        return;
    }

    for (i = 0; i < breakpoints->used; i++) {
        /* Decide what to do right now based on the last known state. */
        WhpxBreakpointState state = breakpoints->data[i].state;
        switch (state) {
        case WHPX_BP_CLEARED:
            if (resuming) {
                state = WHPX_BP_SET_PENDING;
            }
            break;
        case WHPX_BP_SET_PENDING:
            if (!resuming) {
                state = WHPX_BP_CLEARED;
            }
            break;
        case WHPX_BP_SET:
            if (!resuming) {
                state = WHPX_BP_CLEAR_PENDING;
            }
            break;
        case WHPX_BP_CLEAR_PENDING:
            if (resuming) {
                state = WHPX_BP_SET;
            }
            break;
        }

        if (state == WHPX_BP_SET_PENDING) {
            /* Remember the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                false);

            if (!rc) {
                /* Write the breakpoint instruction. */
                rc = cpu_memory_rw_debug(cpu,
                    breakpoints->data[i].address,
                    (void *)&whpx_breakpoint_instruction,
                    1,
                    true);
            }

            if (!rc) {
                state = WHPX_BP_SET;
            }

        }

        if (state == WHPX_BP_CLEAR_PENDING) {
            /* Restore the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                true);

            if (!rc) {
                state = WHPX_BP_CLEARED;
            }
        }

        breakpoints->data[i].state = state;
    }
}
1305
1306 /*
 * This function is called when a VCPU is about to start and no other
1308 * VCPUs have been started so far. Since the VCPU start order could be
1309 * arbitrary, it doesn't have to be VCPU#0.
1310 *
1311 * It is used to commit the breakpoints into memory, and configure WHPX
1312 * to intercept debug exceptions.
1313 *
1314 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1315 * more VCPUs are already running, so this is the best place to do it.
1316 */
static int whpx_first_vcpu_starting(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    g_assert(bql_locked());

    if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
            (whpx->breakpoints.breakpoints &&
             whpx->breakpoints.breakpoints->used)) {
        CPUBreakpoint *bp;
        int i = 0;
        bool update_pending = false;

        /* Compare the current CPU breakpoints against the cached snapshot. */
        QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
            if (i >= whpx->breakpoints.original_address_count ||
                bp->pc != whpx->breakpoints.original_addresses[i]) {
                update_pending = true;
            }

            i++;
        }

        if (i != whpx->breakpoints.original_address_count) {
            update_pending = true;
        }

        if (update_pending) {
            /*
             * The CPU breakpoints have changed since the last call to
             * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
             * now be recomputed.
             */
            whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
        }

        /* Actually insert the breakpoints into the memory. */
        whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
    }

    uint64_t exception_mask;
    if (whpx->step_pending ||
        (whpx->breakpoints.breakpoints &&
         whpx->breakpoints.breakpoints->used)) {
        /*
         * We are either attempting to single-step one or more CPUs, or
         * have one or more breakpoints enabled. Both require intercepting
         * the WHvX64ExceptionTypeDebugTrapOrFault exception.
         */

        exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
    } else {
        /* Let the guest handle all exceptions. */
        exception_mask = 0;
    }

    hr = whpx_set_exception_exit_bitmap(exception_mask);
    if (!SUCCEEDED(hr)) {
        error_report("WHPX: Failed to update exception exit mask,"
                     " hr=%08lx.", hr);
        return 1;
    }

    return 0;
}
1382
1383 /*
1384 * This function is called when the last VCPU has finished running.
1385 * It is used to remove any previously set breakpoints from memory.
1386 */
/*
 * Called when the last VCPU has finished running; removes any breakpoint
 * instructions previously written into guest memory. Always succeeds.
 */
static int whpx_last_vcpu_stopping(CPUState *cpu)
{
    whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
    return 0;
}
1392
1393 /* Returns the address of the next instruction that is about to be executed. */
whpx_vcpu_get_pc(CPUState * cpu,bool exit_context_valid)1394 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1395 {
1396 if (cpu->vcpu_dirty) {
1397 /* The CPU registers have been modified by other parts of QEMU. */
1398 return cpu_env(cpu)->eip;
1399 } else if (exit_context_valid) {
1400 /*
1401 * The CPU registers have not been modified by neither other parts
1402 * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1403 * This is the most common case.
1404 */
1405 AccelCPUState *vcpu = cpu->accel;
1406 return vcpu->exit_ctx.VpContext.Rip;
1407 } else {
1408 /*
1409 * The CPU registers have been modified by a call to
1410 * WHvSetVirtualProcessorRegisters() and must be re-queried from
1411 * the target.
1412 */
1413 WHV_REGISTER_VALUE reg_value;
1414 WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1415 HRESULT hr;
1416 struct whpx_state *whpx = &whpx_global;
1417
1418 hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1419 whpx->partition,
1420 cpu->cpu_index,
1421 ®_name,
1422 1,
1423 ®_value);
1424
1425 if (FAILED(hr)) {
1426 error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1427 return 0;
1428 }
1429
1430 return reg_value.Reg64;
1431 }
1432 }
1433
/*
 * Handles a HLT exit: if no unmasked hard interrupt and no NMI is pending,
 * parks the VCPU (EXCP_HLT) and returns 1 to leave the run loop; otherwise
 * returns 0 so execution resumes immediately.
 */
static int whpx_handle_halt(CPUState *cpu)
{
    int ret = 0;

    bql_lock();
    if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
          (cpu_env(cpu)->eflags & IF_MASK)) &&
        !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }
    bql_unlock();

    return ret;
}
1450
/*
 * Prepares the VCPU for the next WHvRunVirtualProcessor() call: injects any
 * pending NMI/PIC interrupt, syncs the TPR to CR8, and registers for an
 * interrupt-window notification when an interrupt cannot be delivered yet.
 * All accumulated register changes are committed in a single
 * WHvSetVirtualProcessorRegisters() call at the end.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    bql_lock();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            /* SMI is acknowledged but not injected (not supported). */
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        /* In-kernel APIC: deliver the PIC interrupt as a pending event. */
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
    }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }
}
1572
/*
 * Pulls the fields QEMU needs out of the exit context after
 * WHvRunVirtualProcessor() returns: rflags, the TPR (propagated to the
 * APIC if it changed), and the interruption/interrupt-shadow state used
 * by whpx_vcpu_pre_run() on the next iteration.
 */
static void whpx_vcpu_post_run(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
    if (vcpu->tpr != tpr) {
        vcpu->tpr = tpr;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;

    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}
1595
/*
 * Handles asynchronous VCPU events before entering the run loop:
 * INIT, APIC poll, wakeup from halt, SIPI, and pending TPR access reports.
 * Synchronizes the register state from the hypervisor where the handler
 * needs it (INIT, SIPI, TPR access).
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;

    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    /* Wake the CPU up if an unmasked hard interrupt or an NMI is pending. */
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        cpu_reset_interrupt(cpu, CPU_INTERRUPT_SIPI);
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }
}
1633
/*
 * Main VCPU run loop: dispatches WHvRunVirtualProcessor() exits until one
 * of them requests leaving the loop (ret != 0) or fails (ret < 0).
 *
 * Returns 0 on a normal exit and 1 on error (callers treat the return as
 * a boolean failure indicator).
 *
 * Must be entered with the BQL held; the lock is dropped while the guest
 * runs and re-acquired before returning.
 */
static int whpx_vcpu_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    struct whpx_breakpoint *stepped_over_bp = NULL;
    WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
    /*
     * Initialized to 0 so that exit reasons which don't assign it (e.g.
     * WHvRunVpExitReasonX64ApicEoi) never read an indeterminate value.
     */
    int ret = 0;

    g_assert(bql_locked());

    if (whpx->running_cpus++ == 0) {
        /* Insert breakpoints into memory, update exception exit bitmap. */
        ret = whpx_first_vcpu_starting(cpu);
        if (ret != 0) {
            return ret;
        }
    }

    if (whpx->breakpoints.breakpoints &&
        whpx->breakpoints.breakpoints->used > 0) {
        uint64_t pc = whpx_vcpu_get_pc(cpu, true);
        stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
        if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
            stepped_over_bp = NULL;
        }

        if (stepped_over_bp) {
            /*
             * We are trying to run the instruction overwritten by an active
             * breakpoint. We will temporarily disable the breakpoint, suspend
             * other CPUs, and step over the instruction.
             */
            exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
        }
    }

    if (exclusive_step_mode == WHPX_STEP_NONE) {
        whpx_vcpu_process_async_events(cpu);
        if (cpu->halted && !whpx_apic_in_platform()) {
            cpu->exception_index = EXCP_HLT;
            qatomic_set(&cpu->exit_request, false);
            return 0;
        }
    }

    bql_unlock();

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        start_exclusive();
        g_assert(cpu == current_cpu);
        g_assert(!cpu->running);
        cpu->running = true;

        hr = whpx_set_exception_exit_bitmap(
            1UL << WHvX64ExceptionTypeDebugTrapOrFault);
        if (!SUCCEEDED(hr)) {
            /*
             * NOTE(review): this early return leaves the exclusive section
             * entered and the BQL released — verify against callers whether
             * cleanup is expected here; behavior kept as-is.
             */
            error_report("WHPX: Failed to update exception exit mask, "
                         "hr=%08lx.", hr);
            return 1;
        }

        if (stepped_over_bp) {
            /* Temporarily disable the triggered breakpoint. */
            cpu_memory_rw_debug(cpu,
                stepped_over_bp->address,
                &stepped_over_bp->original_instruction,
                1,
                true);
        }
    } else {
        cpu_exec_start(cpu);
    }

    do {
        if (cpu->vcpu_dirty) {
            whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        if (exclusive_step_mode == WHPX_STEP_NONE) {
            whpx_vcpu_pre_run(cpu);

            if (qatomic_read(&cpu->exit_request)) {
                whpx_vcpu_kick(cpu);
            }
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu, true, NULL);
        }

        hr = whp_dispatch.WHvRunVirtualProcessor(
            whpx->partition, cpu->cpu_index,
            &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to exec a virtual processor,"
                         " hr=%08lx", hr);
            ret = -1;
            break;
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu,
                false,
                &vcpu->exit_ctx.VpContext.Rflags);
        }

        whpx_vcpu_post_run(cpu);

        switch (vcpu->exit_ctx.ExitReason) {
        case WHvRunVpExitReasonMemoryAccess:
            ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
            break;

        case WHvRunVpExitReasonX64IoPortAccess:
            ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
            break;

        case WHvRunVpExitReasonX64InterruptWindow:
            vcpu->ready_for_pic_interrupt = 1;
            vcpu->window_registered = 0;
            ret = 0;
            break;

        case WHvRunVpExitReasonX64ApicEoi:
            assert(whpx_apic_in_platform());
            ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
            break;

        case WHvRunVpExitReasonX64Halt:
            /*
             * WARNING: as of build 19043.1526 (21H1), this exit reason is no
             * longer used.
             */
            ret = whpx_handle_halt(cpu);
            break;

        case WHvRunVpExitReasonX64ApicInitSipiTrap: {
            WHV_INTERRUPT_CONTROL ipi = {0};
            uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
            uint32_t delivery_mode =
                (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
            int dest_shorthand =
                (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
            bool broadcast = false;
            bool include_self = false;
            uint32_t i;

            /* We only registered for INIT and SIPI exits. */
            if ((delivery_mode != APIC_DM_INIT) &&
                (delivery_mode != APIC_DM_SIPI)) {
                error_report(
                    "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
                break;
            }

            if (delivery_mode == APIC_DM_INIT) {
                ipi.Type = WHvX64InterruptTypeInit;
            } else {
                ipi.Type = WHvX64InterruptTypeSipi;
            }

            ipi.DestinationMode =
                ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
                    WHvX64InterruptDestinationModeLogical :
                    WHvX64InterruptDestinationModePhysical;

            ipi.TriggerMode =
                ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
                    WHvX64InterruptTriggerModeLevel :
                    WHvX64InterruptTriggerModeEdge;

            ipi.Vector = icr & APIC_VECTOR_MASK;
            switch (dest_shorthand) {
            /* no shorthand. Bits 56-63 contain the destination. */
            case 0:
                ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report("WHPX: Failed to request interrupt hr=%08lx",
                        hr);
                }

                break;

            /* self */
            case 1:
                include_self = true;
                break;

            /* broadcast, including self */
            case 2:
                broadcast = true;
                include_self = true;
                break;

            /* broadcast, excluding self */
            case 3:
                broadcast = true;
                break;
            }

            if (!broadcast && !include_self) {
                break;
            }

            for (i = 0; i <= max_vcpu_index; i++) {
                if (i == cpu->cpu_index && !include_self) {
                    continue;
                }

                /*
                 * Assuming that APIC Ids are identity mapped since
                 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
                 * are not handled yet and the hypervisor doesn't allow the
                 * guest to modify the APIC ID.
                 */
                ipi.Destination = i;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report(
                        "WHPX: Failed to request SIPI for %d, hr=%08lx",
                        i, hr);
                }
            }

            break;
        }

        case WHvRunVpExitReasonCanceled:
            if (exclusive_step_mode != WHPX_STEP_NONE) {
                /*
                 * We are trying to step over a single instruction, and
                 * likely got a request to stop from another thread.
                 * Delay it until we are done stepping
                 * over.
                 */
                ret = 0;
            } else {
                cpu->exception_index = EXCP_INTERRUPT;
                ret = 1;
            }
            break;
        case WHvRunVpExitReasonX64MsrAccess: {
            WHV_REGISTER_VALUE reg_values[3] = {0};
            WHV_REGISTER_NAME reg_names[3];
            UINT32 reg_count;

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRdx;

            /* Skip past the MSR instruction. */
            reg_values[0].Reg64 =
                vcpu->exit_ctx.VpContext.Rip +
                vcpu->exit_ctx.VpContext.InstructionLength;

            /*
             * For all unsupported MSR access we:
             *     ignore writes
             *     return 0 on read.
             */
            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
                        1 : 3;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                reg_names, reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set MsrAccess state "
                             " registers, hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonX64Cpuid: {
            WHV_REGISTER_VALUE reg_values[5];
            WHV_REGISTER_NAME reg_names[5];
            UINT32 reg_count = 5;
            UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
            X86CPU *x86_cpu = X86_CPU(cpu);
            CPUX86State *env = &x86_cpu->env;

            memset(reg_values, 0, sizeof(reg_values));

            rip = vcpu->exit_ctx.VpContext.Rip +
                  vcpu->exit_ctx.VpContext.InstructionLength;
            cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;

            /*
             * Ideally, these should be supplied to the hypervisor during VCPU
             * initialization and it should be able to satisfy this request.
             * But, currently, WHPX doesn't support setting CPUID values in the
             * hypervisor once the partition has been setup, which is too late
             * since VCPUs are realized later. For now, use the values from
             * QEMU to satisfy these requests, until WHPX adds support for
             * being able to set these values in the hypervisor at runtime.
             */
            cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
                (UINT32 *)&rcx, (UINT32 *)&rdx);
            switch (cpuid_fn) {
            case 0x40000000:
                /* Expose the vmware cpu frequency cpuid leaf */
                rax = 0x40000010;
                rbx = rcx = rdx = 0;
                break;

            case 0x40000010:
                rax = env->tsc_khz;
                rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
                rcx = rdx = 0;
                break;

            case 0x80000001:
                /* Remove any support of OSVW */
                rcx &= ~CPUID_EXT3_OSVW;
                break;
            }

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRcx;
            reg_names[3] = WHvX64RegisterRdx;
            reg_names[4] = WHvX64RegisterRbx;

            reg_values[0].Reg64 = rip;
            reg_values[1].Reg64 = rax;
            reg_values[2].Reg64 = rcx;
            reg_values[3].Reg64 = rdx;
            reg_values[4].Reg64 = rbx;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition, cpu->cpu_index,
                reg_names,
                reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set CpuidAccess state registers,"
                             " hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonException:
            whpx_get_registers(cpu);

            if ((vcpu->exit_ctx.VpException.ExceptionType ==
                 WHvX64ExceptionTypeDebugTrapOrFault) &&
                (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
                (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
                 whpx_breakpoint_instruction)) {
                /* Stopped at a software breakpoint. */
                cpu->exception_index = EXCP_DEBUG;
            } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
                        WHvX64ExceptionTypeDebugTrapOrFault) &&
                       !cpu->singlestep_enabled) {
                /*
                 * Just finished stepping over a breakpoint, but the
                 * gdb does not expect us to do single-stepping.
                 * Don't do anything special.
                 */
                cpu->exception_index = EXCP_INTERRUPT;
            } else {
                /* Another exception or debug event. Report it to GDB. */
                cpu->exception_index = EXCP_DEBUG;
            }

            ret = 1;
            break;
        case WHvRunVpExitReasonNone:
        case WHvRunVpExitReasonUnrecoverableException:
        case WHvRunVpExitReasonInvalidVpRegisterValue:
        case WHvRunVpExitReasonUnsupportedFeature:
        default:
            error_report("WHPX: Unexpected VP exit code %d",
                         vcpu->exit_ctx.ExitReason);
            whpx_get_registers(cpu);
            bql_lock();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            bql_unlock();
            break;
        }

    } while (!ret);

    if (stepped_over_bp) {
        /* Restore the breakpoint we stepped over */
        cpu_memory_rw_debug(cpu,
            stepped_over_bp->address,
            (void *)&whpx_breakpoint_instruction,
            1,
            true);
    }

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        g_assert(cpu_in_exclusive_context(cpu));
        cpu->running = false;
        end_exclusive();

        exclusive_step_mode = WHPX_STEP_NONE;
    } else {
        cpu_exec_end(cpu);
    }

    bql_lock();
    current_cpu = cpu;

    if (--whpx->running_cpus == 0) {
        whpx_last_vcpu_stopping(cpu);
    }

    qatomic_set(&cpu->exit_request, false);

    return ret < 0;
}
2057
/* Fetch vCPU state from WHPX into QEMU, unless QEMU's copy is already live. */
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (cpu->vcpu_dirty) {
        return;
    }

    whpx_get_registers(cpu);
    cpu->vcpu_dirty = true;
}
2065
/* Push QEMU's post-reset register state back into the hypervisor. */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->vcpu_dirty = false;
}
2072
/* Push the complete initial register state into the hypervisor. */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->vcpu_dirty = false;
}
2079
/*
 * Mark QEMU's copy of the state as authoritative before loadvm overwrites
 * it, so stale hypervisor state is never read back.
 */
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
2085
2086 /*
2087 * CPU support.
2088 */
2089
/* Synchronize vCPU state from WHPX on the vCPU's own thread, if needed. */
void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (cpu->vcpu_dirty) {
        return;
    }

    run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
2096
/* Schedule the post-reset state push on the vCPU's own thread. */
void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
2101
/* Schedule the full post-init state push on the vCPU's own thread. */
void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
2106
/* Schedule the pre-loadvm dirty marking on the vCPU's own thread. */
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
2111
/* Record whether the gdbstub expects single-stepping on the next resume. */
static void whpx_pre_resume_vm(AccelState *as, bool step_pending)
{
    whpx_global.step_pending = step_pending;
}
2116
2117 /*
2118 * Vcpu support.
2119 */
2120
2121 static Error *whpx_migration_blocker;
2122
/* VM state-change hook: invalidate the cached TSC when the guest resumes. */
static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (!running) {
        return;
    }

    env->tsc_valid = false;
}
2131
/*
 * Create the WHPX backing for one vCPU: the instruction emulator and the
 * hypervisor virtual processor, plus the TSC/APIC frequency setup.
 *
 * Returns 0 on success, a negative errno value on failure.  On failure all
 * partially-created resources are torn down again (the original code leaked
 * both the emulator and the virtual processor when setting the CpuidExitList
 * partition property failed).
 */
int whpx_init_vcpu(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = NULL;
    Error *local_error = NULL;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    UINT64 freq = 0;
    int ret;

    /* Add migration blockers for all unsupported features of the
     * Windows Hypervisor Platform
     */
    if (whpx_migration_blocker == NULL) {
        error_setg(&whpx_migration_blocker,
               "State blocked due to non-migratable CPUID feature support,"
               "dirty memory tracking support, and XSAVE/XRSTOR support");

        if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu = g_new0(AccelCPUState, 1);

    hr = whp_dispatch.WHvEmulatorCreateEmulator(
        &whpx_emu_callbacks,
        &vcpu->emulator);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup instruction completion support,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreateVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create a virtual processor,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error_emulator;
    }

    /*
     * vcpu's TSC frequency is either specified by user, or use the value
     * provided by Hyper-V if the former is not present. In the latter case, we
     * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
     * frequency can be migrated later via this field.
     */
    if (!env->tsc_khz) {
        hr = whp_dispatch.WHvGetCapability(
            WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
                NULL);
        /* Older WHP versions don't know this capability; that's not fatal. */
        if (hr != WHV_E_UNKNOWN_CAPABILITY) {
            if (FAILED(hr)) {
                printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
            } else {
                env->tsc_khz = freq / 1000; /* Hz to KHz */
            }
        }
    }

    env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
    if (hr != WHV_E_UNKNOWN_CAPABILITY) {
        if (FAILED(hr)) {
            printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
        } else {
            env->apic_bus_freq = freq;
        }
    }

    /*
     * If the vmware cpuid frequency leaf option is set, and we have a valid
     * tsc value, trap the corresponding cpuid's.
     */
    if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
        UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};

        hr = whp_dispatch.WHvSetPartitionProperty(
                whpx->partition,
                WHvPartitionPropertyCodeCpuidExitList,
                cpuidExitList,
                RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                         hr);
            ret = -EINVAL;
            goto error_vcpu;
        }
    }

    vcpu->interruptable = true;
    cpu->vcpu_dirty = true;
    cpu->accel = vcpu;
    max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
    qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);

    return 0;

error_vcpu:
    /* Undo WHvCreateVirtualProcessor() before tearing down the emulator. */
    whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
error_emulator:
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
error:
    g_free(vcpu);

    return ret;
}
2244
/*
 * Run the vCPU until a pending exception/interrupt event is reported.
 * Returns the consumed cpu->exception_index value; aborts on a fatal
 * whpx_vcpu_run() failure.
 */
int whpx_vcpu_exec(CPUState *cpu)
{
    for (;;) {
        if (cpu->exception_index >= EXCP_INTERRUPT) {
            int event = cpu->exception_index;

            cpu->exception_index = -1;
            return event;
        }

        if (whpx_vcpu_run(cpu)) {
            error_report("WHPX: Failed to exec a virtual processor");
            abort();
        }
    }
}
2267
/* Tear down the hypervisor vCPU and its instruction emulator. */
void whpx_destroy_vcpu(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;

    whp_dispatch.WHvDeleteVirtualProcessor(whpx_global.partition,
                                           cpu->cpu_index);
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
    g_free(vcpu);
}
2277
/* Force the vCPU out of WHvRunVirtualProcessor() so it re-examines state. */
void whpx_vcpu_kick(CPUState *cpu)
{
    whp_dispatch.WHvCancelRunVirtualProcessor(whpx_global.partition,
                                              cpu->cpu_index, 0);
}
2284
2285 /*
2286 * Memory support.
2287 */
2288
/*
 * Map (add != 0) or unmap a guest physical range in the WHPX partition.
 * ROM mappings are created without write permission; failures are reported
 * but not fatal.
 */
static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    if (!add) {
        hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition, start_pa, size);
    } else {
        int flags = WHvMapGpaRangeFlagRead | WHvMapGpaRangeFlagExecute;

        if (!rom) {
            flags |= WHvMapGpaRangeFlagWrite;
        }
        hr = whp_dispatch.WHvMapGpaRange(whpx->partition, host_va,
                                         start_pa, size, flags);
    }

    if (FAILED(hr)) {
        error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                     " Host:%p, hr=%08lx",
                     (add ? "MAP" : "UNMAP"), name,
                     (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
    }
}
2328
/*
 * Translate a memory listener section into a WHPX GPA mapping operation,
 * trimming the range so that both its start and size are host-page aligned.
 * Non-RAM regions (MMIO) are skipped; they are handled via VM exits.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int delta;
    uint64_t host_va;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    /*
     * delta = number of bytes from start_pa up to the next host-page
     * boundary (0 when start_pa is already aligned, thanks to the mask
     * applied on the second line).
     */
    delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
    delta &= ~qemu_real_host_page_mask();
    if (delta > size) {
        /* The section is smaller than the alignment slack: nothing to map. */
        return;
    }
    start_pa += delta;
    size -= delta;
    /* Truncate the size down to a whole number of host pages. */
    size &= qemu_real_host_page_mask();
    if (!size || (start_pa & ~qemu_real_host_page_mask())) {
        return;
    }

    /* Host virtual address backing the aligned guest-physical range. */
    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + delta;

    whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}
2359
/* Memory listener hook: a region appeared; pin it and map it into WHPX. */
static void whpx_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}
2366
/* Memory listener hook: a region vanished; unmap it and drop our reference. */
static void whpx_region_del(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}
2373
/* Memory listener hook: nothing to do at transaction start for WHPX. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}
2377
/* Memory listener hook: nothing to do at transaction commit for WHPX. */
static void whpx_transaction_commit(MemoryListener *listener)
{
}
2381
/*
 * Memory listener hook for dirty-log sync.  WHPX has no dirty-page
 * tracking here, so conservatively mark the entire RAM section dirty.
 */
static void whpx_log_sync(MemoryListener *listener,
                          MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (memory_region_is_ram(mr)) {
        memory_region_set_dirty(mr, 0, int128_get64(section->size));
    }
}
2393
/* Listener that mirrors QEMU's flat memory view into the WHPX partition. */
static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
};
2403
whpx_memory_init(void)2404 static void whpx_memory_init(void)
2405 {
2406 memory_listener_register(&whpx_memory_listener, &address_space_memory);
2407 }
2408
2409 /*
2410 * Load the functions from the given library, using the given handle. If a
2411 * handle is provided, it is used, otherwise the library is opened. The
2412 * handle will be updated on return with the opened one.
2413 */
load_whp_dispatch_fns(HMODULE * handle,WHPFunctionList function_list)2414 static bool load_whp_dispatch_fns(HMODULE *handle,
2415 WHPFunctionList function_list)
2416 {
2417 HMODULE hLib = *handle;
2418
2419 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
2420 #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
2421 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
2422 whp_dispatch.function_name = \
2423 (function_name ## _t)GetProcAddress(hLib, #function_name); \
2424
2425 #define WHP_LOAD_FIELD(return_type, function_name, signature) \
2426 whp_dispatch.function_name = \
2427 (function_name ## _t)GetProcAddress(hLib, #function_name); \
2428 if (!whp_dispatch.function_name) { \
2429 error_report("Could not load function %s", #function_name); \
2430 goto error; \
2431 } \
2432
2433 #define WHP_LOAD_LIB(lib_name, handle_lib) \
2434 if (!handle_lib) { \
2435 handle_lib = LoadLibrary(lib_name); \
2436 if (!handle_lib) { \
2437 error_report("Could not load library %s.", lib_name); \
2438 goto error; \
2439 } \
2440 } \
2441
2442 switch (function_list) {
2443 case WINHV_PLATFORM_FNS_DEFAULT:
2444 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2445 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
2446 break;
2447
2448 case WINHV_EMULATION_FNS_DEFAULT:
2449 WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
2450 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
2451 break;
2452
2453 case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
2454 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2455 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
2456 break;
2457 }
2458
2459 *handle = hLib;
2460 return true;
2461
2462 error:
2463 if (hLib) {
2464 FreeLibrary(hLib);
2465 }
2466
2467 return false;
2468 }
2469
whpx_set_kernel_irqchip(Object * obj,Visitor * v,const char * name,void * opaque,Error ** errp)2470 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2471 const char *name, void *opaque,
2472 Error **errp)
2473 {
2474 struct whpx_state *whpx = &whpx_global;
2475 OnOffSplit mode;
2476
2477 if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2478 return;
2479 }
2480
2481 switch (mode) {
2482 case ON_OFF_SPLIT_ON:
2483 whpx->kernel_irqchip_allowed = true;
2484 whpx->kernel_irqchip_required = true;
2485 break;
2486
2487 case ON_OFF_SPLIT_OFF:
2488 whpx->kernel_irqchip_allowed = false;
2489 whpx->kernel_irqchip_required = false;
2490 break;
2491
2492 case ON_OFF_SPLIT_SPLIT:
2493 error_setg(errp, "WHPX: split irqchip currently not supported");
2494 error_append_hint(errp,
2495 "Try without kernel-irqchip or with kernel-irqchip=on|off");
2496 break;
2497
2498 default:
2499 /*
2500 * The value was checked in visit_type_OnOffSplit() above. If
2501 * we get here, then something is wrong in QEMU.
2502 */
2503 abort();
2504 }
2505 }
2506
/* Per-vCPU instance init: apply the host CPU model under WHPX. */
static void whpx_cpu_instance_init(CPUState *cs)
{
    host_cpu_instance_init(X86_CPU(cs));
}
2513
/* Class init for the WHPX accel-CPU type: hook up per-vCPU instance init. */
static void whpx_cpu_accel_class_init(ObjectClass *oc, const void *data)
{
    AccelCPUClass *acc = ACCEL_CPU_CLASS(oc);

    acc->cpu_instance_init = whpx_cpu_instance_init;
}
2520
/* QOM type glue between WHPX and the x86 CPU model. */
static const TypeInfo whpx_cpu_accel_type = {
    .name = ACCEL_CPU_NAME("whpx"),

    .parent = TYPE_ACCEL_CPU,
    .class_init = whpx_cpu_accel_class_init,
    .abstract = true,
};
2528
2529 /*
2530 * Partition support
2531 */
2532
/*
 * Create and configure the WHPX partition for the whole machine:
 * check the hypervisor is present, create the partition, query XSAVE
 * support, set the vCPU count, optionally enable in-hypervisor APIC
 * emulation, register the MSR/CPUID/exception exit traps, and finally
 * set up the partition and start mirroring guest memory.
 *
 * Returns 0 on success or a negative errno value; on failure any
 * partially-created partition is deleted.
 */
static int whpx_accel_init(AccelState *as, MachineState *ms)
{
    struct whpx_state *whpx;
    int ret;
    HRESULT hr;
    WHV_CAPABILITY whpx_cap;
    UINT32 whpx_cap_size;
    WHV_PARTITION_PROPERTY prop;
    UINT32 cpuidExitList[] = {1, 0x80000001};
    WHV_CAPABILITY_FEATURES features = {0};

    whpx = &whpx_global;

    /* Resolve the WinHvPlatform/WinHvEmulation entry points first. */
    if (!init_whp_dispatch()) {
        ret = -ENOSYS;
        goto error;
    }

    whpx->mem_quota = ms->ram_size;

    /* Bail out early if the Windows Hypervisor is not running. */
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeHypervisorPresent, &whpx_cap,
        sizeof(whpx_cap), &whpx_cap_size);
    if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
        error_report("WHPX: No accelerator found, hr=%08lx", hr);
        ret = -ENOSPC;
        goto error;
    }

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Query the XSAVE capability of the partition. Any error here is not
     * considered fatal.
     */
    hr = whp_dispatch.WHvGetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorXsaveFeatures,
        &whpx_xsave_cap,
        sizeof(whpx_xsave_cap),
        &whpx_cap_size);

    /*
     * Windows version which don't support this property will return with the
     * specific error code.
     */
    if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
        error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
    }

    if (!whpx_has_xsave()) {
        printf("WHPX: Partition is not XSAVE capable\n");
    }

    /* The processor count must be set before the partition is set up. */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ProcessorCount = ms->smp.cpus;
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorCount,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition processor count to %u,"
                     " hr=%08lx", prop.ProcessorCount, hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Error out if WHP doesn't support apic emulation and user is requiring
     * it.
     */
    if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
            !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
        error_report("WHPX: kernel irqchip requested, but unavailable. "
            "Try without kernel-irqchip or with kernel-irqchip=off");
        ret = -EINVAL;
        goto error;
    }

    /* Enable in-hypervisor APIC emulation when allowed and supported. */
    if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
        whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
        WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
            WHvX64LocalApicEmulationModeXApic;
        printf("WHPX: setting APIC emulation mode in the hypervisor\n");
        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeLocalApicEmulationMode,
            &mode,
            sizeof(mode));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
            if (whpx->kernel_irqchip_required) {
                error_report("WHPX: kernel irqchip requested, but unavailable");
                ret = -EINVAL;
                goto error;
            }
        } else {
            whpx->apic_in_platform = true;
        }
    }

    /* Register for MSR and CPUID exits */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ExtendedVmExits.X64MsrExit = 1;
    prop.ExtendedVmExits.X64CpuidExit = 1;
    prop.ExtendedVmExits.ExceptionExit = 1;
    if (whpx_apic_in_platform()) {
        prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExtendedVmExits,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /* Trap the CPUID leaves QEMU wants to filter (basic + ext. features). */
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeCpuidExitList,
        cpuidExitList,
        RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                     hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * We do not want to intercept any exceptions from the guest,
     * until we actually start debugging with gdb.
     */
    whpx->exception_exit_bitmap = -1;
    hr = whpx_set_exception_exit_bitmap(0);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetupPartition(whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    whpx_memory_init();

    printf("Windows Hypervisor Platform accelerator is operational\n");
    return 0;

error:

    if (NULL != whpx->partition) {
        whp_dispatch.WHvDeletePartition(whpx->partition);
        whpx->partition = NULL;
    }

    return ret;
}
2716
whpx_apic_in_platform(void)2717 bool whpx_apic_in_platform(void) {
2718 return whpx_global.apic_in_platform;
2719 }
2720
/* Class init for the WHPX accelerator: wire callbacks and QOM properties. */
static void whpx_accel_class_init(ObjectClass *oc, const void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);

    ac->name = "WHPX";
    ac->init_machine = whpx_accel_init;
    ac->pre_resume_vm = whpx_pre_resume_vm;
    ac->allowed = &whpx_allowed;

    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
        NULL, whpx_set_kernel_irqchip,
        NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure WHPX in-kernel irqchip");
}
2735
/* Instance init: reset the global WHPX state and pick sane defaults. */
static void whpx_accel_instance_init(Object *obj)
{
    struct whpx_state *whpx = &whpx_global;

    memset(whpx, 0, sizeof(*whpx));
    /* Turn on kernel-irqchip, by default */
    whpx->kernel_irqchip_allowed = true;
}
2744
/* QOM registration for the "whpx" accelerator type. */
static const TypeInfo whpx_accel_type = {
    .name = ACCEL_CLASS_NAME("whpx"),
    .parent = TYPE_ACCEL,
    .instance_init = whpx_accel_instance_init,
    .class_init = whpx_accel_class_init,
};
2751
whpx_type_init(void)2752 static void whpx_type_init(void)
2753 {
2754 type_register_static(&whpx_accel_type);
2755 type_register_static(&whpx_cpu_accel_type);
2756 }
2757
init_whp_dispatch(void)2758 bool init_whp_dispatch(void)
2759 {
2760 if (whp_dispatch_initialized) {
2761 return true;
2762 }
2763
2764 if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2765 goto error;
2766 }
2767
2768 if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2769 goto error;
2770 }
2771
2772 assert(load_whp_dispatch_fns(&hWinHvPlatform,
2773 WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2774 whp_dispatch_initialized = true;
2775
2776 return true;
2777 error:
2778 if (hWinHvPlatform) {
2779 FreeLibrary(hWinHvPlatform);
2780 }
2781
2782 if (hWinHvEmulation) {
2783 FreeLibrary(hWinHvEmulation);
2784 }
2785
2786 return false;
2787 }
2788
/* Hook whpx_type_init() into QEMU's module/type initialization sequence. */
type_init(whpx_type_init);