1 /* 2 * QEMU Windows Hypervisor Platform accelerator (WHPX) 3 * 4 * Copyright Microsoft Corp. 2017 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2 or later. 7 * See the COPYING file in the top-level directory. 8 * 9 */ 10 11 #include "qemu/osdep.h" 12 #include "cpu.h" 13 #include "system/address-spaces.h" 14 #include "system/ioport.h" 15 #include "gdbstub/helpers.h" 16 #include "qemu/accel.h" 17 #include "accel/accel-ops.h" 18 #include "system/whpx.h" 19 #include "system/cpus.h" 20 #include "system/runstate.h" 21 #include "qemu/main-loop.h" 22 #include "hw/boards.h" 23 #include "hw/intc/ioapic.h" 24 #include "hw/i386/apic_internal.h" 25 #include "qemu/error-report.h" 26 #include "qapi/error.h" 27 #include "qapi/qapi-types-common.h" 28 #include "qapi/qapi-visit-common.h" 29 #include "migration/blocker.h" 30 #include "host-cpu.h" 31 #include "accel/accel-cpu-target.h" 32 #include <winerror.h> 33 34 #include "whpx-internal.h" 35 #include "whpx-accel-ops.h" 36 37 #include <winhvplatform.h> 38 #include <winhvemulation.h> 39 40 #define HYPERV_APIC_BUS_FREQUENCY (200000000ULL) 41 42 static const WHV_REGISTER_NAME whpx_register_names[] = { 43 44 /* X64 General purpose registers */ 45 WHvX64RegisterRax, 46 WHvX64RegisterRcx, 47 WHvX64RegisterRdx, 48 WHvX64RegisterRbx, 49 WHvX64RegisterRsp, 50 WHvX64RegisterRbp, 51 WHvX64RegisterRsi, 52 WHvX64RegisterRdi, 53 WHvX64RegisterR8, 54 WHvX64RegisterR9, 55 WHvX64RegisterR10, 56 WHvX64RegisterR11, 57 WHvX64RegisterR12, 58 WHvX64RegisterR13, 59 WHvX64RegisterR14, 60 WHvX64RegisterR15, 61 WHvX64RegisterRip, 62 WHvX64RegisterRflags, 63 64 /* X64 Segment registers */ 65 WHvX64RegisterEs, 66 WHvX64RegisterCs, 67 WHvX64RegisterSs, 68 WHvX64RegisterDs, 69 WHvX64RegisterFs, 70 WHvX64RegisterGs, 71 WHvX64RegisterLdtr, 72 WHvX64RegisterTr, 73 74 /* X64 Table registers */ 75 WHvX64RegisterIdtr, 76 WHvX64RegisterGdtr, 77 78 /* X64 Control Registers */ 79 WHvX64RegisterCr0, 80 WHvX64RegisterCr2, 81 WHvX64RegisterCr3, 82 WHvX64RegisterCr4, 83 WHvX64RegisterCr8, 84 85 /* X64 Debug Registers */ 86 /* 87 * WHvX64RegisterDr0, 88 * WHvX64RegisterDr1, 89 * WHvX64RegisterDr2, 90 * WHvX64RegisterDr3, 91 * WHvX64RegisterDr6, 92 * WHvX64RegisterDr7, 93 */ 94 95 /* X64 Floating Point and Vector Registers */ 96 WHvX64RegisterXmm0, 97 WHvX64RegisterXmm1, 98 WHvX64RegisterXmm2, 99 WHvX64RegisterXmm3, 100 WHvX64RegisterXmm4, 101 WHvX64RegisterXmm5, 102 WHvX64RegisterXmm6, 103 WHvX64RegisterXmm7, 104 WHvX64RegisterXmm8, 105 WHvX64RegisterXmm9, 106 WHvX64RegisterXmm10, 107 WHvX64RegisterXmm11, 108 WHvX64RegisterXmm12, 109 WHvX64RegisterXmm13, 110 WHvX64RegisterXmm14, 111 WHvX64RegisterXmm15, 112 WHvX64RegisterFpMmx0, 113 WHvX64RegisterFpMmx1, 114 WHvX64RegisterFpMmx2, 115 WHvX64RegisterFpMmx3, 116 WHvX64RegisterFpMmx4, 117 WHvX64RegisterFpMmx5, 118 WHvX64RegisterFpMmx6, 119 WHvX64RegisterFpMmx7, 120 WHvX64RegisterFpControlStatus, 121 WHvX64RegisterXmmControlStatus, 122 123 /* X64 MSRs */ 124 WHvX64RegisterEfer, 125 #ifdef TARGET_X86_64 126 WHvX64RegisterKernelGsBase, 127 #endif 128 WHvX64RegisterApicBase, 129 /* WHvX64RegisterPat, */ 130 WHvX64RegisterSysenterCs, 131 WHvX64RegisterSysenterEip, 132 WHvX64RegisterSysenterEsp, 133 WHvX64RegisterStar, 134 #ifdef TARGET_X86_64 135 WHvX64RegisterLstar, 136 WHvX64RegisterCstar, 137 WHvX64RegisterSfmask, 138 #endif 139 140 /* Interrupt / Event Registers */ 141 /* 142 * WHvRegisterPendingInterruption, 143 * WHvRegisterInterruptState, 144 * WHvRegisterPendingEvent0, 145 * WHvRegisterPendingEvent1 146 * WHvX64RegisterDeliverabilityNotifications, 147 */ 148 }; 149 150 struct whpx_register_set { 151 WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)]; 152 }; 153 154 /* 155 * The current implementation of instruction stepping sets the TF flag 156 * in RFLAGS, causing the CPU to raise an INT1 after each instruction. 157 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception. 158 * 159 * This approach has a few limitations: 160 * 1. Stepping over a PUSHF/SAHF instruction will save the TF flag 161 * along with the other flags, possibly restoring it later. It would 162 * result in another INT1 when the flags are restored, triggering 163 * a stop in gdb that could be cleared by doing another step. 164 * 165 * Stepping over a POPF/LAHF instruction will let it overwrite the 166 * TF flags, ending the stepping mode. 167 * 168 * 2. Stepping over an instruction raising an exception (e.g. INT, DIV, 169 * or anything that could result in a page fault) will save the flags 170 * to the stack, clear the TF flag, and let the guest execute the 171 * handler. Normally, the guest will restore the original flags, 172 * that will continue single-stepping. 173 * 174 * 3. Debuggers running on the guest may wish to set TF to do instruction 175 * stepping. INT1 events generated by it would be intercepted by us, 176 * as long as the gdb is connected to QEMU. 177 * 178 * In practice this means that: 179 * 1. Stepping through flags-modifying instructions may cause gdb to 180 * continue or stop in unexpected places. This will be fully recoverable 181 * and will not crash the target. 182 * 183 * 2. Stepping over an instruction that triggers an exception will step 184 * over the exception handler, not into it. 185 * 186 * 3. Debugging the guest via gdb, while running debugger on the guest 187 * at the same time may lead to unexpected effects. Removing all 188 * breakpoints set via QEMU will prevent any further interference 189 * with the guest-level debuggers. 190 * 191 * The limitations can be addressed as shown below: 192 * 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of 193 * stepping through them. The exact semantics of the instructions is 194 * defined in the "Combined Volume Set of Intel 64 and IA-32 195 * Architectures Software Developer's Manuals", however it involves a 196 * fair amount of corner cases due to compatibility with real mode, 197 * virtual 8086 mode, and differences between 64-bit and 32-bit modes. 198 * 199 * 2. We could step into the guest's exception handlers using the following 200 * sequence: 201 * a. Temporarily enable catching of all exception types via 202 * whpx_set_exception_exit_bitmap(). 203 * b. Once an exception is intercepted, read the IDT/GDT and locate 204 * the original handler. 205 * c. Patch the original handler, injecting an INT3 at the beginning. 206 * d. Update the exception exit bitmap to only catch the 207 * WHvX64ExceptionTypeBreakpointTrap exception. 208 * e. Let the affected CPU run in the exclusive mode. 209 * f. Restore the original handler and the exception exit bitmap. 210 * Note that handling all corner cases related to IDT/GDT is harder 211 * than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a 212 * rough idea. 213 * 214 * 3. In order to properly support guest-level debugging in parallel with 215 * the QEMU-level debugging, we would need to be able to pass some INT1 216 * events to the guest. This could be done via the following methods: 217 * a. Using the WHvRegisterPendingEvent register. As of Windows 21H1, 218 * it seems to only work for interrupts and not software 219 * exceptions. 220 * b. Locating and patching the original handler by parsing IDT/GDT. 221 * This involves relatively complex logic outlined in the previous 222 * paragraph. 223 * c. Emulating the exception invocation (i.e. manually updating RIP, 224 * RFLAGS, and pushing the old values to stack). This is even more 225 * complicated than the previous option, since it involves checking 226 * CPL, gate attributes, and doing various adjustments depending 227 * on the current CPU mode, whether the CPL is changing, etc. 228 */ 229 typedef enum WhpxStepMode { 230 WHPX_STEP_NONE = 0, 231 /* Halt other VCPUs */ 232 WHPX_STEP_EXCLUSIVE, 233 } WhpxStepMode; 234 235 struct AccelCPUState { 236 WHV_EMULATOR_HANDLE emulator; 237 bool window_registered; 238 bool interruptable; 239 bool ready_for_pic_interrupt; 240 uint64_t tpr; 241 uint64_t apic_base; 242 bool interruption_pending; 243 244 /* Must be the last field as it may have a tail */ 245 WHV_RUN_VP_EXIT_CONTEXT exit_ctx; 246 }; 247 248 bool whpx_allowed; 249 static bool whp_dispatch_initialized; 250 static HMODULE hWinHvPlatform, hWinHvEmulation; 251 static uint32_t max_vcpu_index; 252 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap; 253 254 struct whpx_state whpx_global; 255 struct WHPDispatch whp_dispatch; 256 257 static bool whpx_has_xsave(void) 258 { 259 return whpx_xsave_cap.XsaveSupport; 260 } 261 262 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86, 263 int r86) 264 { 265 WHV_X64_SEGMENT_REGISTER hs; 266 unsigned flags = qs->flags; 267 268 hs.Base = qs->base; 269 hs.Limit = qs->limit; 270 hs.Selector = qs->selector; 271 272 if (v86) { 273 hs.Attributes = 0; 274 hs.SegmentType = 3; 275 hs.Present = 1; 276 hs.DescriptorPrivilegeLevel = 3; 277 hs.NonSystemSegment = 1; 278 279 } else { 280 hs.Attributes = (flags >> DESC_TYPE_SHIFT); 281 282 if (r86) { 283 /* hs.Base &= 0xfffff; */ 284 } 285 } 286 287 return hs; 288 } 289 290 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs) 291 { 292 SegmentCache qs; 293 294 qs.base = hs->Base; 295 qs.limit = hs->Limit; 296 qs.selector = hs->Selector; 297 298 qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT; 299 300 return qs; 301 } 302 303 /* X64 Extended Control Registers */ 304 static void whpx_set_xcrs(CPUState *cpu) 305 { 306 HRESULT hr; 307 struct whpx_state *whpx = &whpx_global; 308 WHV_REGISTER_VALUE xcr0; 309 WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; 310 311 if (!whpx_has_xsave()) { 312 return; 313 } 314 315 /* Only xcr0 is supported by the hypervisor currently */ 316 xcr0.Reg64 = cpu_env(cpu)->xcr0; 317 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 318 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); 319 if (FAILED(hr)) { 320 error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr); 321 } 322 } 323 324 static int whpx_set_tsc(CPUState *cpu) 325 { 326 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 327 WHV_REGISTER_VALUE tsc_val; 328 HRESULT hr; 329 struct whpx_state *whpx = &whpx_global; 330 331 /* 332 * Suspend the partition prior to setting the TSC to reduce the variance 333 * in TSC across vCPUs. When the first vCPU runs post suspend, the 334 * partition is automatically resumed. 335 */ 336 if (whp_dispatch.WHvSuspendPartitionTime) { 337 338 /* 339 * Unable to suspend partition while setting TSC is not a fatal 340 * error. It just increases the likelihood of TSC variance between 341 * vCPUs and some guest OS are able to handle that just fine. 342 */ 343 hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition); 344 if (FAILED(hr)) { 345 warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr); 346 } 347 } 348 349 tsc_val.Reg64 = cpu_env(cpu)->tsc; 350 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 351 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 352 if (FAILED(hr)) { 353 error_report("WHPX: Failed to set TSC, hr=%08lx", hr); 354 return -1; 355 } 356 357 return 0; 358 } 359 360 /* 361 * The CR8 register in the CPU is mapped to the TPR register of the APIC, 362 * however, they use a slightly different encoding. Specifically: 363 * 364 * APIC.TPR[bits 7:4] = CR8[bits 3:0] 365 * 366 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64 367 * and IA-32 Architectures Software Developer's Manual. 368 * 369 * The functions below translate the value of CR8 to TPR and vice versa. 370 */ 371 372 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr) 373 { 374 return tpr >> 4; 375 } 376 377 static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8) 378 { 379 return cr8 << 4; 380 } 381 382 static void whpx_set_registers(CPUState *cpu, int level) 383 { 384 struct whpx_state *whpx = &whpx_global; 385 AccelCPUState *vcpu = cpu->accel; 386 X86CPU *x86_cpu = X86_CPU(cpu); 387 CPUX86State *env = &x86_cpu->env; 388 struct whpx_register_set vcxt; 389 HRESULT hr; 390 int idx; 391 int idx_next; 392 int i; 393 int v86, r86; 394 395 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 396 397 /* 398 * Following MSRs have side effects on the guest or are too heavy for 399 * runtime. Limit them to full state update. 400 */ 401 if (level >= WHPX_SET_RESET_STATE) { 402 whpx_set_tsc(cpu); 403 } 404 405 memset(&vcxt, 0, sizeof(struct whpx_register_set)); 406 407 v86 = (env->eflags & VM_MASK); 408 r86 = !(env->cr[0] & CR0_PE_MASK); 409 410 vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 411 vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state); 412 413 idx = 0; 414 415 /* Indexes for first 16 registers match between HV and QEMU definitions */ 416 idx_next = 16; 417 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 418 vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx]; 419 } 420 idx = idx_next; 421 422 /* Same goes for RIP and RFLAGS */ 423 assert(whpx_register_names[idx] == WHvX64RegisterRip); 424 vcxt.values[idx++].Reg64 = env->eip; 425 426 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 427 vcxt.values[idx++].Reg64 = env->eflags; 428 429 /* Translate 6+4 segment registers. HV and QEMU order matches */ 430 assert(idx == WHvX64RegisterEs); 431 for (i = 0; i < 6; i += 1, idx += 1) { 432 vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86); 433 } 434 435 assert(idx == WHvX64RegisterLdtr); 436 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0); 437 438 assert(idx == WHvX64RegisterTr); 439 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0); 440 441 assert(idx == WHvX64RegisterIdtr); 442 vcxt.values[idx].Table.Base = env->idt.base; 443 vcxt.values[idx].Table.Limit = env->idt.limit; 444 idx += 1; 445 446 assert(idx == WHvX64RegisterGdtr); 447 vcxt.values[idx].Table.Base = env->gdt.base; 448 vcxt.values[idx].Table.Limit = env->gdt.limit; 449 idx += 1; 450 451 /* CR0, 2, 3, 4, 8 */ 452 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 453 vcxt.values[idx++].Reg64 = env->cr[0]; 454 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 455 vcxt.values[idx++].Reg64 = env->cr[2]; 456 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 457 vcxt.values[idx++].Reg64 = env->cr[3]; 458 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 459 vcxt.values[idx++].Reg64 = env->cr[4]; 460 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 461 vcxt.values[idx++].Reg64 = vcpu->tpr; 462 463 /* 8 Debug Registers - Skipped */ 464 465 /* 466 * Extended control registers needs to be handled separately depending 467 * on whether xsave is supported/enabled or not. 468 */ 469 whpx_set_xcrs(cpu); 470 471 /* 16 XMM registers */ 472 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 473 idx_next = idx + 16; 474 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 475 vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0); 476 vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1); 477 } 478 idx = idx_next; 479 480 /* 8 FP registers */ 481 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 482 for (i = 0; i < 8; i += 1, idx += 1) { 483 vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0); 484 /* vcxt.values[idx].Fp.AsUINT128.High64 = 485 env->fpregs[i].mmx.MMX_Q(1); 486 */ 487 } 488 489 /* FP control status register */ 490 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 491 vcxt.values[idx].FpControlStatus.FpControl = env->fpuc; 492 vcxt.values[idx].FpControlStatus.FpStatus = 493 (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; 494 vcxt.values[idx].FpControlStatus.FpTag = 0; 495 for (i = 0; i < 8; ++i) { 496 vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i; 497 } 498 vcxt.values[idx].FpControlStatus.Reserved = 0; 499 vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop; 500 vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip; 501 idx += 1; 502 503 /* XMM control status register */ 504 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 505 vcxt.values[idx].XmmControlStatus.LastFpRdp = 0; 506 vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr; 507 vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff; 508 idx += 1; 509 510 /* MSRs */ 511 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 512 vcxt.values[idx++].Reg64 = env->efer; 513 #ifdef TARGET_X86_64 514 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 515 vcxt.values[idx++].Reg64 = env->kernelgsbase; 516 #endif 517 518 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 519 vcxt.values[idx++].Reg64 = vcpu->apic_base; 520 521 /* WHvX64RegisterPat - Skipped */ 522 523 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 524 vcxt.values[idx++].Reg64 = env->sysenter_cs; 525 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 526 vcxt.values[idx++].Reg64 = env->sysenter_eip; 527 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 528 vcxt.values[idx++].Reg64 = env->sysenter_esp; 529 assert(whpx_register_names[idx] == WHvX64RegisterStar); 530 vcxt.values[idx++].Reg64 = env->star; 531 #ifdef TARGET_X86_64 532 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 533 vcxt.values[idx++].Reg64 = env->lstar; 534 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 535 vcxt.values[idx++].Reg64 = env->cstar; 536 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 537 vcxt.values[idx++].Reg64 = env->fmask; 538 #endif 539 540 /* Interrupt / Event Registers - Skipped */ 541 542 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 543 544 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 545 whpx->partition, cpu->cpu_index, 546 whpx_register_names, 547 RTL_NUMBER_OF(whpx_register_names), 548 &vcxt.values[0]); 549 550 if (FAILED(hr)) { 551 error_report("WHPX: Failed to set virtual processor context, hr=%08lx", 552 hr); 553 } 554 } 555 556 static int whpx_get_tsc(CPUState *cpu) 557 { 558 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 559 WHV_REGISTER_VALUE tsc_val; 560 HRESULT hr; 561 struct whpx_state *whpx = &whpx_global; 562 563 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 564 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 565 if (FAILED(hr)) { 566 error_report("WHPX: Failed to get TSC, hr=%08lx", hr); 567 return -1; 568 } 569 570 cpu_env(cpu)->tsc = tsc_val.Reg64; 571 return 0; 572 } 573 574 /* X64 Extended Control Registers */ 575 static void whpx_get_xcrs(CPUState *cpu) 576 { 577 HRESULT hr; 578 struct whpx_state *whpx = &whpx_global; 579 WHV_REGISTER_VALUE xcr0; 580 WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; 581 582 if (!whpx_has_xsave()) { 583 return; 584 } 585 586 /* Only xcr0 is supported by the hypervisor currently */ 587 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 588 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); 589 if (FAILED(hr)) { 590 error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr); 591 return; 592 } 593 594 cpu_env(cpu)->xcr0 = xcr0.Reg64; 595 } 596 597 static void whpx_get_registers(CPUState *cpu) 598 { 599 struct whpx_state *whpx = &whpx_global; 600 AccelCPUState *vcpu = cpu->accel; 601 X86CPU *x86_cpu = X86_CPU(cpu); 602 CPUX86State *env = &x86_cpu->env; 603 struct whpx_register_set vcxt; 604 uint64_t tpr, apic_base; 605 HRESULT hr; 606 int idx; 607 int idx_next; 608 int i; 609 610 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 611 612 if (!env->tsc_valid) { 613 whpx_get_tsc(cpu); 614 env->tsc_valid = !runstate_is_running(); 615 } 616 617 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 618 whpx->partition, cpu->cpu_index, 619 whpx_register_names, 620 RTL_NUMBER_OF(whpx_register_names), 621 &vcxt.values[0]); 622 if (FAILED(hr)) { 623 error_report("WHPX: Failed to get virtual processor context, hr=%08lx", 624 hr); 625 } 626 627 if (whpx_apic_in_platform()) { 628 /* 629 * Fetch the TPR value from the emulated APIC. It may get overwritten 630 * below with the value from CR8 returned by 631 * WHvGetVirtualProcessorRegisters(). 632 */ 633 whpx_apic_get(x86_cpu->apic_state); 634 vcpu->tpr = whpx_apic_tpr_to_cr8( 635 cpu_get_apic_tpr(x86_cpu->apic_state)); 636 } 637 638 idx = 0; 639 640 /* Indexes for first 16 registers match between HV and QEMU definitions */ 641 idx_next = 16; 642 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 643 env->regs[idx] = vcxt.values[idx].Reg64; 644 } 645 idx = idx_next; 646 647 /* Same goes for RIP and RFLAGS */ 648 assert(whpx_register_names[idx] == WHvX64RegisterRip); 649 env->eip = vcxt.values[idx++].Reg64; 650 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 651 env->eflags = vcxt.values[idx++].Reg64; 652 653 /* Translate 6+4 segment registers. HV and QEMU order matches */ 654 assert(idx == WHvX64RegisterEs); 655 for (i = 0; i < 6; i += 1, idx += 1) { 656 env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment); 657 } 658 659 assert(idx == WHvX64RegisterLdtr); 660 env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment); 661 assert(idx == WHvX64RegisterTr); 662 env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment); 663 assert(idx == WHvX64RegisterIdtr); 664 env->idt.base = vcxt.values[idx].Table.Base; 665 env->idt.limit = vcxt.values[idx].Table.Limit; 666 idx += 1; 667 assert(idx == WHvX64RegisterGdtr); 668 env->gdt.base = vcxt.values[idx].Table.Base; 669 env->gdt.limit = vcxt.values[idx].Table.Limit; 670 idx += 1; 671 672 /* CR0, 2, 3, 4, 8 */ 673 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 674 env->cr[0] = vcxt.values[idx++].Reg64; 675 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 676 env->cr[2] = vcxt.values[idx++].Reg64; 677 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 678 env->cr[3] = vcxt.values[idx++].Reg64; 679 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 680 env->cr[4] = vcxt.values[idx++].Reg64; 681 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 682 tpr = vcxt.values[idx++].Reg64; 683 if (tpr != vcpu->tpr) { 684 vcpu->tpr = tpr; 685 cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr)); 686 } 687 688 /* 8 Debug Registers - Skipped */ 689 690 /* 691 * Extended control registers needs to be handled separately depending 692 * on whether xsave is supported/enabled or not. 693 */ 694 whpx_get_xcrs(cpu); 695 696 /* 16 XMM registers */ 697 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 698 idx_next = idx + 16; 699 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 700 env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64; 701 env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64; 702 } 703 idx = idx_next; 704 705 /* 8 FP registers */ 706 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 707 for (i = 0; i < 8; i += 1, idx += 1) { 708 env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64; 709 /* env->fpregs[i].mmx.MMX_Q(1) = 710 vcxt.values[idx].Fp.AsUINT128.High64; 711 */ 712 } 713 714 /* FP control status register */ 715 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 716 env->fpuc = vcxt.values[idx].FpControlStatus.FpControl; 717 env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7; 718 env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800; 719 for (i = 0; i < 8; ++i) { 720 env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1); 721 } 722 env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp; 723 env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip; 724 idx += 1; 725 726 /* XMM control status register */ 727 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 728 env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl; 729 idx += 1; 730 731 /* MSRs */ 732 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 733 env->efer = vcxt.values[idx++].Reg64; 734 #ifdef TARGET_X86_64 735 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 736 env->kernelgsbase = vcxt.values[idx++].Reg64; 737 #endif 738 739 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 740 apic_base = vcxt.values[idx++].Reg64; 741 if (apic_base != vcpu->apic_base) { 742 vcpu->apic_base = apic_base; 743 cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base); 744 } 745 746 /* WHvX64RegisterPat - Skipped */ 747 748 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 749 env->sysenter_cs = vcxt.values[idx++].Reg64; 750 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 751 env->sysenter_eip = vcxt.values[idx++].Reg64; 752 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 753 env->sysenter_esp = vcxt.values[idx++].Reg64; 754 assert(whpx_register_names[idx] == WHvX64RegisterStar); 755 env->star = vcxt.values[idx++].Reg64; 756 #ifdef TARGET_X86_64 757 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 758 env->lstar = vcxt.values[idx++].Reg64; 759 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 760 env->cstar = vcxt.values[idx++].Reg64; 761 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 762 env->fmask = vcxt.values[idx++].Reg64; 763 #endif 764 765 /* Interrupt / Event Registers - Skipped */ 766 767 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 768 769 if (whpx_apic_in_platform()) { 770 whpx_apic_get(x86_cpu->apic_state); 771 } 772 773 x86_update_hflags(env); 774 } 775 776 static HRESULT CALLBACK whpx_emu_ioport_callback( 777 void *ctx, 778 WHV_EMULATOR_IO_ACCESS_INFO *IoAccess) 779 { 780 MemTxAttrs attrs = { 0 }; 781 address_space_rw(&address_space_io, IoAccess->Port, attrs, 782 &IoAccess->Data, IoAccess->AccessSize, 783 IoAccess->Direction); 784 return S_OK; 785 } 786 787 static HRESULT CALLBACK whpx_emu_mmio_callback( 788 void *ctx, 789 WHV_EMULATOR_MEMORY_ACCESS_INFO *ma) 790 { 791 cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize, 792 ma->Direction); 793 return S_OK; 794 } 795 796 static HRESULT CALLBACK whpx_emu_getreg_callback( 797 void *ctx, 798 const WHV_REGISTER_NAME *RegisterNames, 799 UINT32 RegisterCount, 800 WHV_REGISTER_VALUE *RegisterValues) 801 { 802 HRESULT hr; 803 struct whpx_state *whpx = &whpx_global; 804 CPUState *cpu = (CPUState *)ctx; 805 806 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 807 whpx->partition, cpu->cpu_index, 808 RegisterNames, RegisterCount, 809 RegisterValues); 810 if (FAILED(hr)) { 811 error_report("WHPX: Failed to get virtual processor registers," 812 " hr=%08lx", hr); 813 } 814 815 return hr; 816 } 817 818 static HRESULT CALLBACK whpx_emu_setreg_callback( 819 void *ctx, 820 const WHV_REGISTER_NAME *RegisterNames, 821 UINT32 RegisterCount, 822 const WHV_REGISTER_VALUE *RegisterValues) 823 { 824 HRESULT hr; 825 struct whpx_state *whpx = &whpx_global; 826 CPUState *cpu = (CPUState *)ctx; 827 828 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 829 whpx->partition, cpu->cpu_index, 830 RegisterNames, RegisterCount, 831 RegisterValues); 832 if (FAILED(hr)) { 833 error_report("WHPX: Failed to set virtual processor registers," 834 " hr=%08lx", hr); 835 } 836 837 /* 838 * The emulator just successfully wrote the register state. We clear the 839 * dirty state so we avoid the double write on resume of the VP. 840 */ 841 cpu->vcpu_dirty = false; 842 843 return hr; 844 } 845 846 static HRESULT CALLBACK whpx_emu_translate_callback( 847 void *ctx, 848 WHV_GUEST_VIRTUAL_ADDRESS Gva, 849 WHV_TRANSLATE_GVA_FLAGS TranslateFlags, 850 WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult, 851 WHV_GUEST_PHYSICAL_ADDRESS *Gpa) 852 { 853 HRESULT hr; 854 struct whpx_state *whpx = &whpx_global; 855 CPUState *cpu = (CPUState *)ctx; 856 WHV_TRANSLATE_GVA_RESULT res; 857 858 hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index, 859 Gva, TranslateFlags, &res, Gpa); 860 if (FAILED(hr)) { 861 error_report("WHPX: Failed to translate GVA, hr=%08lx", hr); 862 } else { 863 *TranslationResult = res.ResultCode; 864 } 865 866 return hr; 867 } 868 869 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = { 870 .Size = sizeof(WHV_EMULATOR_CALLBACKS), 871 .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback, 872 .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback, 873 .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback, 874 .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback, 875 .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback, 876 }; 877 878 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx) 879 { 880 HRESULT hr; 881 AccelCPUState *vcpu = cpu->accel; 882 WHV_EMULATOR_STATUS emu_status; 883 884 hr = whp_dispatch.WHvEmulatorTryMmioEmulation( 885 vcpu->emulator, cpu, 886 &vcpu->exit_ctx.VpContext, ctx, 887 &emu_status); 888 if (FAILED(hr)) { 889 error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr); 890 return -1; 891 } 892 893 if (!emu_status.EmulationSuccessful) { 894 error_report("WHPX: Failed to emulate MMIO access with" 895 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 896 return -1; 897 } 898 899 return 0; 900 } 901 902 static int whpx_handle_portio(CPUState *cpu, 903 WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx) 904 { 905 HRESULT hr; 906 AccelCPUState *vcpu = cpu->accel; 907 WHV_EMULATOR_STATUS emu_status; 908 909 hr = whp_dispatch.WHvEmulatorTryIoEmulation( 910 vcpu->emulator, cpu, 911 &vcpu->exit_ctx.VpContext, ctx, 912 &emu_status); 913 if (FAILED(hr)) { 914 error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr); 915 return -1; 916 } 917 918 if (!emu_status.EmulationSuccessful) { 919 error_report("WHPX: Failed to emulate PortIO access with" 920 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 921 return -1; 922 } 923 924 return 0; 925 } 926 927 /* 928 * Controls whether we should intercept various exceptions on the guest, 929 * namely breakpoint/single-step events. 930 * 931 * The 'exceptions' argument accepts a bitmask, e.g: 932 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...) 933 */ 934 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions) 935 { 936 struct whpx_state *whpx = &whpx_global; 937 WHV_PARTITION_PROPERTY prop = { 0, }; 938 HRESULT hr; 939 940 if (exceptions == whpx->exception_exit_bitmap) { 941 return S_OK; 942 } 943 944 prop.ExceptionExitBitmap = exceptions; 945 946 hr = whp_dispatch.WHvSetPartitionProperty( 947 whpx->partition, 948 WHvPartitionPropertyCodeExceptionExitBitmap, 949 &prop, 950 sizeof(WHV_PARTITION_PROPERTY)); 951 952 if (SUCCEEDED(hr)) { 953 whpx->exception_exit_bitmap = exceptions; 954 } 955 956 return hr; 957 } 958 959 960 /* 961 * This function is called before/after stepping over a single instruction. 962 * It will update the CPU registers to arm/disarm the instruction stepping 963 * accordingly. 964 */ 965 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu, 966 bool set, 967 uint64_t *exit_context_rflags) 968 { 969 WHV_REGISTER_NAME reg_name; 970 WHV_REGISTER_VALUE reg_value; 971 HRESULT hr; 972 struct whpx_state *whpx = &whpx_global; 973 974 /* 975 * If we are trying to step over a single instruction, we need to set the 976 * TF bit in rflags. Otherwise, clear it. 977 */ 978 reg_name = WHvX64RegisterRflags; 979 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 980 whpx->partition, 981 cpu->cpu_index, 982 ®_name, 983 1, 984 ®_value); 985 986 if (FAILED(hr)) { 987 error_report("WHPX: Failed to get rflags, hr=%08lx", hr); 988 return hr; 989 } 990 991 if (exit_context_rflags) { 992 assert(*exit_context_rflags == reg_value.Reg64); 993 } 994 995 if (set) { 996 /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */ 997 reg_value.Reg64 |= TF_MASK; 998 } else { 999 reg_value.Reg64 &= ~TF_MASK; 1000 } 1001 1002 if (exit_context_rflags) { 1003 *exit_context_rflags = reg_value.Reg64; 1004 } 1005 1006 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1007 whpx->partition, 1008 cpu->cpu_index, 1009 ®_name, 1010 1, 1011 ®_value); 1012 1013 if (FAILED(hr)) { 1014 error_report("WHPX: Failed to set rflags," 1015 " hr=%08lx", 1016 hr); 1017 return hr; 1018 } 1019 1020 reg_name = WHvRegisterInterruptState; 1021 reg_value.Reg64 = 0; 1022 1023 /* Suspend delivery of hardware interrupts during single-stepping. */ 1024 reg_value.InterruptState.InterruptShadow = set != 0; 1025 1026 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1027 whpx->partition, 1028 cpu->cpu_index, 1029 ®_name, 1030 1, 1031 ®_value); 1032 1033 if (FAILED(hr)) { 1034 error_report("WHPX: Failed to set InterruptState," 1035 " hr=%08lx", 1036 hr); 1037 return hr; 1038 } 1039 1040 if (!set) { 1041 /* 1042 * We have just finished stepping over a single instruction, 1043 * and intercepted the INT1 generated by it. 1044 * We need to now hide the INT1 from the guest, 1045 * as it would not be expecting it. 1046 */ 1047 1048 reg_name = WHvX64RegisterPendingDebugException; 1049 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1050 whpx->partition, 1051 cpu->cpu_index, 1052 ®_name, 1053 1, 1054 ®_value); 1055 1056 if (FAILED(hr)) { 1057 error_report("WHPX: Failed to get pending debug exceptions," 1058 "hr=%08lx", hr); 1059 return hr; 1060 } 1061 1062 if (reg_value.PendingDebugException.SingleStep) { 1063 reg_value.PendingDebugException.SingleStep = 0; 1064 1065 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1066 whpx->partition, 1067 cpu->cpu_index, 1068 ®_name, 1069 1, 1070 ®_value); 1071 1072 if (FAILED(hr)) { 1073 error_report("WHPX: Failed to clear pending debug exceptions," 1074 "hr=%08lx", hr); 1075 return hr; 1076 } 1077 } 1078 1079 } 1080 1081 return S_OK; 1082 } 1083 1084 /* Tries to find a breakpoint at the specified address. */ 1085 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address) 1086 { 1087 struct whpx_state *whpx = &whpx_global; 1088 int i; 1089 1090 if (whpx->breakpoints.breakpoints) { 1091 for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) { 1092 if (address == whpx->breakpoints.breakpoints->data[i].address) { 1093 return &whpx->breakpoints.breakpoints->data[i]; 1094 } 1095 } 1096 } 1097 1098 return NULL; 1099 } 1100 1101 /* 1102 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for 1103 * debugging user-mode applications. Since the WHPX API does not offer 1104 * an easy way to pass the intercepted exception back to the guest, we 1105 * resort to using INT1 instead, and let the guest always handle INT3. 1106 */ 1107 static const uint8_t whpx_breakpoint_instruction = 0xF1; 1108 1109 /* 1110 * The WHPX QEMU backend implements breakpoints by writing the INT1 1111 * instruction into memory (ignoring the DRx registers). This raises a few 1112 * issues that need to be carefully handled: 1113 * 1114 * 1. Although unlikely, other parts of QEMU may set multiple breakpoints 1115 * at the same location, and later remove them in arbitrary order. 1116 * This should not cause memory corruption, and should only remove the 1117 * physical breakpoint instruction when the last QEMU breakpoint is gone. 1118 * 1119 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid 1120 * physical location. Hence, physically adding/removing a breakpoint can 1121 * theoretically fail at any time. We need to keep track of it. 1122 * 1123 * The function below rebuilds a list of low-level breakpoints (one per 1124 * address, tracking the original instruction and any errors) from the list of 1125 * high-level breakpoints (set via cpu_breakpoint_insert()). 1126 * 1127 * In order to optimize performance, this function stores the list of 1128 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the 1129 * low-level ones, so that it won't be re-invoked until these breakpoints 1130 * change. 1131 * 1132 * Note that this function decides which breakpoints should be inserted into, 1133 * memory, but doesn't actually do it. The memory accessing is done in 1134 * whpx_apply_breakpoints(). 1135 */ 1136 static void whpx_translate_cpu_breakpoints( 1137 struct whpx_breakpoints *breakpoints, 1138 CPUState *cpu, 1139 int cpu_breakpoint_count) 1140 { 1141 CPUBreakpoint *bp; 1142 int cpu_bp_index = 0; 1143 1144 breakpoints->original_addresses = 1145 g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count); 1146 1147 breakpoints->original_address_count = cpu_breakpoint_count; 1148 1149 int max_breakpoints = cpu_breakpoint_count + 1150 (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0); 1151 1152 struct whpx_breakpoint_collection *new_breakpoints = 1153 g_malloc0(sizeof(struct whpx_breakpoint_collection) 1154 + max_breakpoints * sizeof(struct whpx_breakpoint)); 1155 1156 new_breakpoints->allocated = max_breakpoints; 1157 new_breakpoints->used = 0; 1158 1159 /* 1160 * 1. Preserve all old breakpoints that could not be automatically 1161 * cleared when the CPU got stopped. 1162 */ 1163 if (breakpoints->breakpoints) { 1164 int i; 1165 for (i = 0; i < breakpoints->breakpoints->used; i++) { 1166 if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) { 1167 new_breakpoints->data[new_breakpoints->used++] = 1168 breakpoints->breakpoints->data[i]; 1169 } 1170 } 1171 } 1172 1173 /* 2. Map all CPU breakpoints to WHPX breakpoints */ 1174 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1175 int i; 1176 bool found = false; 1177 1178 /* This will be used to detect changed CPU breakpoints later. */ 1179 breakpoints->original_addresses[cpu_bp_index++] = bp->pc; 1180 1181 for (i = 0; i < new_breakpoints->used; i++) { 1182 /* 1183 * WARNING: This loop has O(N^2) complexity, where N is the 1184 * number of breakpoints. It should not be a bottleneck in 1185 * real-world scenarios, since it only needs to run once after 1186 * the breakpoints have been modified. 1187 * If this ever becomes a concern, it can be optimized by storing 1188 * high-level breakpoint objects in a tree or hash map. 1189 */ 1190 1191 if (new_breakpoints->data[i].address == bp->pc) { 1192 /* There was already a breakpoint at this address. */ 1193 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) { 1194 new_breakpoints->data[i].state = WHPX_BP_SET; 1195 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) { 1196 new_breakpoints->data[i].state = WHPX_BP_SET_PENDING; 1197 } 1198 1199 found = true; 1200 break; 1201 } 1202 } 1203 1204 if (!found && new_breakpoints->used < new_breakpoints->allocated) { 1205 /* No WHPX breakpoint at this address. Create one. */ 1206 new_breakpoints->data[new_breakpoints->used].address = bp->pc; 1207 new_breakpoints->data[new_breakpoints->used].state = 1208 WHPX_BP_SET_PENDING; 1209 new_breakpoints->used++; 1210 } 1211 } 1212 1213 /* 1214 * Free the previous breakpoint list. This can be optimized by keeping 1215 * it as shadow buffer for the next computation instead of freeing 1216 * it immediately. 1217 */ 1218 g_free(breakpoints->breakpoints); 1219 1220 breakpoints->breakpoints = new_breakpoints; 1221 } 1222 1223 /* 1224 * Physically inserts/removes the breakpoints by reading and writing the 1225 * physical memory, keeping a track of the failed attempts. 1226 * 1227 * Passing resuming=true will try to set all previously unset breakpoints. 1228 * Passing resuming=false will remove all inserted ones. 1229 */ 1230 static void whpx_apply_breakpoints( 1231 struct whpx_breakpoint_collection *breakpoints, 1232 CPUState *cpu, 1233 bool resuming) 1234 { 1235 int i, rc; 1236 if (!breakpoints) { 1237 return; 1238 } 1239 1240 for (i = 0; i < breakpoints->used; i++) { 1241 /* Decide what to do right now based on the last known state. */ 1242 WhpxBreakpointState state = breakpoints->data[i].state; 1243 switch (state) { 1244 case WHPX_BP_CLEARED: 1245 if (resuming) { 1246 state = WHPX_BP_SET_PENDING; 1247 } 1248 break; 1249 case WHPX_BP_SET_PENDING: 1250 if (!resuming) { 1251 state = WHPX_BP_CLEARED; 1252 } 1253 break; 1254 case WHPX_BP_SET: 1255 if (!resuming) { 1256 state = WHPX_BP_CLEAR_PENDING; 1257 } 1258 break; 1259 case WHPX_BP_CLEAR_PENDING: 1260 if (resuming) { 1261 state = WHPX_BP_SET; 1262 } 1263 break; 1264 } 1265 1266 if (state == WHPX_BP_SET_PENDING) { 1267 /* Remember the original instruction. */ 1268 rc = cpu_memory_rw_debug(cpu, 1269 breakpoints->data[i].address, 1270 &breakpoints->data[i].original_instruction, 1271 1, 1272 false); 1273 1274 if (!rc) { 1275 /* Write the breakpoint instruction. */ 1276 rc = cpu_memory_rw_debug(cpu, 1277 breakpoints->data[i].address, 1278 (void *)&whpx_breakpoint_instruction, 1279 1, 1280 true); 1281 } 1282 1283 if (!rc) { 1284 state = WHPX_BP_SET; 1285 } 1286 1287 } 1288 1289 if (state == WHPX_BP_CLEAR_PENDING) { 1290 /* Restore the original instruction. */ 1291 rc = cpu_memory_rw_debug(cpu, 1292 breakpoints->data[i].address, 1293 &breakpoints->data[i].original_instruction, 1294 1, 1295 true); 1296 1297 if (!rc) { 1298 state = WHPX_BP_CLEARED; 1299 } 1300 } 1301 1302 breakpoints->data[i].state = state; 1303 } 1304 } 1305 1306 /* 1307 * This function is called when the a VCPU is about to start and no other 1308 * VCPUs have been started so far. Since the VCPU start order could be 1309 * arbitrary, it doesn't have to be VCPU#0. 1310 * 1311 * It is used to commit the breakpoints into memory, and configure WHPX 1312 * to intercept debug exceptions. 1313 * 1314 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or 1315 * more VCPUs are already running, so this is the best place to do it. 1316 */ 1317 static int whpx_first_vcpu_starting(CPUState *cpu) 1318 { 1319 struct whpx_state *whpx = &whpx_global; 1320 HRESULT hr; 1321 1322 g_assert(bql_locked()); 1323 1324 if (!QTAILQ_EMPTY(&cpu->breakpoints) || 1325 (whpx->breakpoints.breakpoints && 1326 whpx->breakpoints.breakpoints->used)) { 1327 CPUBreakpoint *bp; 1328 int i = 0; 1329 bool update_pending = false; 1330 1331 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1332 if (i >= whpx->breakpoints.original_address_count || 1333 bp->pc != whpx->breakpoints.original_addresses[i]) { 1334 update_pending = true; 1335 } 1336 1337 i++; 1338 } 1339 1340 if (i != whpx->breakpoints.original_address_count) { 1341 update_pending = true; 1342 } 1343 1344 if (update_pending) { 1345 /* 1346 * The CPU breakpoints have changed since the last call to 1347 * whpx_translate_cpu_breakpoints(). WHPX breakpoints must 1348 * now be recomputed. 1349 */ 1350 whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i); 1351 } 1352 1353 /* Actually insert the breakpoints into the memory. */ 1354 whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true); 1355 } 1356 1357 uint64_t exception_mask; 1358 if (whpx->step_pending || 1359 (whpx->breakpoints.breakpoints && 1360 whpx->breakpoints.breakpoints->used)) { 1361 /* 1362 * We are either attempting to single-step one or more CPUs, or 1363 * have one or more breakpoints enabled. Both require intercepting 1364 * the WHvX64ExceptionTypeBreakpointTrap exception. 1365 */ 1366 1367 exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault; 1368 } else { 1369 /* Let the guest handle all exceptions. */ 1370 exception_mask = 0; 1371 } 1372 1373 hr = whpx_set_exception_exit_bitmap(exception_mask); 1374 if (!SUCCEEDED(hr)) { 1375 error_report("WHPX: Failed to update exception exit mask," 1376 "hr=%08lx.", hr); 1377 return 1; 1378 } 1379 1380 return 0; 1381 } 1382 1383 /* 1384 * This function is called when the last VCPU has finished running. 1385 * It is used to remove any previously set breakpoints from memory. 1386 */ 1387 static int whpx_last_vcpu_stopping(CPUState *cpu) 1388 { 1389 whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false); 1390 return 0; 1391 } 1392 1393 /* Returns the address of the next instruction that is about to be executed. */ 1394 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid) 1395 { 1396 if (cpu->vcpu_dirty) { 1397 /* The CPU registers have been modified by other parts of QEMU. */ 1398 return cpu_env(cpu)->eip; 1399 } else if (exit_context_valid) { 1400 /* 1401 * The CPU registers have not been modified by neither other parts 1402 * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters(). 1403 * This is the most common case. 1404 */ 1405 AccelCPUState *vcpu = cpu->accel; 1406 return vcpu->exit_ctx.VpContext.Rip; 1407 } else { 1408 /* 1409 * The CPU registers have been modified by a call to 1410 * WHvSetVirtualProcessorRegisters() and must be re-queried from 1411 * the target. 1412 */ 1413 WHV_REGISTER_VALUE reg_value; 1414 WHV_REGISTER_NAME reg_name = WHvX64RegisterRip; 1415 HRESULT hr; 1416 struct whpx_state *whpx = &whpx_global; 1417 1418 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1419 whpx->partition, 1420 cpu->cpu_index, 1421 ®_name, 1422 1, 1423 ®_value); 1424 1425 if (FAILED(hr)) { 1426 error_report("WHPX: Failed to get PC, hr=%08lx", hr); 1427 return 0; 1428 } 1429 1430 return reg_value.Reg64; 1431 } 1432 } 1433 1434 static int whpx_handle_halt(CPUState *cpu) 1435 { 1436 int ret = 0; 1437 1438 bql_lock(); 1439 if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1440 (cpu_env(cpu)->eflags & IF_MASK)) && 1441 !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1442 cpu->exception_index = EXCP_HLT; 1443 cpu->halted = true; 1444 ret = 1; 1445 } 1446 bql_unlock(); 1447 1448 return ret; 1449 } 1450 1451 static void whpx_vcpu_pre_run(CPUState *cpu) 1452 { 1453 HRESULT hr; 1454 struct whpx_state *whpx = &whpx_global; 1455 AccelCPUState *vcpu = cpu->accel; 1456 X86CPU *x86_cpu = X86_CPU(cpu); 1457 CPUX86State *env = &x86_cpu->env; 1458 int irq; 1459 uint8_t tpr; 1460 WHV_X64_PENDING_INTERRUPTION_REGISTER new_int; 1461 UINT32 reg_count = 0; 1462 WHV_REGISTER_VALUE reg_values[3]; 1463 WHV_REGISTER_NAME reg_names[3]; 1464 1465 memset(&new_int, 0, sizeof(new_int)); 1466 memset(reg_values, 0, sizeof(reg_values)); 1467 1468 bql_lock(); 1469 1470 /* Inject NMI */ 1471 if (!vcpu->interruption_pending && 1472 cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { 1473 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { 1474 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 1475 vcpu->interruptable = false; 1476 new_int.InterruptionType = WHvX64PendingNmi; 1477 new_int.InterruptionPending = 1; 1478 new_int.InterruptionVector = 2; 1479 } 1480 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 1481 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 1482 } 1483 } 1484 1485 /* 1486 * Force the VCPU out of its inner loop to process any INIT requests or 1487 * commit pending TPR access. 1488 */ 1489 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { 1490 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1491 !(env->hflags & HF_SMM_MASK)) { 1492 cpu->exit_request = 1; 1493 } 1494 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1495 cpu->exit_request = 1; 1496 } 1497 } 1498 1499 /* Get pending hard interruption or replay one that was overwritten */ 1500 if (!whpx_apic_in_platform()) { 1501 if (!vcpu->interruption_pending && 1502 vcpu->interruptable && (env->eflags & IF_MASK)) { 1503 assert(!new_int.InterruptionPending); 1504 if (cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1505 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1506 irq = cpu_get_pic_interrupt(env); 1507 if (irq >= 0) { 1508 new_int.InterruptionType = WHvX64PendingInterrupt; 1509 new_int.InterruptionPending = 1; 1510 new_int.InterruptionVector = irq; 1511 } 1512 } 1513 } 1514 1515 /* Setup interrupt state if new one was prepared */ 1516 if (new_int.InterruptionPending) { 1517 reg_values[reg_count].PendingInterruption = new_int; 1518 reg_names[reg_count] = WHvRegisterPendingInterruption; 1519 reg_count += 1; 1520 } 1521 } else if (vcpu->ready_for_pic_interrupt && 1522 (cpu->interrupt_request & CPU_INTERRUPT_HARD)) { 1523 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1524 irq = cpu_get_pic_interrupt(env); 1525 if (irq >= 0) { 1526 reg_names[reg_count] = WHvRegisterPendingEvent; 1527 reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT) 1528 { 1529 .EventPending = 1, 1530 .EventType = WHvX64PendingEventExtInt, 1531 .Vector = irq, 1532 }; 1533 reg_count += 1; 1534 } 1535 } 1536 1537 /* Sync the TPR to the CR8 if was modified during the intercept */ 1538 tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 1539 if (tpr != vcpu->tpr) { 1540 vcpu->tpr = tpr; 1541 reg_values[reg_count].Reg64 = tpr; 1542 cpu->exit_request = 1; 1543 reg_names[reg_count] = WHvX64RegisterCr8; 1544 reg_count += 1; 1545 } 1546 1547 /* Update the state of the interrupt delivery notification */ 1548 if (!vcpu->window_registered && 1549 cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1550 reg_values[reg_count].DeliverabilityNotifications = 1551 (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) { 1552 .InterruptNotification = 1 1553 }; 1554 vcpu->window_registered = 1; 1555 reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications; 1556 reg_count += 1; 1557 } 1558 1559 bql_unlock(); 1560 vcpu->ready_for_pic_interrupt = false; 1561 1562 if (reg_count) { 1563 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1564 whpx->partition, cpu->cpu_index, 1565 reg_names, reg_count, reg_values); 1566 if (FAILED(hr)) { 1567 error_report("WHPX: Failed to set interrupt state registers," 1568 " hr=%08lx", hr); 1569 } 1570 } 1571 } 1572 1573 static void whpx_vcpu_post_run(CPUState *cpu) 1574 { 1575 AccelCPUState *vcpu = cpu->accel; 1576 X86CPU *x86_cpu = X86_CPU(cpu); 1577 CPUX86State *env = &x86_cpu->env; 1578 1579 env->eflags = vcpu->exit_ctx.VpContext.Rflags; 1580 1581 uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8; 1582 if (vcpu->tpr != tpr) { 1583 vcpu->tpr = tpr; 1584 bql_lock(); 1585 cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr)); 1586 bql_unlock(); 1587 } 1588 1589 vcpu->interruption_pending = 1590 vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending; 1591 1592 vcpu->interruptable = 1593 !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow; 1594 } 1595 1596 static void whpx_vcpu_process_async_events(CPUState *cpu) 1597 { 1598 X86CPU *x86_cpu = X86_CPU(cpu); 1599 CPUX86State *env = &x86_cpu->env; 1600 AccelCPUState *vcpu = cpu->accel; 1601 1602 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1603 !(env->hflags & HF_SMM_MASK)) { 1604 whpx_cpu_synchronize_state(cpu); 1605 do_cpu_init(x86_cpu); 1606 vcpu->interruptable = true; 1607 } 1608 1609 if (cpu->interrupt_request & CPU_INTERRUPT_POLL) { 1610 cpu->interrupt_request &= ~CPU_INTERRUPT_POLL; 1611 apic_poll_irq(x86_cpu->apic_state); 1612 } 1613 1614 if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1615 (env->eflags & IF_MASK)) || 1616 (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1617 cpu->halted = false; 1618 } 1619 1620 if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) { 1621 whpx_cpu_synchronize_state(cpu); 1622 do_cpu_sipi(x86_cpu); 1623 } 1624 1625 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1626 cpu->interrupt_request &= ~CPU_INTERRUPT_TPR; 1627 whpx_cpu_synchronize_state(cpu); 1628 apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip, 1629 env->tpr_access_type); 1630 } 1631 } 1632 1633 static int whpx_vcpu_run(CPUState *cpu) 1634 { 1635 HRESULT hr; 1636 struct whpx_state *whpx = &whpx_global; 1637 AccelCPUState *vcpu = cpu->accel; 1638 struct whpx_breakpoint *stepped_over_bp = NULL; 1639 WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE; 1640 int ret; 1641 1642 g_assert(bql_locked()); 1643 1644 if (whpx->running_cpus++ == 0) { 1645 /* Insert breakpoints into memory, update exception exit bitmap. */ 1646 ret = whpx_first_vcpu_starting(cpu); 1647 if (ret != 0) { 1648 return ret; 1649 } 1650 } 1651 1652 if (whpx->breakpoints.breakpoints && 1653 whpx->breakpoints.breakpoints->used > 0) 1654 { 1655 uint64_t pc = whpx_vcpu_get_pc(cpu, true); 1656 stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc); 1657 if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) { 1658 stepped_over_bp = NULL; 1659 } 1660 1661 if (stepped_over_bp) { 1662 /* 1663 * We are trying to run the instruction overwritten by an active 1664 * breakpoint. We will temporarily disable the breakpoint, suspend 1665 * other CPUs, and step over the instruction. 1666 */ 1667 exclusive_step_mode = WHPX_STEP_EXCLUSIVE; 1668 } 1669 } 1670 1671 if (exclusive_step_mode == WHPX_STEP_NONE) { 1672 whpx_vcpu_process_async_events(cpu); 1673 if (cpu->halted && !whpx_apic_in_platform()) { 1674 cpu->exception_index = EXCP_HLT; 1675 qatomic_set(&cpu->exit_request, false); 1676 return 0; 1677 } 1678 } 1679 1680 bql_unlock(); 1681 1682 if (exclusive_step_mode != WHPX_STEP_NONE) { 1683 start_exclusive(); 1684 g_assert(cpu == current_cpu); 1685 g_assert(!cpu->running); 1686 cpu->running = true; 1687 1688 hr = whpx_set_exception_exit_bitmap( 1689 1UL << WHvX64ExceptionTypeDebugTrapOrFault); 1690 if (!SUCCEEDED(hr)) { 1691 error_report("WHPX: Failed to update exception exit mask, " 1692 "hr=%08lx.", hr); 1693 return 1; 1694 } 1695 1696 if (stepped_over_bp) { 1697 /* Temporarily disable the triggered breakpoint. */ 1698 cpu_memory_rw_debug(cpu, 1699 stepped_over_bp->address, 1700 &stepped_over_bp->original_instruction, 1701 1, 1702 true); 1703 } 1704 } else { 1705 cpu_exec_start(cpu); 1706 } 1707 1708 do { 1709 if (cpu->vcpu_dirty) { 1710 whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE); 1711 cpu->vcpu_dirty = false; 1712 } 1713 1714 if (exclusive_step_mode == WHPX_STEP_NONE) { 1715 whpx_vcpu_pre_run(cpu); 1716 1717 if (qatomic_read(&cpu->exit_request)) { 1718 whpx_vcpu_kick(cpu); 1719 } 1720 } 1721 1722 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1723 whpx_vcpu_configure_single_stepping(cpu, true, NULL); 1724 } 1725 1726 hr = whp_dispatch.WHvRunVirtualProcessor( 1727 whpx->partition, cpu->cpu_index, 1728 &vcpu->exit_ctx, sizeof(vcpu->exit_ctx)); 1729 1730 if (FAILED(hr)) { 1731 error_report("WHPX: Failed to exec a virtual processor," 1732 " hr=%08lx", hr); 1733 ret = -1; 1734 break; 1735 } 1736 1737 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1738 whpx_vcpu_configure_single_stepping(cpu, 1739 false, 1740 &vcpu->exit_ctx.VpContext.Rflags); 1741 } 1742 1743 whpx_vcpu_post_run(cpu); 1744 1745 switch (vcpu->exit_ctx.ExitReason) { 1746 case WHvRunVpExitReasonMemoryAccess: 1747 ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess); 1748 break; 1749 1750 case WHvRunVpExitReasonX64IoPortAccess: 1751 ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess); 1752 break; 1753 1754 case WHvRunVpExitReasonX64InterruptWindow: 1755 vcpu->ready_for_pic_interrupt = 1; 1756 vcpu->window_registered = 0; 1757 ret = 0; 1758 break; 1759 1760 case WHvRunVpExitReasonX64ApicEoi: 1761 assert(whpx_apic_in_platform()); 1762 ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector); 1763 break; 1764 1765 case WHvRunVpExitReasonX64Halt: 1766 /* 1767 * WARNING: as of build 19043.1526 (21H1), this exit reason is no 1768 * longer used. 1769 */ 1770 ret = whpx_handle_halt(cpu); 1771 break; 1772 1773 case WHvRunVpExitReasonX64ApicInitSipiTrap: { 1774 WHV_INTERRUPT_CONTROL ipi = {0}; 1775 uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr; 1776 uint32_t delivery_mode = 1777 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT; 1778 int dest_shorthand = 1779 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT; 1780 bool broadcast = false; 1781 bool include_self = false; 1782 uint32_t i; 1783 1784 /* We only registered for INIT and SIPI exits. */ 1785 if ((delivery_mode != APIC_DM_INIT) && 1786 (delivery_mode != APIC_DM_SIPI)) { 1787 error_report( 1788 "WHPX: Unexpected APIC exit that is not a INIT or SIPI"); 1789 break; 1790 } 1791 1792 if (delivery_mode == APIC_DM_INIT) { 1793 ipi.Type = WHvX64InterruptTypeInit; 1794 } else { 1795 ipi.Type = WHvX64InterruptTypeSipi; 1796 } 1797 1798 ipi.DestinationMode = 1799 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ? 1800 WHvX64InterruptDestinationModeLogical : 1801 WHvX64InterruptDestinationModePhysical; 1802 1803 ipi.TriggerMode = 1804 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ? 1805 WHvX64InterruptTriggerModeLevel : 1806 WHvX64InterruptTriggerModeEdge; 1807 1808 ipi.Vector = icr & APIC_VECTOR_MASK; 1809 switch (dest_shorthand) { 1810 /* no shorthand. Bits 56-63 contain the destination. */ 1811 case 0: 1812 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK; 1813 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1814 &ipi, sizeof(ipi)); 1815 if (FAILED(hr)) { 1816 error_report("WHPX: Failed to request interrupt hr=%08lx", 1817 hr); 1818 } 1819 1820 break; 1821 1822 /* self */ 1823 case 1: 1824 include_self = true; 1825 break; 1826 1827 /* broadcast, including self */ 1828 case 2: 1829 broadcast = true; 1830 include_self = true; 1831 break; 1832 1833 /* broadcast, excluding self */ 1834 case 3: 1835 broadcast = true; 1836 break; 1837 } 1838 1839 if (!broadcast && !include_self) { 1840 break; 1841 } 1842 1843 for (i = 0; i <= max_vcpu_index; i++) { 1844 if (i == cpu->cpu_index && !include_self) { 1845 continue; 1846 } 1847 1848 /* 1849 * Assuming that APIC Ids are identity mapped since 1850 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers 1851 * are not handled yet and the hypervisor doesn't allow the 1852 * guest to modify the APIC ID. 1853 */ 1854 ipi.Destination = i; 1855 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1856 &ipi, sizeof(ipi)); 1857 if (FAILED(hr)) { 1858 error_report( 1859 "WHPX: Failed to request SIPI for %d, hr=%08lx", 1860 i, hr); 1861 } 1862 } 1863 1864 break; 1865 } 1866 1867 case WHvRunVpExitReasonCanceled: 1868 if (exclusive_step_mode != WHPX_STEP_NONE) { 1869 /* 1870 * We are trying to step over a single instruction, and 1871 * likely got a request to stop from another thread. 1872 * Delay it until we are done stepping 1873 * over. 1874 */ 1875 ret = 0; 1876 } else { 1877 cpu->exception_index = EXCP_INTERRUPT; 1878 ret = 1; 1879 } 1880 break; 1881 case WHvRunVpExitReasonX64MsrAccess: { 1882 WHV_REGISTER_VALUE reg_values[3] = {0}; 1883 WHV_REGISTER_NAME reg_names[3]; 1884 UINT32 reg_count; 1885 1886 reg_names[0] = WHvX64RegisterRip; 1887 reg_names[1] = WHvX64RegisterRax; 1888 reg_names[2] = WHvX64RegisterRdx; 1889 1890 reg_values[0].Reg64 = 1891 vcpu->exit_ctx.VpContext.Rip + 1892 vcpu->exit_ctx.VpContext.InstructionLength; 1893 1894 /* 1895 * For all unsupported MSR access we: 1896 * ignore writes 1897 * return 0 on read. 1898 */ 1899 reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ? 1900 1 : 3; 1901 1902 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1903 whpx->partition, 1904 cpu->cpu_index, 1905 reg_names, reg_count, 1906 reg_values); 1907 1908 if (FAILED(hr)) { 1909 error_report("WHPX: Failed to set MsrAccess state " 1910 " registers, hr=%08lx", hr); 1911 } 1912 ret = 0; 1913 break; 1914 } 1915 case WHvRunVpExitReasonX64Cpuid: { 1916 WHV_REGISTER_VALUE reg_values[5]; 1917 WHV_REGISTER_NAME reg_names[5]; 1918 UINT32 reg_count = 5; 1919 UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0; 1920 X86CPU *x86_cpu = X86_CPU(cpu); 1921 CPUX86State *env = &x86_cpu->env; 1922 1923 memset(reg_values, 0, sizeof(reg_values)); 1924 1925 rip = vcpu->exit_ctx.VpContext.Rip + 1926 vcpu->exit_ctx.VpContext.InstructionLength; 1927 cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax; 1928 1929 /* 1930 * Ideally, these should be supplied to the hypervisor during VCPU 1931 * initialization and it should be able to satisfy this request. 1932 * But, currently, WHPX doesn't support setting CPUID values in the 1933 * hypervisor once the partition has been setup, which is too late 1934 * since VCPUs are realized later. For now, use the values from 1935 * QEMU to satisfy these requests, until WHPX adds support for 1936 * being able to set these values in the hypervisor at runtime. 1937 */ 1938 cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx, 1939 (UINT32 *)&rcx, (UINT32 *)&rdx); 1940 switch (cpuid_fn) { 1941 case 0x40000000: 1942 /* Expose the vmware cpu frequency cpuid leaf */ 1943 rax = 0x40000010; 1944 rbx = rcx = rdx = 0; 1945 break; 1946 1947 case 0x40000010: 1948 rax = env->tsc_khz; 1949 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */ 1950 rcx = rdx = 0; 1951 break; 1952 1953 case 0x80000001: 1954 /* Remove any support of OSVW */ 1955 rcx &= ~CPUID_EXT3_OSVW; 1956 break; 1957 } 1958 1959 reg_names[0] = WHvX64RegisterRip; 1960 reg_names[1] = WHvX64RegisterRax; 1961 reg_names[2] = WHvX64RegisterRcx; 1962 reg_names[3] = WHvX64RegisterRdx; 1963 reg_names[4] = WHvX64RegisterRbx; 1964 1965 reg_values[0].Reg64 = rip; 1966 reg_values[1].Reg64 = rax; 1967 reg_values[2].Reg64 = rcx; 1968 reg_values[3].Reg64 = rdx; 1969 reg_values[4].Reg64 = rbx; 1970 1971 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1972 whpx->partition, cpu->cpu_index, 1973 reg_names, 1974 reg_count, 1975 reg_values); 1976 1977 if (FAILED(hr)) { 1978 error_report("WHPX: Failed to set CpuidAccess state registers," 1979 " hr=%08lx", hr); 1980 } 1981 ret = 0; 1982 break; 1983 } 1984 case WHvRunVpExitReasonException: 1985 whpx_get_registers(cpu); 1986 1987 if ((vcpu->exit_ctx.VpException.ExceptionType == 1988 WHvX64ExceptionTypeDebugTrapOrFault) && 1989 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) && 1990 (vcpu->exit_ctx.VpException.InstructionBytes[0] == 1991 whpx_breakpoint_instruction)) { 1992 /* Stopped at a software breakpoint. */ 1993 cpu->exception_index = EXCP_DEBUG; 1994 } else if ((vcpu->exit_ctx.VpException.ExceptionType == 1995 WHvX64ExceptionTypeDebugTrapOrFault) && 1996 !cpu->singlestep_enabled) { 1997 /* 1998 * Just finished stepping over a breakpoint, but the 1999 * gdb does not expect us to do single-stepping. 2000 * Don't do anything special. 2001 */ 2002 cpu->exception_index = EXCP_INTERRUPT; 2003 } else { 2004 /* Another exception or debug event. Report it to GDB. */ 2005 cpu->exception_index = EXCP_DEBUG; 2006 } 2007 2008 ret = 1; 2009 break; 2010 case WHvRunVpExitReasonNone: 2011 case WHvRunVpExitReasonUnrecoverableException: 2012 case WHvRunVpExitReasonInvalidVpRegisterValue: 2013 case WHvRunVpExitReasonUnsupportedFeature: 2014 default: 2015 error_report("WHPX: Unexpected VP exit code %d", 2016 vcpu->exit_ctx.ExitReason); 2017 whpx_get_registers(cpu); 2018 bql_lock(); 2019 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 2020 bql_unlock(); 2021 break; 2022 } 2023 2024 } while (!ret); 2025 2026 if (stepped_over_bp) { 2027 /* Restore the breakpoint we stepped over */ 2028 cpu_memory_rw_debug(cpu, 2029 stepped_over_bp->address, 2030 (void *)&whpx_breakpoint_instruction, 2031 1, 2032 true); 2033 } 2034 2035 if (exclusive_step_mode != WHPX_STEP_NONE) { 2036 g_assert(cpu_in_exclusive_context(cpu)); 2037 cpu->running = false; 2038 end_exclusive(); 2039 2040 exclusive_step_mode = WHPX_STEP_NONE; 2041 } else { 2042 cpu_exec_end(cpu); 2043 } 2044 2045 bql_lock(); 2046 current_cpu = cpu; 2047 2048 if (--whpx->running_cpus == 0) { 2049 whpx_last_vcpu_stopping(cpu); 2050 } 2051 2052 qatomic_set(&cpu->exit_request, false); 2053 2054 return ret < 0; 2055 } 2056 2057 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 2058 { 2059 if (!cpu->vcpu_dirty) { 2060 whpx_get_registers(cpu); 2061 cpu->vcpu_dirty = true; 2062 } 2063 } 2064 2065 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu, 2066 run_on_cpu_data arg) 2067 { 2068 whpx_set_registers(cpu, WHPX_SET_RESET_STATE); 2069 cpu->vcpu_dirty = false; 2070 } 2071 2072 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu, 2073 run_on_cpu_data arg) 2074 { 2075 whpx_set_registers(cpu, WHPX_SET_FULL_STATE); 2076 cpu->vcpu_dirty = false; 2077 } 2078 2079 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu, 2080 run_on_cpu_data arg) 2081 { 2082 cpu->vcpu_dirty = true; 2083 } 2084 2085 /* 2086 * CPU support. 2087 */ 2088 2089 void whpx_cpu_synchronize_state(CPUState *cpu) 2090 { 2091 if (!cpu->vcpu_dirty) { 2092 run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL); 2093 } 2094 } 2095 2096 void whpx_cpu_synchronize_post_reset(CPUState *cpu) 2097 { 2098 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2099 } 2100 2101 void whpx_cpu_synchronize_post_init(CPUState *cpu) 2102 { 2103 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2104 } 2105 2106 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu) 2107 { 2108 run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2109 } 2110 2111 static void whpx_pre_resume_vm(AccelState *as, bool step_pending) 2112 { 2113 whpx_global.step_pending = step_pending; 2114 } 2115 2116 /* 2117 * Vcpu support. 2118 */ 2119 2120 static Error *whpx_migration_blocker; 2121 2122 static void whpx_cpu_update_state(void *opaque, bool running, RunState state) 2123 { 2124 CPUX86State *env = opaque; 2125 2126 if (running) { 2127 env->tsc_valid = false; 2128 } 2129 } 2130 2131 int whpx_init_vcpu(CPUState *cpu) 2132 { 2133 HRESULT hr; 2134 struct whpx_state *whpx = &whpx_global; 2135 AccelCPUState *vcpu = NULL; 2136 Error *local_error = NULL; 2137 X86CPU *x86_cpu = X86_CPU(cpu); 2138 CPUX86State *env = &x86_cpu->env; 2139 UINT64 freq = 0; 2140 int ret; 2141 2142 /* Add migration blockers for all unsupported features of the 2143 * Windows Hypervisor Platform 2144 */ 2145 if (whpx_migration_blocker == NULL) { 2146 error_setg(&whpx_migration_blocker, 2147 "State blocked due to non-migratable CPUID feature support," 2148 "dirty memory tracking support, and XSAVE/XRSTOR support"); 2149 2150 if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) { 2151 error_report_err(local_error); 2152 ret = -EINVAL; 2153 goto error; 2154 } 2155 } 2156 2157 vcpu = g_new0(AccelCPUState, 1); 2158 2159 hr = whp_dispatch.WHvEmulatorCreateEmulator( 2160 &whpx_emu_callbacks, 2161 &vcpu->emulator); 2162 if (FAILED(hr)) { 2163 error_report("WHPX: Failed to setup instruction completion support," 2164 " hr=%08lx", hr); 2165 ret = -EINVAL; 2166 goto error; 2167 } 2168 2169 hr = whp_dispatch.WHvCreateVirtualProcessor( 2170 whpx->partition, cpu->cpu_index, 0); 2171 if (FAILED(hr)) { 2172 error_report("WHPX: Failed to create a virtual processor," 2173 " hr=%08lx", hr); 2174 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2175 ret = -EINVAL; 2176 goto error; 2177 } 2178 2179 /* 2180 * vcpu's TSC frequency is either specified by user, or use the value 2181 * provided by Hyper-V if the former is not present. In the latter case, we 2182 * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC 2183 * frequency can be migrated later via this field. 2184 */ 2185 if (!env->tsc_khz) { 2186 hr = whp_dispatch.WHvGetCapability( 2187 WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq), 2188 NULL); 2189 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2190 if (FAILED(hr)) { 2191 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr); 2192 } else { 2193 env->tsc_khz = freq / 1000; /* Hz to KHz */ 2194 } 2195 } 2196 } 2197 2198 env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY; 2199 hr = whp_dispatch.WHvGetCapability( 2200 WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL); 2201 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2202 if (FAILED(hr)) { 2203 printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr); 2204 } else { 2205 env->apic_bus_freq = freq; 2206 } 2207 } 2208 2209 /* 2210 * If the vmware cpuid frequency leaf option is set, and we have a valid 2211 * tsc value, trap the corresponding cpuid's. 2212 */ 2213 if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) { 2214 UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010}; 2215 2216 hr = whp_dispatch.WHvSetPartitionProperty( 2217 whpx->partition, 2218 WHvPartitionPropertyCodeCpuidExitList, 2219 cpuidExitList, 2220 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2221 2222 if (FAILED(hr)) { 2223 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2224 hr); 2225 ret = -EINVAL; 2226 goto error; 2227 } 2228 } 2229 2230 vcpu->interruptable = true; 2231 cpu->vcpu_dirty = true; 2232 cpu->accel = vcpu; 2233 max_vcpu_index = max(max_vcpu_index, cpu->cpu_index); 2234 qemu_add_vm_change_state_handler(whpx_cpu_update_state, env); 2235 2236 return 0; 2237 2238 error: 2239 g_free(vcpu); 2240 2241 return ret; 2242 } 2243 2244 int whpx_vcpu_exec(CPUState *cpu) 2245 { 2246 int ret; 2247 int fatal; 2248 2249 for (;;) { 2250 if (cpu->exception_index >= EXCP_INTERRUPT) { 2251 ret = cpu->exception_index; 2252 cpu->exception_index = -1; 2253 break; 2254 } 2255 2256 fatal = whpx_vcpu_run(cpu); 2257 2258 if (fatal) { 2259 error_report("WHPX: Failed to exec a virtual processor"); 2260 abort(); 2261 } 2262 } 2263 2264 return ret; 2265 } 2266 2267 void whpx_destroy_vcpu(CPUState *cpu) 2268 { 2269 struct whpx_state *whpx = &whpx_global; 2270 AccelCPUState *vcpu = cpu->accel; 2271 2272 whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index); 2273 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2274 g_free(cpu->accel); 2275 } 2276 2277 void whpx_vcpu_kick(CPUState *cpu) 2278 { 2279 struct whpx_state *whpx = &whpx_global; 2280 whp_dispatch.WHvCancelRunVirtualProcessor( 2281 whpx->partition, cpu->cpu_index, 0); 2282 } 2283 2284 /* 2285 * Memory support. 2286 */ 2287 2288 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size, 2289 void *host_va, int add, int rom, 2290 const char *name) 2291 { 2292 struct whpx_state *whpx = &whpx_global; 2293 HRESULT hr; 2294 2295 /* 2296 if (add) { 2297 printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n", 2298 (void*)start_pa, (void*)size, host_va, 2299 (rom ? "ROM" : "RAM"), name); 2300 } else { 2301 printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n", 2302 (void*)start_pa, (void*)size, host_va, name); 2303 } 2304 */ 2305 2306 if (add) { 2307 hr = whp_dispatch.WHvMapGpaRange(whpx->partition, 2308 host_va, 2309 start_pa, 2310 size, 2311 (WHvMapGpaRangeFlagRead | 2312 WHvMapGpaRangeFlagExecute | 2313 (rom ? 0 : WHvMapGpaRangeFlagWrite))); 2314 } else { 2315 hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition, 2316 start_pa, 2317 size); 2318 } 2319 2320 if (FAILED(hr)) { 2321 error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes," 2322 " Host:%p, hr=%08lx", 2323 (add ? "MAP" : "UNMAP"), name, 2324 (void *)(uintptr_t)start_pa, (void *)size, host_va, hr); 2325 } 2326 } 2327 2328 static void whpx_process_section(MemoryRegionSection *section, int add) 2329 { 2330 MemoryRegion *mr = section->mr; 2331 hwaddr start_pa = section->offset_within_address_space; 2332 ram_addr_t size = int128_get64(section->size); 2333 unsigned int delta; 2334 uint64_t host_va; 2335 2336 if (!memory_region_is_ram(mr)) { 2337 return; 2338 } 2339 2340 delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask()); 2341 delta &= ~qemu_real_host_page_mask(); 2342 if (delta > size) { 2343 return; 2344 } 2345 start_pa += delta; 2346 size -= delta; 2347 size &= qemu_real_host_page_mask(); 2348 if (!size || (start_pa & ~qemu_real_host_page_mask())) { 2349 return; 2350 } 2351 2352 host_va = (uintptr_t)memory_region_get_ram_ptr(mr) 2353 + section->offset_within_region + delta; 2354 2355 whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add, 2356 memory_region_is_rom(mr), mr->name); 2357 } 2358 2359 static void whpx_region_add(MemoryListener *listener, 2360 MemoryRegionSection *section) 2361 { 2362 memory_region_ref(section->mr); 2363 whpx_process_section(section, 1); 2364 } 2365 2366 static void whpx_region_del(MemoryListener *listener, 2367 MemoryRegionSection *section) 2368 { 2369 whpx_process_section(section, 0); 2370 memory_region_unref(section->mr); 2371 } 2372 2373 static void whpx_transaction_begin(MemoryListener *listener) 2374 { 2375 } 2376 2377 static void whpx_transaction_commit(MemoryListener *listener) 2378 { 2379 } 2380 2381 static void whpx_log_sync(MemoryListener *listener, 2382 MemoryRegionSection *section) 2383 { 2384 MemoryRegion *mr = section->mr; 2385 2386 if (!memory_region_is_ram(mr)) { 2387 return; 2388 } 2389 2390 memory_region_set_dirty(mr, 0, int128_get64(section->size)); 2391 } 2392 2393 static MemoryListener whpx_memory_listener = { 2394 .name = "whpx", 2395 .begin = whpx_transaction_begin, 2396 .commit = whpx_transaction_commit, 2397 .region_add = whpx_region_add, 2398 .region_del = whpx_region_del, 2399 .log_sync = whpx_log_sync, 2400 .priority = MEMORY_LISTENER_PRIORITY_ACCEL, 2401 }; 2402 2403 static void whpx_memory_init(void) 2404 { 2405 memory_listener_register(&whpx_memory_listener, &address_space_memory); 2406 } 2407 2408 /* 2409 * Load the functions from the given library, using the given handle. If a 2410 * handle is provided, it is used, otherwise the library is opened. The 2411 * handle will be updated on return with the opened one. 2412 */ 2413 static bool load_whp_dispatch_fns(HMODULE *handle, 2414 WHPFunctionList function_list) 2415 { 2416 HMODULE hLib = *handle; 2417 2418 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll" 2419 #define WINHV_EMULATION_DLL "WinHvEmulation.dll" 2420 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \ 2421 whp_dispatch.function_name = \ 2422 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2423 2424 #define WHP_LOAD_FIELD(return_type, function_name, signature) \ 2425 whp_dispatch.function_name = \ 2426 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2427 if (!whp_dispatch.function_name) { \ 2428 error_report("Could not load function %s", #function_name); \ 2429 goto error; \ 2430 } \ 2431 2432 #define WHP_LOAD_LIB(lib_name, handle_lib) \ 2433 if (!handle_lib) { \ 2434 handle_lib = LoadLibrary(lib_name); \ 2435 if (!handle_lib) { \ 2436 error_report("Could not load library %s.", lib_name); \ 2437 goto error; \ 2438 } \ 2439 } \ 2440 2441 switch (function_list) { 2442 case WINHV_PLATFORM_FNS_DEFAULT: 2443 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2444 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD) 2445 break; 2446 2447 case WINHV_EMULATION_FNS_DEFAULT: 2448 WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib) 2449 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD) 2450 break; 2451 2452 case WINHV_PLATFORM_FNS_SUPPLEMENTAL: 2453 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2454 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL) 2455 break; 2456 } 2457 2458 *handle = hLib; 2459 return true; 2460 2461 error: 2462 if (hLib) { 2463 FreeLibrary(hLib); 2464 } 2465 2466 return false; 2467 } 2468 2469 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v, 2470 const char *name, void *opaque, 2471 Error **errp) 2472 { 2473 struct whpx_state *whpx = &whpx_global; 2474 OnOffSplit mode; 2475 2476 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 2477 return; 2478 } 2479 2480 switch (mode) { 2481 case ON_OFF_SPLIT_ON: 2482 whpx->kernel_irqchip_allowed = true; 2483 whpx->kernel_irqchip_required = true; 2484 break; 2485 2486 case ON_OFF_SPLIT_OFF: 2487 whpx->kernel_irqchip_allowed = false; 2488 whpx->kernel_irqchip_required = false; 2489 break; 2490 2491 case ON_OFF_SPLIT_SPLIT: 2492 error_setg(errp, "WHPX: split irqchip currently not supported"); 2493 error_append_hint(errp, 2494 "Try without kernel-irqchip or with kernel-irqchip=on|off"); 2495 break; 2496 2497 default: 2498 /* 2499 * The value was checked in visit_type_OnOffSplit() above. If 2500 * we get here, then something is wrong in QEMU. 2501 */ 2502 abort(); 2503 } 2504 } 2505 2506 static void whpx_cpu_instance_init(CPUState *cs) 2507 { 2508 X86CPU *cpu = X86_CPU(cs); 2509 2510 host_cpu_instance_init(cpu); 2511 } 2512 2513 static void whpx_cpu_accel_class_init(ObjectClass *oc, const void *data) 2514 { 2515 AccelCPUClass *acc = ACCEL_CPU_CLASS(oc); 2516 2517 acc->cpu_instance_init = whpx_cpu_instance_init; 2518 } 2519 2520 static const TypeInfo whpx_cpu_accel_type = { 2521 .name = ACCEL_CPU_NAME("whpx"), 2522 2523 .parent = TYPE_ACCEL_CPU, 2524 .class_init = whpx_cpu_accel_class_init, 2525 .abstract = true, 2526 }; 2527 2528 /* 2529 * Partition support 2530 */ 2531 2532 static int whpx_accel_init(AccelState *as, MachineState *ms) 2533 { 2534 struct whpx_state *whpx; 2535 int ret; 2536 HRESULT hr; 2537 WHV_CAPABILITY whpx_cap; 2538 UINT32 whpx_cap_size; 2539 WHV_PARTITION_PROPERTY prop; 2540 UINT32 cpuidExitList[] = {1, 0x80000001}; 2541 WHV_CAPABILITY_FEATURES features = {0}; 2542 2543 whpx = &whpx_global; 2544 2545 if (!init_whp_dispatch()) { 2546 ret = -ENOSYS; 2547 goto error; 2548 } 2549 2550 whpx->mem_quota = ms->ram_size; 2551 2552 hr = whp_dispatch.WHvGetCapability( 2553 WHvCapabilityCodeHypervisorPresent, &whpx_cap, 2554 sizeof(whpx_cap), &whpx_cap_size); 2555 if (FAILED(hr) || !whpx_cap.HypervisorPresent) { 2556 error_report("WHPX: No accelerator found, hr=%08lx", hr); 2557 ret = -ENOSPC; 2558 goto error; 2559 } 2560 2561 hr = whp_dispatch.WHvGetCapability( 2562 WHvCapabilityCodeFeatures, &features, sizeof(features), NULL); 2563 if (FAILED(hr)) { 2564 error_report("WHPX: Failed to query capabilities, hr=%08lx", hr); 2565 ret = -EINVAL; 2566 goto error; 2567 } 2568 2569 hr = whp_dispatch.WHvCreatePartition(&whpx->partition); 2570 if (FAILED(hr)) { 2571 error_report("WHPX: Failed to create partition, hr=%08lx", hr); 2572 ret = -EINVAL; 2573 goto error; 2574 } 2575 2576 /* 2577 * Query the XSAVE capability of the partition. Any error here is not 2578 * considered fatal. 2579 */ 2580 hr = whp_dispatch.WHvGetPartitionProperty( 2581 whpx->partition, 2582 WHvPartitionPropertyCodeProcessorXsaveFeatures, 2583 &whpx_xsave_cap, 2584 sizeof(whpx_xsave_cap), 2585 &whpx_cap_size); 2586 2587 /* 2588 * Windows version which don't support this property will return with the 2589 * specific error code. 2590 */ 2591 if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) { 2592 error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr); 2593 } 2594 2595 if (!whpx_has_xsave()) { 2596 printf("WHPX: Partition is not XSAVE capable\n"); 2597 } 2598 2599 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2600 prop.ProcessorCount = ms->smp.cpus; 2601 hr = whp_dispatch.WHvSetPartitionProperty( 2602 whpx->partition, 2603 WHvPartitionPropertyCodeProcessorCount, 2604 &prop, 2605 sizeof(WHV_PARTITION_PROPERTY)); 2606 2607 if (FAILED(hr)) { 2608 error_report("WHPX: Failed to set partition processor count to %u," 2609 " hr=%08lx", prop.ProcessorCount, hr); 2610 ret = -EINVAL; 2611 goto error; 2612 } 2613 2614 /* 2615 * Error out if WHP doesn't support apic emulation and user is requiring 2616 * it. 2617 */ 2618 if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation || 2619 !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) { 2620 error_report("WHPX: kernel irqchip requested, but unavailable. " 2621 "Try without kernel-irqchip or with kernel-irqchip=off"); 2622 ret = -EINVAL; 2623 goto error; 2624 } 2625 2626 if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation && 2627 whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) { 2628 WHV_X64_LOCAL_APIC_EMULATION_MODE mode = 2629 WHvX64LocalApicEmulationModeXApic; 2630 printf("WHPX: setting APIC emulation mode in the hypervisor\n"); 2631 hr = whp_dispatch.WHvSetPartitionProperty( 2632 whpx->partition, 2633 WHvPartitionPropertyCodeLocalApicEmulationMode, 2634 &mode, 2635 sizeof(mode)); 2636 if (FAILED(hr)) { 2637 error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr); 2638 if (whpx->kernel_irqchip_required) { 2639 error_report("WHPX: kernel irqchip requested, but unavailable"); 2640 ret = -EINVAL; 2641 goto error; 2642 } 2643 } else { 2644 whpx->apic_in_platform = true; 2645 } 2646 } 2647 2648 /* Register for MSR and CPUID exits */ 2649 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2650 prop.ExtendedVmExits.X64MsrExit = 1; 2651 prop.ExtendedVmExits.X64CpuidExit = 1; 2652 prop.ExtendedVmExits.ExceptionExit = 1; 2653 if (whpx_apic_in_platform()) { 2654 prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1; 2655 } 2656 2657 hr = whp_dispatch.WHvSetPartitionProperty( 2658 whpx->partition, 2659 WHvPartitionPropertyCodeExtendedVmExits, 2660 &prop, 2661 sizeof(WHV_PARTITION_PROPERTY)); 2662 if (FAILED(hr)) { 2663 error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr); 2664 ret = -EINVAL; 2665 goto error; 2666 } 2667 2668 hr = whp_dispatch.WHvSetPartitionProperty( 2669 whpx->partition, 2670 WHvPartitionPropertyCodeCpuidExitList, 2671 cpuidExitList, 2672 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2673 2674 if (FAILED(hr)) { 2675 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2676 hr); 2677 ret = -EINVAL; 2678 goto error; 2679 } 2680 2681 /* 2682 * We do not want to intercept any exceptions from the guest, 2683 * until we actually start debugging with gdb. 2684 */ 2685 whpx->exception_exit_bitmap = -1; 2686 hr = whpx_set_exception_exit_bitmap(0); 2687 2688 if (FAILED(hr)) { 2689 error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr); 2690 ret = -EINVAL; 2691 goto error; 2692 } 2693 2694 hr = whp_dispatch.WHvSetupPartition(whpx->partition); 2695 if (FAILED(hr)) { 2696 error_report("WHPX: Failed to setup partition, hr=%08lx", hr); 2697 ret = -EINVAL; 2698 goto error; 2699 } 2700 2701 whpx_memory_init(); 2702 2703 printf("Windows Hypervisor Platform accelerator is operational\n"); 2704 return 0; 2705 2706 error: 2707 2708 if (NULL != whpx->partition) { 2709 whp_dispatch.WHvDeletePartition(whpx->partition); 2710 whpx->partition = NULL; 2711 } 2712 2713 return ret; 2714 } 2715 2716 bool whpx_apic_in_platform(void) { 2717 return whpx_global.apic_in_platform; 2718 } 2719 2720 static void whpx_accel_class_init(ObjectClass *oc, const void *data) 2721 { 2722 AccelClass *ac = ACCEL_CLASS(oc); 2723 ac->name = "WHPX"; 2724 ac->init_machine = whpx_accel_init; 2725 ac->pre_resume_vm = whpx_pre_resume_vm; 2726 ac->allowed = &whpx_allowed; 2727 2728 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 2729 NULL, whpx_set_kernel_irqchip, 2730 NULL, NULL); 2731 object_class_property_set_description(oc, "kernel-irqchip", 2732 "Configure WHPX in-kernel irqchip"); 2733 } 2734 2735 static void whpx_accel_instance_init(Object *obj) 2736 { 2737 struct whpx_state *whpx = &whpx_global; 2738 2739 memset(whpx, 0, sizeof(struct whpx_state)); 2740 /* Turn on kernel-irqchip, by default */ 2741 whpx->kernel_irqchip_allowed = true; 2742 } 2743 2744 static const TypeInfo whpx_accel_type = { 2745 .name = ACCEL_CLASS_NAME("whpx"), 2746 .parent = TYPE_ACCEL, 2747 .instance_init = whpx_accel_instance_init, 2748 .class_init = whpx_accel_class_init, 2749 }; 2750 2751 static void whpx_type_init(void) 2752 { 2753 type_register_static(&whpx_accel_type); 2754 type_register_static(&whpx_cpu_accel_type); 2755 } 2756 2757 bool init_whp_dispatch(void) 2758 { 2759 if (whp_dispatch_initialized) { 2760 return true; 2761 } 2762 2763 if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) { 2764 goto error; 2765 } 2766 2767 if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) { 2768 goto error; 2769 } 2770 2771 assert(load_whp_dispatch_fns(&hWinHvPlatform, 2772 WINHV_PLATFORM_FNS_SUPPLEMENTAL)); 2773 whp_dispatch_initialized = true; 2774 2775 return true; 2776 error: 2777 if (hWinHvPlatform) { 2778 FreeLibrary(hWinHvPlatform); 2779 } 2780 2781 if (hWinHvEmulation) { 2782 FreeLibrary(hWinHvEmulation); 2783 } 2784 2785 return false; 2786 } 2787 2788 type_init(whpx_type_init); 2789