/*
 * QEMU Windows Hypervisor Platform accelerator (WHPX)
 *
 * Copyright Microsoft Corp. 2017
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "system/address-spaces.h"
#include "system/ioport.h"
#include "gdbstub/helpers.h"
#include "qemu/accel.h"
#include "system/whpx.h"
#include "system/cpus.h"
#include "system/runstate.h"
#include "qemu/main-loop.h"
#include "hw/boards.h"
#include "hw/intc/ioapic.h"
#include "hw/i386/apic_internal.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "migration/blocker.h"
#include "host-cpu.h"
#include "accel/accel-cpu-target.h"
#include <winerror.h>

#include "whpx-internal.h"
#include "whpx-accel-ops.h"

#include <winhvplatform.h>
#include <winhvemulation.h>

#define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)

static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};

struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};

/*
 * The current implementation of instruction stepping sets the TF flag
 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
 *
 * This approach has a few limitations:
 *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
 *        along with the other flags, possibly restoring it later. This would
 *        result in another INT1 when the flags are restored, triggering
 *        a stop in gdb that could be cleared by doing another step.
 *
 *        Stepping over a POPF/LAHF instruction will let it overwrite the
 *        TF flag, ending the stepping mode.
 *
 *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
 *        or anything that could result in a page fault) will save the flags
 *        to the stack, clear the TF flag, and let the guest execute the
 *        handler. Normally, the guest will restore the original flags,
 *        which will resume single-stepping.
 *
 *     3. Debuggers running on the guest may wish to set TF to do instruction
 *        stepping. INT1 events generated by them would be intercepted by us,
 *        as long as gdb is connected to QEMU.
 *
 * In practice this means that:
 *     1. Stepping through flags-modifying instructions may cause gdb to
 *        continue or stop in unexpected places. This will be fully
 *        recoverable and will not crash the target.
 *
 *     2. Stepping over an instruction that triggers an exception will step
 *        over the exception handler, not into it.
 *
 *     3. Debugging the guest via gdb, while running a debugger on the guest
 *        at the same time, may lead to unexpected effects. Removing all
 *        breakpoints set via QEMU will prevent any further interference
 *        with the guest-level debuggers.
 *
 * The limitations can be addressed as shown below:
 *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
 *        stepping through them. The exact semantics of the instructions are
 *        defined in the "Combined Volume Set of Intel 64 and IA-32
 *        Architectures Software Developer's Manuals"; however, it involves a
 *        fair amount of corner cases due to compatibility with real mode,
 *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
 *
 *     2. We could step into the guest's exception handlers using the
 *        following sequence:
 *          a. Temporarily enable catching of all exception types via
 *             whpx_set_exception_exit_bitmap().
 *          b. Once an exception is intercepted, read the IDT/GDT and locate
 *             the original handler.
 *          c. Patch the original handler, injecting an INT3 at the beginning.
 *          d. Update the exception exit bitmap to only catch the
 *             WHvX64ExceptionTypeBreakpointTrap exception.
 *          e. Let the affected CPU run in the exclusive mode.
 *          f. Restore the original handler and the exception exit bitmap.
 *          Note that handling all corner cases related to IDT/GDT is harder
 *          than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
 *          rough idea.
 *
 *     3. In order to properly support guest-level debugging in parallel with
 *        the QEMU-level debugging, we would need to be able to pass some INT1
 *        events to the guest. This could be done via the following methods:
 *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
 *             it seems to only work for interrupts and not software
 *             exceptions.
 *          b. Locating and patching the original handler by parsing IDT/GDT.
 *             This involves relatively complex logic outlined in the previous
 *             paragraph.
 *          c. Emulating the exception invocation (i.e. manually updating RIP,
 *             RFLAGS, and pushing the old values to stack). This is even more
 *             complicated than the previous option, since it involves
 *             checking CPL, gate attributes, and doing various adjustments
 *             depending on the current CPU mode, whether the CPL is changing,
 *             etc.
 */
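/*
 * Sketch of the TF-based arming described above; the real logic lives in
 * whpx_vcpu_configure_single_stepping() further below:
 *
 *     reg_value.Reg64 |= TF_MASK;    arm: trap after the next instruction
 *     ...WHvRunVirtualProcessor()...
 *     reg_value.Reg64 &= ~TF_MASK;   disarm once the INT1 was intercepted
 */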
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;

struct AccelCPUState {
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    uint64_t tpr;
    uint64_t apic_base;
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};

bool whpx_allowed;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;

static bool whpx_has_xsave(void)
{
    return whpx_xsave_cap.XsaveSupport;
}

static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER hs;
    unsigned flags = qs->flags;

    hs.Base = qs->base;
    hs.Limit = qs->limit;
    hs.Selector = qs->selector;

    if (v86) {
        hs.Attributes = 0;
        hs.SegmentType = 3;
        hs.Present = 1;
        hs.DescriptorPrivilegeLevel = 3;
        hs.NonSystemSegment = 1;

    } else {
        hs.Attributes = (flags >> DESC_TYPE_SHIFT);

        if (r86) {
            /* hs.Base &= 0xfffff; */
        }
    }

    return hs;
}

static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
{
    SegmentCache qs;

    qs.base = hs->Base;
    qs.limit = hs->Limit;
    qs.selector = hs->Selector;

    qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;

    return qs;
}
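/*
 * Illustration (not exhaustive): outside v86 mode, whpx_seg_q2h() and
 * whpx_seg_h2q() only shift the descriptor flags, so the conversion
 * round-trips. For a typical flat 64-bit code segment, QEMU flags of
 * 0x00a09b00 (P=1, S=1, type=0xb, L=1, G=1) become hs.Attributes == 0xa09b
 * with DESC_TYPE_SHIFT == 8, and whpx_seg_h2q() restores the original value.
 */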
/* X64 Extended Control Registers */
static void whpx_set_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    xcr0.Reg64 = cpu_env(cpu)->xcr0;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
    }
}

static int whpx_set_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {

        /*
         * Failing to suspend the partition while setting the TSC is not a
         * fatal error. It just increases the likelihood of TSC variance
         * between vCPUs, and some guest OSes can handle that just fine.
         */
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    tsc_val.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
        return -1;
    }

    return 0;
}

/*
 * The CR8 register in the CPU is mapped to the TPR register of the APIC,
 * however, they use a slightly different encoding. Specifically:
 *
 *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
 *
 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
 * and IA-32 Architectures Software Developer's Manual.
 *
 * The functions below translate the value of CR8 to TPR and vice versa.
 */

static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr >> 4;
}

static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 << 4;
}
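/*
 * Worked example of the mapping above: an APIC TPR of 0xb0 (priority class
 * 0xb, sub-class 0x0) corresponds to CR8 == 0xb; translating back yields TPR
 * 0xb0 again. The sub-class bits (TPR[3:0]) are not representable in CR8 and
 * read back as zero.
 */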
static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * The following MSRs have side effects on the guest or are too expensive
     * to set at runtime. Limit them to full state updates.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers need to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}
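/*
 * The getter path below mirrors the setter above: whpx_get_registers() walks
 * the same whpx_register_names table with the same index arithmetic, and the
 * assert()s on both sides keep the table and the walks in sync.
 */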
static int whpx_get_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
        return -1;
    }

    cpu_env(cpu)->tsc = tsc_val.Reg64;
    return 0;
}

/* X64 Extended Control Registers */
static void whpx_get_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
        return;
    }

    cpu_env(cpu)->xcr0 = xcr0.Reg64;
}
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers need to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);
}

static HRESULT CALLBACK whpx_emu_ioport_callback(
    void *ctx,
    WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
    MemTxAttrs attrs = { 0 };
    address_space_rw(&address_space_io, IoAccess->Port, attrs,
                     &IoAccess->Data, IoAccess->AccessSize,
                     IoAccess->Direction);
    return S_OK;
}

static HRESULT CALLBACK whpx_emu_mmio_callback(
    void *ctx,
    WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
    cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
                           ma->Direction);
    return S_OK;
}

static HRESULT CALLBACK whpx_emu_getreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor registers,"
                     " hr=%08lx", hr);
    }

    return hr;
}

static HRESULT CALLBACK whpx_emu_setreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    const WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor registers,"
                     " hr=%08lx", hr);
    }

    /*
     * The emulator has just successfully written the register state. Clear
     * the dirty state to avoid a double write when the VP resumes.
     */
    cpu->vcpu_dirty = false;

    return hr;
}

static HRESULT CALLBACK whpx_emu_translate_callback(
    void *ctx,
    WHV_GUEST_VIRTUAL_ADDRESS Gva,
    WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
    WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
    WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;
    WHV_TRANSLATE_GVA_RESULT res;

    hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
                                      Gva, TranslateFlags, &res, Gpa);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
    } else {
        *TranslationResult = res.ResultCode;
    }

    return hr;
}

static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};

static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;

    hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate MMIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}

static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;

    hr = whp_dispatch.WHvEmulatorTryIoEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate PortIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}
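/*
 * Note on the flow above: MMIO and port I/O exits are not decoded by QEMU
 * itself. They are forwarded to the WinHvEmulation instruction emulator,
 * which in turn re-enters QEMU through the I/O, register-access and
 * GVA-translation callbacks registered in whpx_emu_callbacks.
 */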
/*
 * Controls whether we should intercept various exceptions on the guest,
 * namely breakpoint/single-step events.
 *
 * The 'exceptions' argument accepts a bitmask, e.g.:
 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
 */
static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_PARTITION_PROPERTY prop = { 0, };
    HRESULT hr;

    if (exceptions == whpx->exception_exit_bitmap) {
        return S_OK;
    }

    prop.ExceptionExitBitmap = exceptions;

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExceptionExitBitmap,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (SUCCEEDED(hr)) {
        whpx->exception_exit_bitmap = exceptions;
    }

    return hr;
}


/*
 * This function is called before/after stepping over a single instruction.
 * It will update the CPU registers to arm/disarm the instruction stepping
 * accordingly.
 */
static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
    bool set,
    uint64_t *exit_context_rflags)
{
    WHV_REGISTER_NAME reg_name;
    WHV_REGISTER_VALUE reg_value;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * If we are trying to step over a single instruction, we need to set the
     * TF bit in rflags. Otherwise, clear it.
     */
    reg_name = WHvX64RegisterRflags;
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
        return hr;
    }

    if (exit_context_rflags) {
        assert(*exit_context_rflags == reg_value.Reg64);
    }

    if (set) {
        /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
        reg_value.Reg64 |= TF_MASK;
    } else {
        reg_value.Reg64 &= ~TF_MASK;
    }

    if (exit_context_rflags) {
        *exit_context_rflags = reg_value.Reg64;
    }

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set rflags,"
                     " hr=%08lx",
                     hr);
        return hr;
    }

    reg_name = WHvRegisterInterruptState;
    reg_value.Reg64 = 0;

    /* Suspend delivery of hardware interrupts during single-stepping. */
    reg_value.InterruptState.InterruptShadow = set != 0;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set InterruptState,"
                     " hr=%08lx",
                     hr);
        return hr;
    }

    if (!set) {
        /*
         * We have just finished stepping over a single instruction,
         * and intercepted the INT1 generated by it.
         * We need to now hide the INT1 from the guest,
         * as it would not be expecting it.
         */
        reg_name = WHvX64RegisterPendingDebugException;
        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get pending debug exceptions,"
                         " hr=%08lx", hr);
            return hr;
        }

        if (reg_value.PendingDebugException.SingleStep) {
            reg_value.PendingDebugException.SingleStep = 0;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                &reg_name,
                1,
                &reg_value);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to clear pending debug exceptions,"
                             " hr=%08lx", hr);
                return hr;
            }
        }

    }

    return S_OK;
}

/* Tries to find a breakpoint at the specified address. */
static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
{
    struct whpx_state *whpx = &whpx_global;
    int i;

    if (whpx->breakpoints.breakpoints) {
        for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
            if (address == whpx->breakpoints.breakpoints->data[i].address) {
                return &whpx->breakpoints.breakpoints->data[i];
            }
        }
    }

    return NULL;
}

/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;

/*
 * The WHPX QEMU backend implements breakpoints by writing the INT1
 * instruction into memory (ignoring the DRx registers). This raises a few
 * issues that need to be carefully handled:
 *
 * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
 *    at the same location, and later remove them in arbitrary order.
 *    This should not cause memory corruption, and should only remove the
 *    physical breakpoint instruction when the last QEMU breakpoint is gone.
 *
 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
 *    physical location. Hence, physically adding/removing a breakpoint can
 *    theoretically fail at any time. We need to keep track of it.
 *
 * The function below rebuilds a list of low-level breakpoints (one per
 * address, tracking the original instruction and any errors) from the list of
 * high-level breakpoints (set via cpu_breakpoint_insert()).
 *
 * In order to optimize performance, this function stores the list of
 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
 * low-level ones, so that it won't be re-invoked until these breakpoints
 * change.
 *
 * Note that this function decides which breakpoints should be inserted into
 * memory, but doesn't actually do it. The actual memory writes are done in
 * whpx_apply_breakpoints().
 */
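/*
 * Meaning of the low-level breakpoint states used below (a sketch based on
 * how whpx_apply_breakpoints() consumes them):
 *     WHPX_BP_CLEARED       - not present in guest memory
 *     WHPX_BP_SET_PENDING   - should be written on the next resume
 *     WHPX_BP_SET           - the 0xF1 byte is present in guest memory
 *     WHPX_BP_CLEAR_PENDING - the original byte should be restored on stop
 */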
static void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *bp;
    int cpu_bp_index = 0;

    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);

    breakpoints->original_address_count = cpu_breakpoint_count;

    int max_breakpoints = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    struct whpx_breakpoint_collection *new_breakpoints =
        g_malloc0(sizeof(struct whpx_breakpoint_collection)
                  + max_breakpoints * sizeof(struct whpx_breakpoint));

    new_breakpoints->allocated = max_breakpoints;
    new_breakpoints->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        int i;
        for (i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                new_breakpoints->data[new_breakpoints->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        int i;
        bool found = false;

        /* This will be used to detect changed CPU breakpoints later. */
        breakpoints->original_addresses[cpu_bp_index++] = bp->pc;

        for (i = 0; i < new_breakpoints->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */

            if (new_breakpoints->data[i].address == bp->pc) {
                /* There was already a breakpoint at this address. */
                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    new_breakpoints->data[i].state = WHPX_BP_SET;
                } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
                    new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
                }

                found = true;
                break;
            }
        }

        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
            new_breakpoints->data[new_breakpoints->used].state =
                WHPX_BP_SET_PENDING;
            new_breakpoints->used++;
        }
    }

    /*
     * Free the previous breakpoint list. This can be optimized by keeping
     * it as a shadow buffer for the next computation instead of freeing
     * it immediately.
     */
    g_free(breakpoints->breakpoints);

    breakpoints->breakpoints = new_breakpoints;
}

/*
 * Physically inserts/removes the breakpoints by reading and writing the
 * guest memory, keeping track of failed attempts.
 *
 * Passing resuming=true will try to set all previously unset breakpoints.
 * Passing resuming=false will remove all inserted ones.
 */
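/*
 * State transitions performed by the function below, roughly:
 *
 *     resuming=true:   CLEARED -> SET_PENDING -> (write 0xF1)   -> SET
 *                      CLEAR_PENDING          -> (nothing to do) -> SET
 *     resuming=false:  SET -> CLEAR_PENDING   -> (restore byte) -> CLEARED
 *                      SET_PENDING            -> (nothing to do) -> CLEARED
 *
 * A failed cpu_memory_rw_debug() leaves the entry in its pending state, so
 * the attempt is retried on the next resume/stop.
 */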
static void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int i, rc;
    if (!breakpoints) {
        return;
    }

    for (i = 0; i < breakpoints->used; i++) {
        /* Decide what to do right now based on the last known state. */
        WhpxBreakpointState state = breakpoints->data[i].state;
        switch (state) {
        case WHPX_BP_CLEARED:
            if (resuming) {
                state = WHPX_BP_SET_PENDING;
            }
            break;
        case WHPX_BP_SET_PENDING:
            if (!resuming) {
                state = WHPX_BP_CLEARED;
            }
            break;
        case WHPX_BP_SET:
            if (!resuming) {
                state = WHPX_BP_CLEAR_PENDING;
            }
            break;
        case WHPX_BP_CLEAR_PENDING:
            if (resuming) {
                state = WHPX_BP_SET;
            }
            break;
        }

        if (state == WHPX_BP_SET_PENDING) {
            /* Remember the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                false);

            if (!rc) {
                /* Write the breakpoint instruction. */
                rc = cpu_memory_rw_debug(cpu,
                    breakpoints->data[i].address,
                    (void *)&whpx_breakpoint_instruction,
                    1,
                    true);
            }

            if (!rc) {
                state = WHPX_BP_SET;
            }

        }

        if (state == WHPX_BP_CLEAR_PENDING) {
            /* Restore the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                true);

            if (!rc) {
                state = WHPX_BP_CLEARED;
            }
        }

        breakpoints->data[i].state = state;
    }
}

/*
 * This function is called when a VCPU is about to start and no other
 * VCPUs have been started so far. Since the VCPU start order could be
 * arbitrary, it doesn't have to be VCPU#0.
 *
 * It is used to commit the breakpoints into memory, and configure WHPX
 * to intercept debug exceptions.
 *
 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
 * more VCPUs are already running, so this is the best place to do it.
 */
static int whpx_first_vcpu_starting(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    g_assert(bql_locked());

    if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
            (whpx->breakpoints.breakpoints &&
             whpx->breakpoints.breakpoints->used)) {
        CPUBreakpoint *bp;
        int i = 0;
        bool update_pending = false;

        QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
            if (i >= whpx->breakpoints.original_address_count ||
                bp->pc != whpx->breakpoints.original_addresses[i]) {
                update_pending = true;
            }

            i++;
        }

        if (i != whpx->breakpoints.original_address_count) {
            update_pending = true;
        }

        if (update_pending) {
            /*
             * The CPU breakpoints have changed since the last call to
             * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
             * now be recomputed.
             */
            whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
        }

        /* Actually insert the breakpoints into the memory. */
        whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
    }

    uint64_t exception_mask;
    if (whpx->step_pending ||
        (whpx->breakpoints.breakpoints &&
         whpx->breakpoints.breakpoints->used)) {
        /*
         * We are either attempting to single-step one or more CPUs, or
         * have one or more breakpoints enabled. Both require intercepting
         * the WHvX64ExceptionTypeDebugTrapOrFault exception.
         */

        exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
    } else {
        /* Let the guest handle all exceptions. */
        exception_mask = 0;
    }

    hr = whpx_set_exception_exit_bitmap(exception_mask);
    if (!SUCCEEDED(hr)) {
        error_report("WHPX: Failed to update exception exit mask,"
                     " hr=%08lx.", hr);
        return 1;
    }

    return 0;
}

/*
 * This function is called when the last VCPU has finished running.
 * It is used to remove any previously set breakpoints from memory.
 */
static int whpx_last_vcpu_stopping(CPUState *cpu)
{
    whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
    return 0;
}

/* Returns the address of the next instruction that is about to be executed. */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
    if (cpu->vcpu_dirty) {
        /* The CPU registers have been modified by other parts of QEMU. */
        return cpu_env(cpu)->eip;
    } else if (exit_context_valid) {
        /*
         * The CPU registers have been modified neither by other parts of
         * QEMU, nor by this port via WHvSetVirtualProcessorRegisters().
         * This is the most common case.
         */
        AccelCPUState *vcpu = cpu->accel;
        return vcpu->exit_ctx.VpContext.Rip;
    } else {
        /*
         * The CPU registers have been modified by a call to
         * WHvSetVirtualProcessorRegisters() and must be re-queried from
         * the target.
         */
        WHV_REGISTER_VALUE reg_value;
        WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
        HRESULT hr;
        struct whpx_state *whpx = &whpx_global;

        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get PC, hr=%08lx", hr);
            return 0;
        }

        return reg_value.Reg64;
    }
}

static int whpx_handle_halt(CPUState *cpu)
{
    int ret = 0;

    bql_lock();
    if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
          (cpu_env(cpu)->eflags & IF_MASK)) &&
        !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }
    bql_unlock();

    return ret;
}

static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    bql_lock();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Set up the interrupt state if a new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
    }

    /* Sync the TPR to CR8 if it was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }
}

static void whpx_vcpu_post_run(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
    if (vcpu->tpr != tpr) {
        vcpu->tpr = tpr;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;

    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}
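/*
 * In short, whpx_vcpu_pre_run() and whpx_vcpu_post_run() above bracket every
 * WHvRunVirtualProcessor() call: the former injects pending NMI/PIC
 * interrupts and interrupt-window requests, the latter mirrors RFLAGS, CR8
 * and the interruption/shadow state from the exit context back into QEMU.
 */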
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;

    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }
}

static int whpx_vcpu_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    struct whpx_breakpoint *stepped_over_bp = NULL;
    WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
    int ret;

    g_assert(bql_locked());

    if (whpx->running_cpus++ == 0) {
        /* Insert breakpoints into memory, update exception exit bitmap. */
        ret = whpx_first_vcpu_starting(cpu);
        if (ret != 0) {
            return ret;
        }
    }

    if (whpx->breakpoints.breakpoints &&
        whpx->breakpoints.breakpoints->used > 0) {
        uint64_t pc = whpx_vcpu_get_pc(cpu, true);
        stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
        if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
            stepped_over_bp = NULL;
        }

        if (stepped_over_bp) {
            /*
             * We are trying to run the instruction overwritten by an active
             * breakpoint. We will temporarily disable the breakpoint, suspend
             * other CPUs, and step over the instruction.
             */
            exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
        }
    }

    if (exclusive_step_mode == WHPX_STEP_NONE) {
        whpx_vcpu_process_async_events(cpu);
        if (cpu->halted && !whpx_apic_in_platform()) {
            cpu->exception_index = EXCP_HLT;
            qatomic_set(&cpu->exit_request, false);
            return 0;
        }
    }

    bql_unlock();

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        start_exclusive();
        g_assert(cpu == current_cpu);
        g_assert(!cpu->running);
        cpu->running = true;

        hr = whpx_set_exception_exit_bitmap(
            1UL << WHvX64ExceptionTypeDebugTrapOrFault);
        if (!SUCCEEDED(hr)) {
            error_report("WHPX: Failed to update exception exit mask, "
                         "hr=%08lx.", hr);
            return 1;
        }

        if (stepped_over_bp) {
            /* Temporarily disable the triggered breakpoint. */
            cpu_memory_rw_debug(cpu,
                stepped_over_bp->address,
                &stepped_over_bp->original_instruction,
                1,
                true);
        }
    } else {
        cpu_exec_start(cpu);
    }

    do {
        if (cpu->vcpu_dirty) {
            whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        if (exclusive_step_mode == WHPX_STEP_NONE) {
            whpx_vcpu_pre_run(cpu);

            if (qatomic_read(&cpu->exit_request)) {
                whpx_vcpu_kick(cpu);
            }
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu, true, NULL);
        }

        hr = whp_dispatch.WHvRunVirtualProcessor(
            whpx->partition, cpu->cpu_index,
            &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to exec a virtual processor,"
                         " hr=%08lx", hr);
            ret = -1;
            break;
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu,
                false,
                &vcpu->exit_ctx.VpContext.Rflags);
        }

        whpx_vcpu_post_run(cpu);

        switch (vcpu->exit_ctx.ExitReason) {
        case WHvRunVpExitReasonMemoryAccess:
            ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
            break;

        case WHvRunVpExitReasonX64IoPortAccess:
            ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
            break;

        case WHvRunVpExitReasonX64InterruptWindow:
            vcpu->ready_for_pic_interrupt = 1;
            vcpu->window_registered = 0;
            ret = 0;
            break;

        case WHvRunVpExitReasonX64ApicEoi:
            assert(whpx_apic_in_platform());
            ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
            break;

        case WHvRunVpExitReasonX64Halt:
            /*
             * WARNING: as of build 19043.1526 (21H1), this exit reason is no
             * longer used.
             */
            ret = whpx_handle_halt(cpu);
            break;

        case WHvRunVpExitReasonX64ApicInitSipiTrap: {
            WHV_INTERRUPT_CONTROL ipi = {0};
            uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
            uint32_t delivery_mode =
                (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
            int dest_shorthand =
                (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
            bool broadcast = false;
            bool include_self = false;
            uint32_t i;

            /* We only registered for INIT and SIPI exits. */
            if ((delivery_mode != APIC_DM_INIT) &&
                (delivery_mode != APIC_DM_SIPI)) {
                error_report(
                    "WHPX: Unexpected APIC exit that is not an INIT or SIPI");
                break;
            }

            if (delivery_mode == APIC_DM_INIT) {
                ipi.Type = WHvX64InterruptTypeInit;
            } else {
                ipi.Type = WHvX64InterruptTypeSipi;
            }

            ipi.DestinationMode =
                ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
                    WHvX64InterruptDestinationModeLogical :
                    WHvX64InterruptDestinationModePhysical;

            ipi.TriggerMode =
                ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
                    WHvX64InterruptTriggerModeLevel :
                    WHvX64InterruptTriggerModeEdge;

            ipi.Vector = icr & APIC_VECTOR_MASK;
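            /*
             * Illustrative decode, assuming the classic MP-startup broadcast
             * SIPI value of ICR == 0x000c4620: delivery_mode == APIC_DM_SIPI,
             * dest_shorthand == 3 (all excluding self), Vector == 0x20, so
             * the loop below sends WHvX64InterruptTypeSipi to every other
             * VCPU.
             */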
            switch (dest_shorthand) {
            /* no shorthand. Bits 56-63 contain the destination. */
            case 0:
                ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report("WHPX: Failed to request interrupt, hr=%08lx",
                        hr);
                }

                break;

            /* self */
            case 1:
                include_self = true;
                break;

            /* broadcast, including self */
            case 2:
                broadcast = true;
                include_self = true;
                break;

            /* broadcast, excluding self */
            case 3:
                broadcast = true;
                break;
            }

            if (!broadcast && !include_self) {
                break;
            }

            for (i = 0; i <= max_vcpu_index; i++) {
                if (i == cpu->cpu_index && !include_self) {
                    continue;
                }

                /*
                 * We assume APIC IDs are identity-mapped, since the
                 * WHvX64RegisterApicId and WHvX64RegisterInitialApicId
                 * registers are not handled yet and the hypervisor doesn't
                 * allow the guest to modify the APIC ID.
                 */
                ipi.Destination = i;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report(
                        "WHPX: Failed to request SIPI for %d, hr=%08lx",
                        i, hr);
                }
            }

            break;
        }

        case WHvRunVpExitReasonCanceled:
            if (exclusive_step_mode != WHPX_STEP_NONE) {
                /*
                 * We are trying to step over a single instruction, and
                 * likely got a request to stop from another thread.
                 * Delay it until we are done stepping over.
                 */
                ret = 0;
            } else {
                cpu->exception_index = EXCP_INTERRUPT;
                ret = 1;
            }
            break;
        case WHvRunVpExitReasonX64MsrAccess: {
            WHV_REGISTER_VALUE reg_values[3] = {0};
            WHV_REGISTER_NAME reg_names[3];
            UINT32 reg_count;

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRdx;

            reg_values[0].Reg64 =
                vcpu->exit_ctx.VpContext.Rip +
                vcpu->exit_ctx.VpContext.InstructionLength;

            /*
             * For all unsupported MSR access we:
             *     ignore writes
             *     return 0 on read.
             */
            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
                        1 : 3;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                reg_names, reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set MsrAccess state"
                             " registers, hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
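        /*
         * Net effect of the MSR handling above: an unsupported RDMSR returns
         * 0 in RAX and RDX and skips the instruction, while an unsupported
         * WRMSR only skips the instruction (reg_count == 1 updates just
         * WHvX64RegisterRip).
         */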
        case WHvRunVpExitReasonX64MsrAccess: {
            WHV_REGISTER_VALUE reg_values[3] = {0};
            WHV_REGISTER_NAME reg_names[3];
            UINT32 reg_count;

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRdx;

            reg_values[0].Reg64 =
                vcpu->exit_ctx.VpContext.Rip +
                vcpu->exit_ctx.VpContext.InstructionLength;

            /*
             * For all unsupported MSR accesses we:
             *     ignore writes
             *     return 0 on reads.
             */
            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
                        1 : 3;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                reg_names, reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set MsrAccess state"
                             " registers, hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonX64Cpuid: {
            WHV_REGISTER_VALUE reg_values[5];
            WHV_REGISTER_NAME reg_names[5];
            UINT32 reg_count = 5;
            UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
            X86CPU *x86_cpu = X86_CPU(cpu);
            CPUX86State *env = &x86_cpu->env;

            memset(reg_values, 0, sizeof(reg_values));

            rip = vcpu->exit_ctx.VpContext.Rip +
                  vcpu->exit_ctx.VpContext.InstructionLength;
            cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;

            /*
             * Ideally, these should be supplied to the hypervisor during VCPU
             * initialization and it should be able to satisfy this request.
             * But, currently, WHPX doesn't support setting CPUID values in the
             * hypervisor once the partition has been setup, which is too late
             * since VCPUs are realized later. For now, use the values from
             * QEMU to satisfy these requests, until WHPX adds support for
             * being able to set these values in the hypervisor at runtime.
             */
            cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
                          (UINT32 *)&rcx, (UINT32 *)&rdx);
            switch (cpuid_fn) {
            case 0x40000000:
                /* Expose the VMware CPU frequency CPUID leaf */
                rax = 0x40000010;
                rbx = rcx = rdx = 0;
                break;

            case 0x40000010:
                rax = env->tsc_khz;
                rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
                rcx = rdx = 0;
                break;

            case 0x80000001:
                /* Remove any support for OSVW */
                rcx &= ~CPUID_EXT3_OSVW;
                break;
            }

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRcx;
            reg_names[3] = WHvX64RegisterRdx;
            reg_names[4] = WHvX64RegisterRbx;

            reg_values[0].Reg64 = rip;
            reg_values[1].Reg64 = rax;
            reg_values[2].Reg64 = rcx;
            reg_values[3].Reg64 = rdx;
            reg_values[4].Reg64 = rbx;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition, cpu->cpu_index,
                reg_names,
                reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set CpuidAccess state registers,"
                             " hr=%08lx", hr);
            }
            ret = 0;
            break;
        }

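        /*
         * Exception exits are only enabled while the gdbstub is attached
         * (see whpx_set_exception_exit_bitmap()). Debug traps are triaged
         * below into a software breakpoint hit, the completion of a step
         * over a breakpoint, or an event to report to the debugger.
         */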
        case WHvRunVpExitReasonException:
            whpx_get_registers(cpu);

            if ((vcpu->exit_ctx.VpException.ExceptionType ==
                 WHvX64ExceptionTypeDebugTrapOrFault) &&
                (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
                (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
                 whpx_breakpoint_instruction)) {
                /* Stopped at a software breakpoint. */
                cpu->exception_index = EXCP_DEBUG;
            } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
                        WHvX64ExceptionTypeDebugTrapOrFault) &&
                       !cpu->singlestep_enabled) {
                /*
                 * Just finished stepping over a breakpoint, but gdb does
                 * not expect us to do single-stepping.
                 * Don't do anything special.
                 */
                cpu->exception_index = EXCP_INTERRUPT;
            } else {
                /* Another exception or debug event. Report it to GDB. */
                cpu->exception_index = EXCP_DEBUG;
            }

            ret = 1;
            break;
        case WHvRunVpExitReasonNone:
        case WHvRunVpExitReasonUnrecoverableException:
        case WHvRunVpExitReasonInvalidVpRegisterValue:
        case WHvRunVpExitReasonUnsupportedFeature:
        default:
            error_report("WHPX: Unexpected VP exit code %d",
                         vcpu->exit_ctx.ExitReason);
            whpx_get_registers(cpu);
            bql_lock();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            bql_unlock();
            break;
        }

    } while (!ret);

    if (stepped_over_bp) {
        /* Restore the breakpoint we stepped over */
        cpu_memory_rw_debug(cpu,
            stepped_over_bp->address,
            (void *)&whpx_breakpoint_instruction,
            1,
            true);
    }

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        g_assert(cpu_in_exclusive_context(cpu));
        cpu->running = false;
        end_exclusive();

        exclusive_step_mode = WHPX_STEP_NONE;
    } else {
        cpu_exec_end(cpu);
    }

    bql_lock();
    current_cpu = cpu;

    if (--whpx->running_cpus == 0) {
        whpx_last_vcpu_stopping(cpu);
    }

    qatomic_set(&cpu->exit_request, false);

    return ret < 0;
}

static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (!cpu->vcpu_dirty) {
        whpx_get_registers(cpu);
        cpu->vcpu_dirty = true;
    }
}

static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->vcpu_dirty = false;
}

static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->vcpu_dirty = false;
}

static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}

/*
 * CPU support.
 */

void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (!cpu->vcpu_dirty) {
        run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
    }
}

void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}

void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}

void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}

static void whpx_pre_resume_vm(AccelState *as, bool step_pending)
{
    whpx_global.step_pending = step_pending;
}

/*
 * Vcpu support.
 */

static Error *whpx_migration_blocker;

static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (running) {
        env->tsc_valid = false;
    }
}

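/*
 * Creates the instruction emulator and the WHP virtual processor for the
 * given QEMU vCPU, and determines the TSC and APIC bus frequencies that the
 * guest will observe.
 */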
int whpx_init_vcpu(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = NULL;
    Error *local_error = NULL;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    UINT64 freq = 0;
    int ret;

    /*
     * Add migration blockers for all unsupported features of the
     * Windows Hypervisor Platform.
     */
    if (whpx_migration_blocker == NULL) {
        error_setg(&whpx_migration_blocker,
                   "State blocked due to non-migratable CPUID feature support, "
                   "dirty memory tracking support, and XSAVE/XRSTOR support");

        if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu = g_new0(AccelCPUState, 1);

    hr = whp_dispatch.WHvEmulatorCreateEmulator(
        &whpx_emu_callbacks,
        &vcpu->emulator);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup instruction completion support,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreateVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create a virtual processor,"
                     " hr=%08lx", hr);
        whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
        ret = -EINVAL;
        goto error;
    }

    /*
     * The vCPU's TSC frequency is either specified by the user or, failing
     * that, taken from Hyper-V. In the latter case, we query it from Hyper-V
     * and record it in env->tsc_khz, so that the vCPU's TSC frequency can be
     * migrated later via this field.
     */
    if (!env->tsc_khz) {
        hr = whp_dispatch.WHvGetCapability(
            WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
            NULL);
        if (hr != WHV_E_UNKNOWN_CAPABILITY) {
            if (FAILED(hr)) {
                printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
            } else {
                env->tsc_khz = freq / 1000; /* Hz to KHz */
            }
        }
    }

    env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
    if (hr != WHV_E_UNKNOWN_CAPABILITY) {
        if (FAILED(hr)) {
            printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
        } else {
            env->apic_bus_freq = freq;
        }
    }

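    /*
     * Leaf 0x40000000 reports 0x40000010 as the highest supported leaf,
     * and leaf 0x40000010 returns the TSC frequency in EAX and the APIC
     * bus frequency in EBX, both in kHz, as synthesized from env->tsc_khz
     * and env->apic_bus_freq by the WHvRunVpExitReasonX64Cpuid handler
     * above.
     */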
2211 */ 2212 if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) { 2213 UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010}; 2214 2215 hr = whp_dispatch.WHvSetPartitionProperty( 2216 whpx->partition, 2217 WHvPartitionPropertyCodeCpuidExitList, 2218 cpuidExitList, 2219 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2220 2221 if (FAILED(hr)) { 2222 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2223 hr); 2224 ret = -EINVAL; 2225 goto error; 2226 } 2227 } 2228 2229 vcpu->interruptable = true; 2230 cpu->vcpu_dirty = true; 2231 cpu->accel = vcpu; 2232 max_vcpu_index = max(max_vcpu_index, cpu->cpu_index); 2233 qemu_add_vm_change_state_handler(whpx_cpu_update_state, env); 2234 2235 return 0; 2236 2237 error: 2238 g_free(vcpu); 2239 2240 return ret; 2241 } 2242 2243 int whpx_vcpu_exec(CPUState *cpu) 2244 { 2245 int ret; 2246 int fatal; 2247 2248 for (;;) { 2249 if (cpu->exception_index >= EXCP_INTERRUPT) { 2250 ret = cpu->exception_index; 2251 cpu->exception_index = -1; 2252 break; 2253 } 2254 2255 fatal = whpx_vcpu_run(cpu); 2256 2257 if (fatal) { 2258 error_report("WHPX: Failed to exec a virtual processor"); 2259 abort(); 2260 } 2261 } 2262 2263 return ret; 2264 } 2265 2266 void whpx_destroy_vcpu(CPUState *cpu) 2267 { 2268 struct whpx_state *whpx = &whpx_global; 2269 AccelCPUState *vcpu = cpu->accel; 2270 2271 whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index); 2272 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2273 g_free(cpu->accel); 2274 } 2275 2276 void whpx_vcpu_kick(CPUState *cpu) 2277 { 2278 struct whpx_state *whpx = &whpx_global; 2279 whp_dispatch.WHvCancelRunVirtualProcessor( 2280 whpx->partition, cpu->cpu_index, 0); 2281 } 2282 2283 /* 2284 * Memory support. 2285 */ 2286 2287 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size, 2288 void *host_va, int add, int rom, 2289 const char *name) 2290 { 2291 struct whpx_state *whpx = &whpx_global; 2292 HRESULT hr; 2293 2294 /* 2295 if (add) { 2296 printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n", 2297 (void*)start_pa, (void*)size, host_va, 2298 (rom ? "ROM" : "RAM"), name); 2299 } else { 2300 printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n", 2301 (void*)start_pa, (void*)size, host_va, name); 2302 } 2303 */ 2304 2305 if (add) { 2306 hr = whp_dispatch.WHvMapGpaRange(whpx->partition, 2307 host_va, 2308 start_pa, 2309 size, 2310 (WHvMapGpaRangeFlagRead | 2311 WHvMapGpaRangeFlagExecute | 2312 (rom ? 0 : WHvMapGpaRangeFlagWrite))); 2313 } else { 2314 hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition, 2315 start_pa, 2316 size); 2317 } 2318 2319 if (FAILED(hr)) { 2320 error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes," 2321 " Host:%p, hr=%08lx", 2322 (add ? 
"MAP" : "UNMAP"), name, 2323 (void *)(uintptr_t)start_pa, (void *)size, host_va, hr); 2324 } 2325 } 2326 2327 static void whpx_process_section(MemoryRegionSection *section, int add) 2328 { 2329 MemoryRegion *mr = section->mr; 2330 hwaddr start_pa = section->offset_within_address_space; 2331 ram_addr_t size = int128_get64(section->size); 2332 unsigned int delta; 2333 uint64_t host_va; 2334 2335 if (!memory_region_is_ram(mr)) { 2336 return; 2337 } 2338 2339 delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask()); 2340 delta &= ~qemu_real_host_page_mask(); 2341 if (delta > size) { 2342 return; 2343 } 2344 start_pa += delta; 2345 size -= delta; 2346 size &= qemu_real_host_page_mask(); 2347 if (!size || (start_pa & ~qemu_real_host_page_mask())) { 2348 return; 2349 } 2350 2351 host_va = (uintptr_t)memory_region_get_ram_ptr(mr) 2352 + section->offset_within_region + delta; 2353 2354 whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add, 2355 memory_region_is_rom(mr), mr->name); 2356 } 2357 2358 static void whpx_region_add(MemoryListener *listener, 2359 MemoryRegionSection *section) 2360 { 2361 memory_region_ref(section->mr); 2362 whpx_process_section(section, 1); 2363 } 2364 2365 static void whpx_region_del(MemoryListener *listener, 2366 MemoryRegionSection *section) 2367 { 2368 whpx_process_section(section, 0); 2369 memory_region_unref(section->mr); 2370 } 2371 2372 static void whpx_transaction_begin(MemoryListener *listener) 2373 { 2374 } 2375 2376 static void whpx_transaction_commit(MemoryListener *listener) 2377 { 2378 } 2379 2380 static void whpx_log_sync(MemoryListener *listener, 2381 MemoryRegionSection *section) 2382 { 2383 MemoryRegion *mr = section->mr; 2384 2385 if (!memory_region_is_ram(mr)) { 2386 return; 2387 } 2388 2389 memory_region_set_dirty(mr, 0, int128_get64(section->size)); 2390 } 2391 2392 static MemoryListener whpx_memory_listener = { 2393 .name = "whpx", 2394 .begin = whpx_transaction_begin, 2395 .commit = whpx_transaction_commit, 2396 .region_add = whpx_region_add, 2397 .region_del = whpx_region_del, 2398 .log_sync = whpx_log_sync, 2399 .priority = MEMORY_LISTENER_PRIORITY_ACCEL, 2400 }; 2401 2402 static void whpx_memory_init(void) 2403 { 2404 memory_listener_register(&whpx_memory_listener, &address_space_memory); 2405 } 2406 2407 /* 2408 * Load the functions from the given library, using the given handle. If a 2409 * handle is provided, it is used, otherwise the library is opened. The 2410 * handle will be updated on return with the opened one. 
2411 */ 2412 static bool load_whp_dispatch_fns(HMODULE *handle, 2413 WHPFunctionList function_list) 2414 { 2415 HMODULE hLib = *handle; 2416 2417 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll" 2418 #define WINHV_EMULATION_DLL "WinHvEmulation.dll" 2419 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \ 2420 whp_dispatch.function_name = \ 2421 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2422 2423 #define WHP_LOAD_FIELD(return_type, function_name, signature) \ 2424 whp_dispatch.function_name = \ 2425 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2426 if (!whp_dispatch.function_name) { \ 2427 error_report("Could not load function %s", #function_name); \ 2428 goto error; \ 2429 } \ 2430 2431 #define WHP_LOAD_LIB(lib_name, handle_lib) \ 2432 if (!handle_lib) { \ 2433 handle_lib = LoadLibrary(lib_name); \ 2434 if (!handle_lib) { \ 2435 error_report("Could not load library %s.", lib_name); \ 2436 goto error; \ 2437 } \ 2438 } \ 2439 2440 switch (function_list) { 2441 case WINHV_PLATFORM_FNS_DEFAULT: 2442 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2443 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD) 2444 break; 2445 2446 case WINHV_EMULATION_FNS_DEFAULT: 2447 WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib) 2448 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD) 2449 break; 2450 2451 case WINHV_PLATFORM_FNS_SUPPLEMENTAL: 2452 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2453 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL) 2454 break; 2455 } 2456 2457 *handle = hLib; 2458 return true; 2459 2460 error: 2461 if (hLib) { 2462 FreeLibrary(hLib); 2463 } 2464 2465 return false; 2466 } 2467 2468 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v, 2469 const char *name, void *opaque, 2470 Error **errp) 2471 { 2472 struct whpx_state *whpx = &whpx_global; 2473 OnOffSplit mode; 2474 2475 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 2476 return; 2477 } 2478 2479 switch (mode) { 2480 case ON_OFF_SPLIT_ON: 2481 whpx->kernel_irqchip_allowed = true; 2482 whpx->kernel_irqchip_required = true; 2483 break; 2484 2485 case ON_OFF_SPLIT_OFF: 2486 whpx->kernel_irqchip_allowed = false; 2487 whpx->kernel_irqchip_required = false; 2488 break; 2489 2490 case ON_OFF_SPLIT_SPLIT: 2491 error_setg(errp, "WHPX: split irqchip currently not supported"); 2492 error_append_hint(errp, 2493 "Try without kernel-irqchip or with kernel-irqchip=on|off"); 2494 break; 2495 2496 default: 2497 /* 2498 * The value was checked in visit_type_OnOffSplit() above. If 2499 * we get here, then something is wrong in QEMU. 
2500 */ 2501 abort(); 2502 } 2503 } 2504 2505 static void whpx_cpu_instance_init(CPUState *cs) 2506 { 2507 X86CPU *cpu = X86_CPU(cs); 2508 2509 host_cpu_instance_init(cpu); 2510 } 2511 2512 static void whpx_cpu_accel_class_init(ObjectClass *oc, const void *data) 2513 { 2514 AccelCPUClass *acc = ACCEL_CPU_CLASS(oc); 2515 2516 acc->cpu_instance_init = whpx_cpu_instance_init; 2517 } 2518 2519 static const TypeInfo whpx_cpu_accel_type = { 2520 .name = ACCEL_CPU_NAME("whpx"), 2521 2522 .parent = TYPE_ACCEL_CPU, 2523 .class_init = whpx_cpu_accel_class_init, 2524 .abstract = true, 2525 }; 2526 2527 /* 2528 * Partition support 2529 */ 2530 2531 static int whpx_accel_init(AccelState *as, MachineState *ms) 2532 { 2533 struct whpx_state *whpx; 2534 int ret; 2535 HRESULT hr; 2536 WHV_CAPABILITY whpx_cap; 2537 UINT32 whpx_cap_size; 2538 WHV_PARTITION_PROPERTY prop; 2539 UINT32 cpuidExitList[] = {1, 0x80000001}; 2540 WHV_CAPABILITY_FEATURES features = {0}; 2541 2542 whpx = &whpx_global; 2543 2544 if (!init_whp_dispatch()) { 2545 ret = -ENOSYS; 2546 goto error; 2547 } 2548 2549 whpx->mem_quota = ms->ram_size; 2550 2551 hr = whp_dispatch.WHvGetCapability( 2552 WHvCapabilityCodeHypervisorPresent, &whpx_cap, 2553 sizeof(whpx_cap), &whpx_cap_size); 2554 if (FAILED(hr) || !whpx_cap.HypervisorPresent) { 2555 error_report("WHPX: No accelerator found, hr=%08lx", hr); 2556 ret = -ENOSPC; 2557 goto error; 2558 } 2559 2560 hr = whp_dispatch.WHvGetCapability( 2561 WHvCapabilityCodeFeatures, &features, sizeof(features), NULL); 2562 if (FAILED(hr)) { 2563 error_report("WHPX: Failed to query capabilities, hr=%08lx", hr); 2564 ret = -EINVAL; 2565 goto error; 2566 } 2567 2568 hr = whp_dispatch.WHvCreatePartition(&whpx->partition); 2569 if (FAILED(hr)) { 2570 error_report("WHPX: Failed to create partition, hr=%08lx", hr); 2571 ret = -EINVAL; 2572 goto error; 2573 } 2574 2575 /* 2576 * Query the XSAVE capability of the partition. Any error here is not 2577 * considered fatal. 2578 */ 2579 hr = whp_dispatch.WHvGetPartitionProperty( 2580 whpx->partition, 2581 WHvPartitionPropertyCodeProcessorXsaveFeatures, 2582 &whpx_xsave_cap, 2583 sizeof(whpx_xsave_cap), 2584 &whpx_cap_size); 2585 2586 /* 2587 * Windows version which don't support this property will return with the 2588 * specific error code. 2589 */ 2590 if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) { 2591 error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr); 2592 } 2593 2594 if (!whpx_has_xsave()) { 2595 printf("WHPX: Partition is not XSAVE capable\n"); 2596 } 2597 2598 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2599 prop.ProcessorCount = ms->smp.cpus; 2600 hr = whp_dispatch.WHvSetPartitionProperty( 2601 whpx->partition, 2602 WHvPartitionPropertyCodeProcessorCount, 2603 &prop, 2604 sizeof(WHV_PARTITION_PROPERTY)); 2605 2606 if (FAILED(hr)) { 2607 error_report("WHPX: Failed to set partition processor count to %u," 2608 " hr=%08lx", prop.ProcessorCount, hr); 2609 ret = -EINVAL; 2610 goto error; 2611 } 2612 2613 /* 2614 * Error out if WHP doesn't support apic emulation and user is requiring 2615 * it. 2616 */ 2617 if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation || 2618 !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) { 2619 error_report("WHPX: kernel irqchip requested, but unavailable. 
" 2620 "Try without kernel-irqchip or with kernel-irqchip=off"); 2621 ret = -EINVAL; 2622 goto error; 2623 } 2624 2625 if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation && 2626 whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) { 2627 WHV_X64_LOCAL_APIC_EMULATION_MODE mode = 2628 WHvX64LocalApicEmulationModeXApic; 2629 printf("WHPX: setting APIC emulation mode in the hypervisor\n"); 2630 hr = whp_dispatch.WHvSetPartitionProperty( 2631 whpx->partition, 2632 WHvPartitionPropertyCodeLocalApicEmulationMode, 2633 &mode, 2634 sizeof(mode)); 2635 if (FAILED(hr)) { 2636 error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr); 2637 if (whpx->kernel_irqchip_required) { 2638 error_report("WHPX: kernel irqchip requested, but unavailable"); 2639 ret = -EINVAL; 2640 goto error; 2641 } 2642 } else { 2643 whpx->apic_in_platform = true; 2644 } 2645 } 2646 2647 /* Register for MSR and CPUID exits */ 2648 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2649 prop.ExtendedVmExits.X64MsrExit = 1; 2650 prop.ExtendedVmExits.X64CpuidExit = 1; 2651 prop.ExtendedVmExits.ExceptionExit = 1; 2652 if (whpx_apic_in_platform()) { 2653 prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1; 2654 } 2655 2656 hr = whp_dispatch.WHvSetPartitionProperty( 2657 whpx->partition, 2658 WHvPartitionPropertyCodeExtendedVmExits, 2659 &prop, 2660 sizeof(WHV_PARTITION_PROPERTY)); 2661 if (FAILED(hr)) { 2662 error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr); 2663 ret = -EINVAL; 2664 goto error; 2665 } 2666 2667 hr = whp_dispatch.WHvSetPartitionProperty( 2668 whpx->partition, 2669 WHvPartitionPropertyCodeCpuidExitList, 2670 cpuidExitList, 2671 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2672 2673 if (FAILED(hr)) { 2674 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2675 hr); 2676 ret = -EINVAL; 2677 goto error; 2678 } 2679 2680 /* 2681 * We do not want to intercept any exceptions from the guest, 2682 * until we actually start debugging with gdb. 
2683 */ 2684 whpx->exception_exit_bitmap = -1; 2685 hr = whpx_set_exception_exit_bitmap(0); 2686 2687 if (FAILED(hr)) { 2688 error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr); 2689 ret = -EINVAL; 2690 goto error; 2691 } 2692 2693 hr = whp_dispatch.WHvSetupPartition(whpx->partition); 2694 if (FAILED(hr)) { 2695 error_report("WHPX: Failed to setup partition, hr=%08lx", hr); 2696 ret = -EINVAL; 2697 goto error; 2698 } 2699 2700 whpx_memory_init(); 2701 2702 printf("Windows Hypervisor Platform accelerator is operational\n"); 2703 return 0; 2704 2705 error: 2706 2707 if (NULL != whpx->partition) { 2708 whp_dispatch.WHvDeletePartition(whpx->partition); 2709 whpx->partition = NULL; 2710 } 2711 2712 return ret; 2713 } 2714 2715 bool whpx_apic_in_platform(void) { 2716 return whpx_global.apic_in_platform; 2717 } 2718 2719 static void whpx_accel_class_init(ObjectClass *oc, const void *data) 2720 { 2721 AccelClass *ac = ACCEL_CLASS(oc); 2722 ac->name = "WHPX"; 2723 ac->init_machine = whpx_accel_init; 2724 ac->pre_resume_vm = whpx_pre_resume_vm; 2725 ac->allowed = &whpx_allowed; 2726 2727 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 2728 NULL, whpx_set_kernel_irqchip, 2729 NULL, NULL); 2730 object_class_property_set_description(oc, "kernel-irqchip", 2731 "Configure WHPX in-kernel irqchip"); 2732 } 2733 2734 static void whpx_accel_instance_init(Object *obj) 2735 { 2736 struct whpx_state *whpx = &whpx_global; 2737 2738 memset(whpx, 0, sizeof(struct whpx_state)); 2739 /* Turn on kernel-irqchip, by default */ 2740 whpx->kernel_irqchip_allowed = true; 2741 } 2742 2743 static const TypeInfo whpx_accel_type = { 2744 .name = ACCEL_CLASS_NAME("whpx"), 2745 .parent = TYPE_ACCEL, 2746 .instance_init = whpx_accel_instance_init, 2747 .class_init = whpx_accel_class_init, 2748 }; 2749 2750 static void whpx_type_init(void) 2751 { 2752 type_register_static(&whpx_accel_type); 2753 type_register_static(&whpx_cpu_accel_type); 2754 } 2755 2756 bool init_whp_dispatch(void) 2757 { 2758 if (whp_dispatch_initialized) { 2759 return true; 2760 } 2761 2762 if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) { 2763 goto error; 2764 } 2765 2766 if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) { 2767 goto error; 2768 } 2769 2770 assert(load_whp_dispatch_fns(&hWinHvPlatform, 2771 WINHV_PLATFORM_FNS_SUPPLEMENTAL)); 2772 whp_dispatch_initialized = true; 2773 2774 return true; 2775 error: 2776 if (hWinHvPlatform) { 2777 FreeLibrary(hWinHvPlatform); 2778 } 2779 2780 if (hWinHvEmulation) { 2781 FreeLibrary(hWinHvEmulation); 2782 } 2783 2784 return false; 2785 } 2786 2787 type_init(whpx_type_init); 2788