/*
 * QEMU Windows Hypervisor Platform accelerator (WHPX)
 *
 * Copyright Microsoft Corp. 2017
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/address-spaces.h"
#include "exec/ioport.h"
#include "exec/gdbstub.h"
#include "qemu/accel.h"
#include "sysemu/whpx.h"
#include "sysemu/cpus.h"
#include "sysemu/runstate.h"
#include "qemu/main-loop.h"
#include "hw/boards.h"
#include "hw/i386/ioapic.h"
#include "hw/i386/apic_internal.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "migration/blocker.h"
#include <winerror.h>

#include "whpx-internal.h"
#include "whpx-accel-ops.h"

#include <WinHvPlatform.h>
#include <WinHvEmulation.h>

#define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)

/*
 * The set of registers transferred between QEMU's CPUX86State and the WHPX
 * virtual processor on each full state sync.  The ORDER IS SIGNIFICANT:
 * whpx_set_registers()/whpx_get_registers() walk this table with a running
 * index and assert the expected register name at each step, so any change
 * here must be mirrored in both of those functions.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};

/* One value slot per entry of whpx_register_names, in the same order. */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};

/*
 * The current implementation of instruction stepping sets the TF flag
 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
 *
 * This approach has a few limitations:
 * 1.
Stepping over a PUSHF/LAHF instruction will save the TF flag
 * along with the other flags, possibly restoring it later. It would
 * result in another INT1 when the flags are restored, triggering
 * a stop in gdb that could be cleared by doing another step.
 *
 * Stepping over a POPF/SAHF instruction will let it overwrite the
 * TF flags, ending the stepping mode.
 *
 * 2. Stepping over an instruction raising an exception (e.g. INT, DIV,
 * or anything that could result in a page fault) will save the flags
 * to the stack, clear the TF flag, and let the guest execute the
 * handler. Normally, the guest will restore the original flags,
 * that will continue single-stepping.
 *
 * 3. Debuggers running on the guest may wish to set TF to do instruction
 * stepping. INT1 events generated by it would be intercepted by us,
 * as long as the gdb is connected to QEMU.
 *
 * In practice this means that:
 * 1. Stepping through flags-modifying instructions may cause gdb to
 * continue or stop in unexpected places. This will be fully recoverable
 * and will not crash the target.
 *
 * 2. Stepping over an instruction that triggers an exception will step
 * over the exception handler, not into it.
 *
 * 3. Debugging the guest via gdb, while running debugger on the guest
 * at the same time may lead to unexpected effects. Removing all
 * breakpoints set via QEMU will prevent any further interference
 * with the guest-level debuggers.
 *
 * The limitations can be addressed as shown below:
 * 1. PUSHF/LAHF/POPF/SAHF/IRET instructions can be emulated instead of
 * stepping through them.
The exact semantics of the instructions is 191 * defined in the "Combined Volume Set of Intel 64 and IA-32 192 * Architectures Software Developer's Manuals", however it involves a 193 * fair amount of corner cases due to compatibility with real mode, 194 * virtual 8086 mode, and differences between 64-bit and 32-bit modes. 195 * 196 * 2. We could step into the guest's exception handlers using the following 197 * sequence: 198 * a. Temporarily enable catching of all exception types via 199 * whpx_set_exception_exit_bitmap(). 200 * b. Once an exception is intercepted, read the IDT/GDT and locate 201 * the original handler. 202 * c. Patch the original handler, injecting an INT3 at the beginning. 203 * d. Update the exception exit bitmap to only catch the 204 * WHvX64ExceptionTypeBreakpointTrap exception. 205 * e. Let the affected CPU run in the exclusive mode. 206 * f. Restore the original handler and the exception exit bitmap. 207 * Note that handling all corner cases related to IDT/GDT is harder 208 * than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a 209 * rough idea. 210 * 211 * 3. In order to properly support guest-level debugging in parallel with 212 * the QEMU-level debugging, we would need to be able to pass some INT1 213 * events to the guest. This could be done via the following methods: 214 * a. Using the WHvRegisterPendingEvent register. As of Windows 21H1, 215 * it seems to only work for interrupts and not software 216 * exceptions. 217 * b. Locating and patching the original handler by parsing IDT/GDT. 218 * This involves relatively complex logic outlined in the previous 219 * paragraph. 220 * c. Emulating the exception invocation (i.e. manually updating RIP, 221 * RFLAGS, and pushing the old values to stack). This is even more 222 * complicated than the previous option, since it involves checking 223 * CPL, gate attributes, and doing various adjustments depending 224 * on the current CPU mode, whether the CPL is changing, etc. 
 */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;

/* Per-vCPU WHPX state; hung off CPUState::hax_vcpu (see get_whpx_vcpu). */
struct whpx_vcpu {
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    /* Cached CR8 encoding of the APIC TPR (see whpx_apic_tpr_to_cr8) */
    uint64_t tpr;
    uint64_t apic_base;
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};

static bool whpx_allowed;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;

/* Whether the hypervisor reported XSAVE support for this partition. */
static bool whpx_has_xsave(void)
{
    return whpx_xsave_cap.XsaveSupport;
}

/*
 * VP support
 */

/* Retrieve the per-vCPU WHPX state stashed on the CPUState. */
static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu)
{
    return (struct whpx_vcpu *)cpu->hax_vcpu;
}

/*
 * Translate a QEMU segment descriptor cache entry into the WHPX segment
 * register layout.  v86/r86 indicate virtual-8086 and real mode
 * respectively; in v86 mode the attributes are synthesized rather than
 * taken from the descriptor flags.
 */
static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER hs;
    unsigned flags = qs->flags;

    hs.Base = qs->base;
    hs.Limit = qs->limit;
    hs.Selector = qs->selector;

    if (v86) {
        hs.Attributes = 0;
        hs.SegmentType = 3;
        hs.Present = 1;
        hs.DescriptorPrivilegeLevel = 3;
        hs.NonSystemSegment = 1;

    } else {
        hs.Attributes = (flags >> DESC_TYPE_SHIFT);

        if (r86) {
            /* hs.Base &= 0xfffff; */
        }
    }

    return hs;
}

/* Inverse of whpx_seg_q2h: WHPX segment register -> QEMU descriptor cache. */
static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
{
    SegmentCache qs;

    qs.base = hs->Base;
    qs.limit = hs->Limit;
    qs.selector = hs->Selector;

    qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;

    return qs;
}

/* X64 Extended Control Registers: push env->xcr0 to the virtual processor. */
static void whpx_set_xcrs(CPUState *cpu)
{
    CPUX86State *env = cpu->env_ptr;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    xcr0.Reg64 = env->xcr0;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
    }
}

/*
 * Push env->tsc to the virtual processor.  Returns 0 on success,
 * -1 on failure to set the register (a failed partition suspend is
 * only a warning).
 */
static int whpx_set_tsc(CPUState *cpu)
{
    CPUX86State *env = cpu->env_ptr;
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {

        /*
         * Unable to suspend partition while setting TSC is not a fatal
         * error. It just increases the likelihood of TSC variance between
         * vCPUs and some guest OS are able to handle that just fine.
         */
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    tsc_val.Reg64 = env->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
        return -1;
    }

    return 0;
}

/*
 * The CR8 register in the CPU is mapped to the TPR register of the APIC,
 * however, they use a slightly different encoding. Specifically:
 *
 *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
 *
 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
 * and IA-32 Architectures Software Developer's Manual.
376 */ 377 378 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr) 379 { 380 return tpr >> 4; 381 } 382 383 static void whpx_set_registers(CPUState *cpu, int level) 384 { 385 struct whpx_state *whpx = &whpx_global; 386 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 387 CPUX86State *env = cpu->env_ptr; 388 X86CPU *x86_cpu = X86_CPU(cpu); 389 struct whpx_register_set vcxt; 390 HRESULT hr; 391 int idx; 392 int idx_next; 393 int i; 394 int v86, r86; 395 396 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 397 398 /* 399 * Following MSRs have side effects on the guest or are too heavy for 400 * runtime. Limit them to full state update. 401 */ 402 if (level >= WHPX_SET_RESET_STATE) { 403 whpx_set_tsc(cpu); 404 } 405 406 memset(&vcxt, 0, sizeof(struct whpx_register_set)); 407 408 v86 = (env->eflags & VM_MASK); 409 r86 = !(env->cr[0] & CR0_PE_MASK); 410 411 vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 412 vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state); 413 414 idx = 0; 415 416 /* Indexes for first 16 registers match between HV and QEMU definitions */ 417 idx_next = 16; 418 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 419 vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx]; 420 } 421 idx = idx_next; 422 423 /* Same goes for RIP and RFLAGS */ 424 assert(whpx_register_names[idx] == WHvX64RegisterRip); 425 vcxt.values[idx++].Reg64 = env->eip; 426 427 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 428 vcxt.values[idx++].Reg64 = env->eflags; 429 430 /* Translate 6+4 segment registers. 
HV and QEMU order matches */ 431 assert(idx == WHvX64RegisterEs); 432 for (i = 0; i < 6; i += 1, idx += 1) { 433 vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86); 434 } 435 436 assert(idx == WHvX64RegisterLdtr); 437 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0); 438 439 assert(idx == WHvX64RegisterTr); 440 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0); 441 442 assert(idx == WHvX64RegisterIdtr); 443 vcxt.values[idx].Table.Base = env->idt.base; 444 vcxt.values[idx].Table.Limit = env->idt.limit; 445 idx += 1; 446 447 assert(idx == WHvX64RegisterGdtr); 448 vcxt.values[idx].Table.Base = env->gdt.base; 449 vcxt.values[idx].Table.Limit = env->gdt.limit; 450 idx += 1; 451 452 /* CR0, 2, 3, 4, 8 */ 453 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 454 vcxt.values[idx++].Reg64 = env->cr[0]; 455 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 456 vcxt.values[idx++].Reg64 = env->cr[2]; 457 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 458 vcxt.values[idx++].Reg64 = env->cr[3]; 459 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 460 vcxt.values[idx++].Reg64 = env->cr[4]; 461 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 462 vcxt.values[idx++].Reg64 = vcpu->tpr; 463 464 /* 8 Debug Registers - Skipped */ 465 466 /* 467 * Extended control registers needs to be handled separately depending 468 * on whether xsave is supported/enabled or not. 
469 */ 470 whpx_set_xcrs(cpu); 471 472 /* 16 XMM registers */ 473 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 474 idx_next = idx + 16; 475 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 476 vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0); 477 vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1); 478 } 479 idx = idx_next; 480 481 /* 8 FP registers */ 482 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 483 for (i = 0; i < 8; i += 1, idx += 1) { 484 vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0); 485 /* vcxt.values[idx].Fp.AsUINT128.High64 = 486 env->fpregs[i].mmx.MMX_Q(1); 487 */ 488 } 489 490 /* FP control status register */ 491 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 492 vcxt.values[idx].FpControlStatus.FpControl = env->fpuc; 493 vcxt.values[idx].FpControlStatus.FpStatus = 494 (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; 495 vcxt.values[idx].FpControlStatus.FpTag = 0; 496 for (i = 0; i < 8; ++i) { 497 vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i; 498 } 499 vcxt.values[idx].FpControlStatus.Reserved = 0; 500 vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop; 501 vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip; 502 idx += 1; 503 504 /* XMM control status register */ 505 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 506 vcxt.values[idx].XmmControlStatus.LastFpRdp = 0; 507 vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr; 508 vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff; 509 idx += 1; 510 511 /* MSRs */ 512 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 513 vcxt.values[idx++].Reg64 = env->efer; 514 #ifdef TARGET_X86_64 515 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 516 vcxt.values[idx++].Reg64 = env->kernelgsbase; 517 #endif 518 519 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 520 vcxt.values[idx++].Reg64 = 
vcpu->apic_base; 521 522 /* WHvX64RegisterPat - Skipped */ 523 524 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 525 vcxt.values[idx++].Reg64 = env->sysenter_cs; 526 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 527 vcxt.values[idx++].Reg64 = env->sysenter_eip; 528 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 529 vcxt.values[idx++].Reg64 = env->sysenter_esp; 530 assert(whpx_register_names[idx] == WHvX64RegisterStar); 531 vcxt.values[idx++].Reg64 = env->star; 532 #ifdef TARGET_X86_64 533 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 534 vcxt.values[idx++].Reg64 = env->lstar; 535 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 536 vcxt.values[idx++].Reg64 = env->cstar; 537 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 538 vcxt.values[idx++].Reg64 = env->fmask; 539 #endif 540 541 /* Interrupt / Event Registers - Skipped */ 542 543 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 544 545 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 546 whpx->partition, cpu->cpu_index, 547 whpx_register_names, 548 RTL_NUMBER_OF(whpx_register_names), 549 &vcxt.values[0]); 550 551 if (FAILED(hr)) { 552 error_report("WHPX: Failed to set virtual processor context, hr=%08lx", 553 hr); 554 } 555 556 return; 557 } 558 559 static int whpx_get_tsc(CPUState *cpu) 560 { 561 CPUX86State *env = cpu->env_ptr; 562 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 563 WHV_REGISTER_VALUE tsc_val; 564 HRESULT hr; 565 struct whpx_state *whpx = &whpx_global; 566 567 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 568 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 569 if (FAILED(hr)) { 570 error_report("WHPX: Failed to get TSC, hr=%08lx", hr); 571 return -1; 572 } 573 574 env->tsc = tsc_val.Reg64; 575 return 0; 576 } 577 578 /* X64 Extended Control Registers */ 579 static void whpx_get_xcrs(CPUState *cpu) 580 { 581 CPUX86State *env = cpu->env_ptr; 582 HRESULT hr; 583 struct whpx_state *whpx = &whpx_global; 
584 WHV_REGISTER_VALUE xcr0; 585 WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; 586 587 if (!whpx_has_xsave()) { 588 return; 589 } 590 591 /* Only xcr0 is supported by the hypervisor currently */ 592 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 593 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); 594 if (FAILED(hr)) { 595 error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr); 596 return; 597 } 598 599 env->xcr0 = xcr0.Reg64; 600 } 601 602 static void whpx_get_registers(CPUState *cpu) 603 { 604 struct whpx_state *whpx = &whpx_global; 605 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 606 CPUX86State *env = cpu->env_ptr; 607 X86CPU *x86_cpu = X86_CPU(cpu); 608 struct whpx_register_set vcxt; 609 uint64_t tpr, apic_base; 610 HRESULT hr; 611 int idx; 612 int idx_next; 613 int i; 614 615 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 616 617 if (!env->tsc_valid) { 618 whpx_get_tsc(cpu); 619 env->tsc_valid = !runstate_is_running(); 620 } 621 622 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 623 whpx->partition, cpu->cpu_index, 624 whpx_register_names, 625 RTL_NUMBER_OF(whpx_register_names), 626 &vcxt.values[0]); 627 if (FAILED(hr)) { 628 error_report("WHPX: Failed to get virtual processor context, hr=%08lx", 629 hr); 630 } 631 632 if (whpx_apic_in_platform()) { 633 /* 634 * Fetch the TPR value from the emulated APIC. It may get overwritten 635 * below with the value from CR8 returned by 636 * WHvGetVirtualProcessorRegisters(). 
637 */ 638 whpx_apic_get(x86_cpu->apic_state); 639 vcpu->tpr = whpx_apic_tpr_to_cr8( 640 cpu_get_apic_tpr(x86_cpu->apic_state)); 641 } 642 643 idx = 0; 644 645 /* Indexes for first 16 registers match between HV and QEMU definitions */ 646 idx_next = 16; 647 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 648 env->regs[idx] = vcxt.values[idx].Reg64; 649 } 650 idx = idx_next; 651 652 /* Same goes for RIP and RFLAGS */ 653 assert(whpx_register_names[idx] == WHvX64RegisterRip); 654 env->eip = vcxt.values[idx++].Reg64; 655 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 656 env->eflags = vcxt.values[idx++].Reg64; 657 658 /* Translate 6+4 segment registers. HV and QEMU order matches */ 659 assert(idx == WHvX64RegisterEs); 660 for (i = 0; i < 6; i += 1, idx += 1) { 661 env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment); 662 } 663 664 assert(idx == WHvX64RegisterLdtr); 665 env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment); 666 assert(idx == WHvX64RegisterTr); 667 env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment); 668 assert(idx == WHvX64RegisterIdtr); 669 env->idt.base = vcxt.values[idx].Table.Base; 670 env->idt.limit = vcxt.values[idx].Table.Limit; 671 idx += 1; 672 assert(idx == WHvX64RegisterGdtr); 673 env->gdt.base = vcxt.values[idx].Table.Base; 674 env->gdt.limit = vcxt.values[idx].Table.Limit; 675 idx += 1; 676 677 /* CR0, 2, 3, 4, 8 */ 678 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 679 env->cr[0] = vcxt.values[idx++].Reg64; 680 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 681 env->cr[2] = vcxt.values[idx++].Reg64; 682 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 683 env->cr[3] = vcxt.values[idx++].Reg64; 684 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 685 env->cr[4] = vcxt.values[idx++].Reg64; 686 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 687 tpr = vcxt.values[idx++].Reg64; 688 if (tpr != vcpu->tpr) { 689 vcpu->tpr = tpr; 690 cpu_set_apic_tpr(x86_cpu->apic_state, tpr); 691 } 692 693 /* 8 Debug 
Registers - Skipped */ 694 695 /* 696 * Extended control registers needs to be handled separately depending 697 * on whether xsave is supported/enabled or not. 698 */ 699 whpx_get_xcrs(cpu); 700 701 /* 16 XMM registers */ 702 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 703 idx_next = idx + 16; 704 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 705 env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64; 706 env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64; 707 } 708 idx = idx_next; 709 710 /* 8 FP registers */ 711 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 712 for (i = 0; i < 8; i += 1, idx += 1) { 713 env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64; 714 /* env->fpregs[i].mmx.MMX_Q(1) = 715 vcxt.values[idx].Fp.AsUINT128.High64; 716 */ 717 } 718 719 /* FP control status register */ 720 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 721 env->fpuc = vcxt.values[idx].FpControlStatus.FpControl; 722 env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7; 723 env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800; 724 for (i = 0; i < 8; ++i) { 725 env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1); 726 } 727 env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp; 728 env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip; 729 idx += 1; 730 731 /* XMM control status register */ 732 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 733 env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl; 734 idx += 1; 735 736 /* MSRs */ 737 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 738 env->efer = vcxt.values[idx++].Reg64; 739 #ifdef TARGET_X86_64 740 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 741 env->kernelgsbase = vcxt.values[idx++].Reg64; 742 #endif 743 744 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 745 apic_base = vcxt.values[idx++].Reg64; 746 if (apic_base != 
vcpu->apic_base) { 747 vcpu->apic_base = apic_base; 748 cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base); 749 } 750 751 /* WHvX64RegisterPat - Skipped */ 752 753 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 754 env->sysenter_cs = vcxt.values[idx++].Reg64; 755 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 756 env->sysenter_eip = vcxt.values[idx++].Reg64; 757 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 758 env->sysenter_esp = vcxt.values[idx++].Reg64; 759 assert(whpx_register_names[idx] == WHvX64RegisterStar); 760 env->star = vcxt.values[idx++].Reg64; 761 #ifdef TARGET_X86_64 762 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 763 env->lstar = vcxt.values[idx++].Reg64; 764 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 765 env->cstar = vcxt.values[idx++].Reg64; 766 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 767 env->fmask = vcxt.values[idx++].Reg64; 768 #endif 769 770 /* Interrupt / Event Registers - Skipped */ 771 772 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 773 774 if (whpx_apic_in_platform()) { 775 whpx_apic_get(x86_cpu->apic_state); 776 } 777 778 x86_update_hflags(env); 779 780 return; 781 } 782 783 static HRESULT CALLBACK whpx_emu_ioport_callback( 784 void *ctx, 785 WHV_EMULATOR_IO_ACCESS_INFO *IoAccess) 786 { 787 MemTxAttrs attrs = { 0 }; 788 address_space_rw(&address_space_io, IoAccess->Port, attrs, 789 &IoAccess->Data, IoAccess->AccessSize, 790 IoAccess->Direction); 791 return S_OK; 792 } 793 794 static HRESULT CALLBACK whpx_emu_mmio_callback( 795 void *ctx, 796 WHV_EMULATOR_MEMORY_ACCESS_INFO *ma) 797 { 798 cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize, 799 ma->Direction); 800 return S_OK; 801 } 802 803 static HRESULT CALLBACK whpx_emu_getreg_callback( 804 void *ctx, 805 const WHV_REGISTER_NAME *RegisterNames, 806 UINT32 RegisterCount, 807 WHV_REGISTER_VALUE *RegisterValues) 808 { 809 HRESULT hr; 810 struct whpx_state *whpx = 
&whpx_global; 811 CPUState *cpu = (CPUState *)ctx; 812 813 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 814 whpx->partition, cpu->cpu_index, 815 RegisterNames, RegisterCount, 816 RegisterValues); 817 if (FAILED(hr)) { 818 error_report("WHPX: Failed to get virtual processor registers," 819 " hr=%08lx", hr); 820 } 821 822 return hr; 823 } 824 825 static HRESULT CALLBACK whpx_emu_setreg_callback( 826 void *ctx, 827 const WHV_REGISTER_NAME *RegisterNames, 828 UINT32 RegisterCount, 829 const WHV_REGISTER_VALUE *RegisterValues) 830 { 831 HRESULT hr; 832 struct whpx_state *whpx = &whpx_global; 833 CPUState *cpu = (CPUState *)ctx; 834 835 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 836 whpx->partition, cpu->cpu_index, 837 RegisterNames, RegisterCount, 838 RegisterValues); 839 if (FAILED(hr)) { 840 error_report("WHPX: Failed to set virtual processor registers," 841 " hr=%08lx", hr); 842 } 843 844 /* 845 * The emulator just successfully wrote the register state. We clear the 846 * dirty state so we avoid the double write on resume of the VP. 
847 */ 848 cpu->vcpu_dirty = false; 849 850 return hr; 851 } 852 853 static HRESULT CALLBACK whpx_emu_translate_callback( 854 void *ctx, 855 WHV_GUEST_VIRTUAL_ADDRESS Gva, 856 WHV_TRANSLATE_GVA_FLAGS TranslateFlags, 857 WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult, 858 WHV_GUEST_PHYSICAL_ADDRESS *Gpa) 859 { 860 HRESULT hr; 861 struct whpx_state *whpx = &whpx_global; 862 CPUState *cpu = (CPUState *)ctx; 863 WHV_TRANSLATE_GVA_RESULT res; 864 865 hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index, 866 Gva, TranslateFlags, &res, Gpa); 867 if (FAILED(hr)) { 868 error_report("WHPX: Failed to translate GVA, hr=%08lx", hr); 869 } else { 870 *TranslationResult = res.ResultCode; 871 } 872 873 return hr; 874 } 875 876 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = { 877 .Size = sizeof(WHV_EMULATOR_CALLBACKS), 878 .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback, 879 .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback, 880 .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback, 881 .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback, 882 .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback, 883 }; 884 885 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx) 886 { 887 HRESULT hr; 888 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 889 WHV_EMULATOR_STATUS emu_status; 890 891 hr = whp_dispatch.WHvEmulatorTryMmioEmulation( 892 vcpu->emulator, cpu, 893 &vcpu->exit_ctx.VpContext, ctx, 894 &emu_status); 895 if (FAILED(hr)) { 896 error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr); 897 return -1; 898 } 899 900 if (!emu_status.EmulationSuccessful) { 901 error_report("WHPX: Failed to emulate MMIO access with" 902 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 903 return -1; 904 } 905 906 return 0; 907 } 908 909 static int whpx_handle_portio(CPUState *cpu, 910 WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx) 911 { 912 HRESULT hr; 913 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 914 
WHV_EMULATOR_STATUS emu_status; 915 916 hr = whp_dispatch.WHvEmulatorTryIoEmulation( 917 vcpu->emulator, cpu, 918 &vcpu->exit_ctx.VpContext, ctx, 919 &emu_status); 920 if (FAILED(hr)) { 921 error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr); 922 return -1; 923 } 924 925 if (!emu_status.EmulationSuccessful) { 926 error_report("WHPX: Failed to emulate PortIO access with" 927 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 928 return -1; 929 } 930 931 return 0; 932 } 933 934 /* 935 * Controls whether we should intercept various exceptions on the guest, 936 * namely breakpoint/single-step events. 937 * 938 * The 'exceptions' argument accepts a bitmask, e.g: 939 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...) 940 */ 941 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions) 942 { 943 struct whpx_state *whpx = &whpx_global; 944 WHV_PARTITION_PROPERTY prop = { 0, }; 945 HRESULT hr; 946 947 if (exceptions == whpx->exception_exit_bitmap) { 948 return S_OK; 949 } 950 951 prop.ExceptionExitBitmap = exceptions; 952 953 hr = whp_dispatch.WHvSetPartitionProperty( 954 whpx->partition, 955 WHvPartitionPropertyCodeExceptionExitBitmap, 956 &prop, 957 sizeof(WHV_PARTITION_PROPERTY)); 958 959 if (SUCCEEDED(hr)) { 960 whpx->exception_exit_bitmap = exceptions; 961 } 962 963 return hr; 964 } 965 966 967 /* 968 * This function is called before/after stepping over a single instruction. 969 * It will update the CPU registers to arm/disarm the instruction stepping 970 * accordingly. 971 */ 972 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu, 973 bool set, 974 uint64_t *exit_context_rflags) 975 { 976 WHV_REGISTER_NAME reg_name; 977 WHV_REGISTER_VALUE reg_value; 978 HRESULT hr; 979 struct whpx_state *whpx = &whpx_global; 980 981 /* 982 * If we are trying to step over a single instruction, we need to set the 983 * TF bit in rflags. Otherwise, clear it. 
984 */ 985 reg_name = WHvX64RegisterRflags; 986 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 987 whpx->partition, 988 cpu->cpu_index, 989 ®_name, 990 1, 991 ®_value); 992 993 if (FAILED(hr)) { 994 error_report("WHPX: Failed to get rflags, hr=%08lx", hr); 995 return hr; 996 } 997 998 if (exit_context_rflags) { 999 assert(*exit_context_rflags == reg_value.Reg64); 1000 } 1001 1002 if (set) { 1003 /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */ 1004 reg_value.Reg64 |= TF_MASK; 1005 } else { 1006 reg_value.Reg64 &= ~TF_MASK; 1007 } 1008 1009 if (exit_context_rflags) { 1010 *exit_context_rflags = reg_value.Reg64; 1011 } 1012 1013 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1014 whpx->partition, 1015 cpu->cpu_index, 1016 ®_name, 1017 1, 1018 ®_value); 1019 1020 if (FAILED(hr)) { 1021 error_report("WHPX: Failed to set rflags," 1022 " hr=%08lx", 1023 hr); 1024 return hr; 1025 } 1026 1027 reg_name = WHvRegisterInterruptState; 1028 reg_value.Reg64 = 0; 1029 1030 /* Suspend delivery of hardware interrupts during single-stepping. */ 1031 reg_value.InterruptState.InterruptShadow = set != 0; 1032 1033 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1034 whpx->partition, 1035 cpu->cpu_index, 1036 ®_name, 1037 1, 1038 ®_value); 1039 1040 if (FAILED(hr)) { 1041 error_report("WHPX: Failed to set InterruptState," 1042 " hr=%08lx", 1043 hr); 1044 return hr; 1045 } 1046 1047 if (!set) { 1048 /* 1049 * We have just finished stepping over a single instruction, 1050 * and intercepted the INT1 generated by it. 1051 * We need to now hide the INT1 from the guest, 1052 * as it would not be expecting it. 
1053 */ 1054 1055 reg_name = WHvX64RegisterPendingDebugException; 1056 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1057 whpx->partition, 1058 cpu->cpu_index, 1059 ®_name, 1060 1, 1061 ®_value); 1062 1063 if (FAILED(hr)) { 1064 error_report("WHPX: Failed to get pending debug exceptions," 1065 "hr=%08lx", hr); 1066 return hr; 1067 } 1068 1069 if (reg_value.PendingDebugException.SingleStep) { 1070 reg_value.PendingDebugException.SingleStep = 0; 1071 1072 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1073 whpx->partition, 1074 cpu->cpu_index, 1075 ®_name, 1076 1, 1077 ®_value); 1078 1079 if (FAILED(hr)) { 1080 error_report("WHPX: Failed to clear pending debug exceptions," 1081 "hr=%08lx", hr); 1082 return hr; 1083 } 1084 } 1085 1086 } 1087 1088 return S_OK; 1089 } 1090 1091 /* Tries to find a breakpoint at the specified address. */ 1092 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address) 1093 { 1094 struct whpx_state *whpx = &whpx_global; 1095 int i; 1096 1097 if (whpx->breakpoints.breakpoints) { 1098 for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) { 1099 if (address == whpx->breakpoints.breakpoints->data[i].address) { 1100 return &whpx->breakpoints.breakpoints->data[i]; 1101 } 1102 } 1103 } 1104 1105 return NULL; 1106 } 1107 1108 /* 1109 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for 1110 * debugging user-mode applications. Since the WHPX API does not offer 1111 * an easy way to pass the intercepted exception back to the guest, we 1112 * resort to using INT1 instead, and let the guest always handle INT3. 1113 */ 1114 static const uint8_t whpx_breakpoint_instruction = 0xF1; 1115 1116 /* 1117 * The WHPX QEMU backend implements breakpoints by writing the INT1 1118 * instruction into memory (ignoring the DRx registers). This raises a few 1119 * issues that need to be carefully handled: 1120 * 1121 * 1. 
 * Although unlikely, other parts of QEMU may set multiple breakpoints
 * at the same location, and later remove them in arbitrary order.
 * This should not cause memory corruption, and should only remove the
 * physical breakpoint instruction when the last QEMU breakpoint is gone.
 *
 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
 * physical location. Hence, physically adding/removing a breakpoint can
 * theoretically fail at any time. We need to keep track of it.
 *
 * The function below rebuilds a list of low-level breakpoints (one per
 * address, tracking the original instruction and any errors) from the list of
 * high-level breakpoints (set via cpu_breakpoint_insert()).
 *
 * In order to optimize performance, this function stores the list of
 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
 * low-level ones, so that it won't be re-invoked until these breakpoints
 * change.
 *
 * Note that this function decides which breakpoints should be inserted into,
 * memory, but doesn't actually do it. The memory accessing is done in
 * whpx_apply_breakpoints().
 */
static void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *bp;
    int cpu_bp_index = 0;

    /* Snapshot of the CPU breakpoint addresses, used by
     * whpx_first_vcpu_starting() to detect later changes. */
    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);

    breakpoints->original_address_count = cpu_breakpoint_count;

    /* Upper bound: every CPU breakpoint plus every surviving old entry. */
    int max_breakpoints = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    /* Collection header plus a flexible array of breakpoint entries. */
    struct whpx_breakpoint_collection *new_breakpoints =
        (struct whpx_breakpoint_collection *)g_malloc0(
        sizeof(struct whpx_breakpoint_collection) +
            max_breakpoints * sizeof(struct whpx_breakpoint));

    new_breakpoints->allocated = max_breakpoints;
    new_breakpoints->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        int i;
        for (i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                new_breakpoints->data[new_breakpoints->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        int i;
        bool found = false;

        /* This will be used to detect changed CPU breakpoints later. */
        breakpoints->original_addresses[cpu_bp_index++] = bp->pc;

        for (i = 0; i < new_breakpoints->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */

            if (new_breakpoints->data[i].address == bp->pc) {
                /* There was already a breakpoint at this address. */
                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    new_breakpoints->data[i].state = WHPX_BP_SET;
                } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
                    new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
                }

                found = true;
                break;
            }
        }

        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
            new_breakpoints->data[new_breakpoints->used].state =
                WHPX_BP_SET_PENDING;
            new_breakpoints->used++;
        }
    }

    if (breakpoints->breakpoints) {
        /*
         * Free the previous breakpoint list. This can be optimized by keeping
         * it as shadow buffer for the next computation instead of freeing
         * it immediately.
         */
        g_free(breakpoints->breakpoints);
    }

    breakpoints->breakpoints = new_breakpoints;
}

/*
 * Physically inserts/removes the breakpoints by reading and writing the
 * physical memory, keeping a track of the failed attempts.
 *
 * Passing resuming=true will try to set all previously unset breakpoints.
 * Passing resuming=false will remove all inserted ones.
 */
static void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int i, rc;
    if (!breakpoints) {
        return;
    }

    for (i = 0; i < breakpoints->used; i++) {
        /* Decide what to do right now based on the last known state. */
        WhpxBreakpointState state = breakpoints->data[i].state;
        switch (state) {
        case WHPX_BP_CLEARED:
            if (resuming) {
                state = WHPX_BP_SET_PENDING;
            }
            break;
        case WHPX_BP_SET_PENDING:
            if (!resuming) {
                state = WHPX_BP_CLEARED;
            }
            break;
        case WHPX_BP_SET:
            if (!resuming) {
                state = WHPX_BP_CLEAR_PENDING;
            }
            break;
        case WHPX_BP_CLEAR_PENDING:
            if (resuming) {
                state = WHPX_BP_SET;
            }
            break;
        }

        if (state == WHPX_BP_SET_PENDING) {
            /* Remember the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                false);

            if (!rc) {
                /* Write the breakpoint instruction. */
                rc = cpu_memory_rw_debug(cpu,
                    breakpoints->data[i].address,
                    (void *)&whpx_breakpoint_instruction,
                    1,
                    true);
            }

            /* On any failure the entry stays SET_PENDING and will be
             * retried the next time the VCPUs are resumed. */
            if (!rc) {
                state = WHPX_BP_SET;
            }

        }

        if (state == WHPX_BP_CLEAR_PENDING) {
            /* Restore the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                true);

            if (!rc) {
                state = WHPX_BP_CLEARED;
            }
        }

        breakpoints->data[i].state = state;
    }
}

/*
 * This function is called when a VCPU is about to start and no other
 * VCPUs have been started so far. Since the VCPU start order could be
 * arbitrary, it doesn't have to be VCPU#0.
 *
 * It is used to commit the breakpoints into memory, and configure WHPX
 * to intercept debug exceptions.
 *
 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
 * more VCPUs are already running, so this is the best place to do it.
1326 */ 1327 static int whpx_first_vcpu_starting(CPUState *cpu) 1328 { 1329 struct whpx_state *whpx = &whpx_global; 1330 HRESULT hr; 1331 1332 g_assert(qemu_mutex_iothread_locked()); 1333 1334 if (!QTAILQ_EMPTY(&cpu->breakpoints) || 1335 (whpx->breakpoints.breakpoints && 1336 whpx->breakpoints.breakpoints->used)) { 1337 CPUBreakpoint *bp; 1338 int i = 0; 1339 bool update_pending = false; 1340 1341 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1342 if (i >= whpx->breakpoints.original_address_count || 1343 bp->pc != whpx->breakpoints.original_addresses[i]) { 1344 update_pending = true; 1345 } 1346 1347 i++; 1348 } 1349 1350 if (i != whpx->breakpoints.original_address_count) { 1351 update_pending = true; 1352 } 1353 1354 if (update_pending) { 1355 /* 1356 * The CPU breakpoints have changed since the last call to 1357 * whpx_translate_cpu_breakpoints(). WHPX breakpoints must 1358 * now be recomputed. 1359 */ 1360 whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i); 1361 } 1362 1363 /* Actually insert the breakpoints into the memory. */ 1364 whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true); 1365 } 1366 1367 uint64_t exception_mask; 1368 if (whpx->step_pending || 1369 (whpx->breakpoints.breakpoints && 1370 whpx->breakpoints.breakpoints->used)) { 1371 /* 1372 * We are either attempting to single-step one or more CPUs, or 1373 * have one or more breakpoints enabled. Both require intercepting 1374 * the WHvX64ExceptionTypeBreakpointTrap exception. 1375 */ 1376 1377 exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault; 1378 } else { 1379 /* Let the guest handle all exceptions. */ 1380 exception_mask = 0; 1381 } 1382 1383 hr = whpx_set_exception_exit_bitmap(exception_mask); 1384 if (!SUCCEEDED(hr)) { 1385 error_report("WHPX: Failed to update exception exit mask," 1386 "hr=%08lx.", hr); 1387 return 1; 1388 } 1389 1390 return 0; 1391 } 1392 1393 /* 1394 * This function is called when the last VCPU has finished running. 
1395 * It is used to remove any previously set breakpoints from memory. 1396 */ 1397 static int whpx_last_vcpu_stopping(CPUState *cpu) 1398 { 1399 whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false); 1400 return 0; 1401 } 1402 1403 /* Returns the address of the next instruction that is about to be executed. */ 1404 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid) 1405 { 1406 if (cpu->vcpu_dirty) { 1407 /* The CPU registers have been modified by other parts of QEMU. */ 1408 CPUArchState *env = (CPUArchState *)(cpu->env_ptr); 1409 return env->eip; 1410 } else if (exit_context_valid) { 1411 /* 1412 * The CPU registers have not been modified by neither other parts 1413 * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters(). 1414 * This is the most common case. 1415 */ 1416 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1417 return vcpu->exit_ctx.VpContext.Rip; 1418 } else { 1419 /* 1420 * The CPU registers have been modified by a call to 1421 * WHvSetVirtualProcessorRegisters() and must be re-queried from 1422 * the target. 
1423 */ 1424 WHV_REGISTER_VALUE reg_value; 1425 WHV_REGISTER_NAME reg_name = WHvX64RegisterRip; 1426 HRESULT hr; 1427 struct whpx_state *whpx = &whpx_global; 1428 1429 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1430 whpx->partition, 1431 cpu->cpu_index, 1432 ®_name, 1433 1, 1434 ®_value); 1435 1436 if (FAILED(hr)) { 1437 error_report("WHPX: Failed to get PC, hr=%08lx", hr); 1438 return 0; 1439 } 1440 1441 return reg_value.Reg64; 1442 } 1443 } 1444 1445 static int whpx_handle_halt(CPUState *cpu) 1446 { 1447 CPUX86State *env = cpu->env_ptr; 1448 int ret = 0; 1449 1450 qemu_mutex_lock_iothread(); 1451 if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1452 (env->eflags & IF_MASK)) && 1453 !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1454 cpu->exception_index = EXCP_HLT; 1455 cpu->halted = true; 1456 ret = 1; 1457 } 1458 qemu_mutex_unlock_iothread(); 1459 1460 return ret; 1461 } 1462 1463 static void whpx_vcpu_pre_run(CPUState *cpu) 1464 { 1465 HRESULT hr; 1466 struct whpx_state *whpx = &whpx_global; 1467 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1468 CPUX86State *env = cpu->env_ptr; 1469 X86CPU *x86_cpu = X86_CPU(cpu); 1470 int irq; 1471 uint8_t tpr; 1472 WHV_X64_PENDING_INTERRUPTION_REGISTER new_int; 1473 UINT32 reg_count = 0; 1474 WHV_REGISTER_VALUE reg_values[3]; 1475 WHV_REGISTER_NAME reg_names[3]; 1476 1477 memset(&new_int, 0, sizeof(new_int)); 1478 memset(reg_values, 0, sizeof(reg_values)); 1479 1480 qemu_mutex_lock_iothread(); 1481 1482 /* Inject NMI */ 1483 if (!vcpu->interruption_pending && 1484 cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { 1485 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { 1486 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 1487 vcpu->interruptable = false; 1488 new_int.InterruptionType = WHvX64PendingNmi; 1489 new_int.InterruptionPending = 1; 1490 new_int.InterruptionVector = 2; 1491 } 1492 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 1493 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 1494 } 
1495 } 1496 1497 /* 1498 * Force the VCPU out of its inner loop to process any INIT requests or 1499 * commit pending TPR access. 1500 */ 1501 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { 1502 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1503 !(env->hflags & HF_SMM_MASK)) { 1504 cpu->exit_request = 1; 1505 } 1506 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1507 cpu->exit_request = 1; 1508 } 1509 } 1510 1511 /* Get pending hard interruption or replay one that was overwritten */ 1512 if (!whpx_apic_in_platform()) { 1513 if (!vcpu->interruption_pending && 1514 vcpu->interruptable && (env->eflags & IF_MASK)) { 1515 assert(!new_int.InterruptionPending); 1516 if (cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1517 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1518 irq = cpu_get_pic_interrupt(env); 1519 if (irq >= 0) { 1520 new_int.InterruptionType = WHvX64PendingInterrupt; 1521 new_int.InterruptionPending = 1; 1522 new_int.InterruptionVector = irq; 1523 } 1524 } 1525 } 1526 1527 /* Setup interrupt state if new one was prepared */ 1528 if (new_int.InterruptionPending) { 1529 reg_values[reg_count].PendingInterruption = new_int; 1530 reg_names[reg_count] = WHvRegisterPendingInterruption; 1531 reg_count += 1; 1532 } 1533 } else if (vcpu->ready_for_pic_interrupt && 1534 (cpu->interrupt_request & CPU_INTERRUPT_HARD)) { 1535 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1536 irq = cpu_get_pic_interrupt(env); 1537 if (irq >= 0) { 1538 reg_names[reg_count] = WHvRegisterPendingEvent; 1539 reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT) 1540 { 1541 .EventPending = 1, 1542 .EventType = WHvX64PendingEventExtInt, 1543 .Vector = irq, 1544 }; 1545 reg_count += 1; 1546 } 1547 } 1548 1549 /* Sync the TPR to the CR8 if was modified during the intercept */ 1550 tpr = cpu_get_apic_tpr(x86_cpu->apic_state); 1551 if (tpr != vcpu->tpr) { 1552 vcpu->tpr = tpr; 1553 reg_values[reg_count].Reg64 = tpr; 1554 cpu->exit_request = 1; 1555 
reg_names[reg_count] = WHvX64RegisterCr8; 1556 reg_count += 1; 1557 } 1558 1559 /* Update the state of the interrupt delivery notification */ 1560 if (!vcpu->window_registered && 1561 cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1562 reg_values[reg_count].DeliverabilityNotifications = 1563 (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) { 1564 .InterruptNotification = 1 1565 }; 1566 vcpu->window_registered = 1; 1567 reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications; 1568 reg_count += 1; 1569 } 1570 1571 qemu_mutex_unlock_iothread(); 1572 vcpu->ready_for_pic_interrupt = false; 1573 1574 if (reg_count) { 1575 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1576 whpx->partition, cpu->cpu_index, 1577 reg_names, reg_count, reg_values); 1578 if (FAILED(hr)) { 1579 error_report("WHPX: Failed to set interrupt state registers," 1580 " hr=%08lx", hr); 1581 } 1582 } 1583 1584 return; 1585 } 1586 1587 static void whpx_vcpu_post_run(CPUState *cpu) 1588 { 1589 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1590 CPUX86State *env = cpu->env_ptr; 1591 X86CPU *x86_cpu = X86_CPU(cpu); 1592 1593 env->eflags = vcpu->exit_ctx.VpContext.Rflags; 1594 1595 uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8; 1596 if (vcpu->tpr != tpr) { 1597 vcpu->tpr = tpr; 1598 qemu_mutex_lock_iothread(); 1599 cpu_set_apic_tpr(x86_cpu->apic_state, vcpu->tpr); 1600 qemu_mutex_unlock_iothread(); 1601 } 1602 1603 vcpu->interruption_pending = 1604 vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending; 1605 1606 vcpu->interruptable = 1607 !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow; 1608 1609 return; 1610 } 1611 1612 static void whpx_vcpu_process_async_events(CPUState *cpu) 1613 { 1614 CPUX86State *env = cpu->env_ptr; 1615 X86CPU *x86_cpu = X86_CPU(cpu); 1616 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1617 1618 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1619 !(env->hflags & HF_SMM_MASK)) { 1620 whpx_cpu_synchronize_state(cpu); 1621 do_cpu_init(x86_cpu); 1622 
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    /* A deliverable interrupt or an NMI wakes a halted vcpu. */
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }

    return;
}

/*
 * Runs the vcpu until an event that must be handled by QEMU occurs,
 * dispatching each WHPX exit reason to the appropriate handler. The caller
 * must hold the iothread lock; it is dropped while the guest runs and
 * re-taken before returning. Returns non-zero only on fatal errors
 * (see the final 'return ret < 0').
 */
static int whpx_vcpu_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    struct whpx_breakpoint *stepped_over_bp = NULL;
    WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
    int ret;

    g_assert(qemu_mutex_iothread_locked());

    if (whpx->running_cpus++ == 0) {
        /* Insert breakpoints into memory, update exception exit bitmap. */
        ret = whpx_first_vcpu_starting(cpu);
        if (ret != 0) {
            return ret;
        }
    }

    if (whpx->breakpoints.breakpoints &&
        whpx->breakpoints.breakpoints->used > 0)
    {
        uint64_t pc = whpx_vcpu_get_pc(cpu, true);
        stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
        if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
            stepped_over_bp = NULL;
        }

        if (stepped_over_bp) {
            /*
             * We are trying to run the instruction overwritten by an active
             * breakpoint. We will temporarily disable the breakpoint, suspend
             * other CPUs, and step over the instruction.
             */
            exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
        }
    }

    if (exclusive_step_mode == WHPX_STEP_NONE) {
        whpx_vcpu_process_async_events(cpu);
        if (cpu->halted && !whpx_apic_in_platform()) {
            cpu->exception_index = EXCP_HLT;
            qatomic_set(&cpu->exit_request, false);
            return 0;
        }
    }

    qemu_mutex_unlock_iothread();

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        start_exclusive();
        g_assert(cpu == current_cpu);
        g_assert(!cpu->running);
        cpu->running = true;

        hr = whpx_set_exception_exit_bitmap(
            1UL << WHvX64ExceptionTypeDebugTrapOrFault);
        if (!SUCCEEDED(hr)) {
            error_report("WHPX: Failed to update exception exit mask, "
                         "hr=%08lx.", hr);
            /*
             * NOTE(review): this early return leaves the exclusive section
             * active and the iothread lock released. The only caller,
             * whpx_vcpu_exec(), treats a non-zero return as fatal and
             * aborts, so no cleanup happens here -- confirm intentional.
             */
            return 1;
        }

        if (stepped_over_bp) {
            /* Temporarily disable the triggered breakpoint. */
            cpu_memory_rw_debug(cpu,
                stepped_over_bp->address,
                &stepped_over_bp->original_instruction,
                1,
                true);
        }
    } else {
        cpu_exec_start(cpu);
    }

    do {
        if (cpu->vcpu_dirty) {
            whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        if (exclusive_step_mode == WHPX_STEP_NONE) {
            whpx_vcpu_pre_run(cpu);

            if (qatomic_read(&cpu->exit_request)) {
                whpx_vcpu_kick(cpu);
            }
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu, true, NULL);
        }

        hr = whp_dispatch.WHvRunVirtualProcessor(
            whpx->partition, cpu->cpu_index,
            &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to exec a virtual processor,"
                         " hr=%08lx", hr);
            ret = -1;
            break;
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu,
                false,
                &vcpu->exit_ctx.VpContext.Rflags);
        }

        whpx_vcpu_post_run(cpu);

        switch (vcpu->exit_ctx.ExitReason) {
        case WHvRunVpExitReasonMemoryAccess:
            ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
            break;

        case WHvRunVpExitReasonX64IoPortAccess:
            ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
            break;

        case WHvRunVpExitReasonX64InterruptWindow:
            vcpu->ready_for_pic_interrupt = 1;
            vcpu->window_registered = 0;
            ret = 0;
            break;

        case WHvRunVpExitReasonX64ApicEoi:
            assert(whpx_apic_in_platform());
            ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
            break;

        case WHvRunVpExitReasonX64Halt:
            /*
             * WARNING: as of build 19043.1526 (21H1), this exit reason is no
             * longer used.
             */
            ret = whpx_handle_halt(cpu);
            break;

        case WHvRunVpExitReasonX64ApicInitSipiTrap: {
            WHV_INTERRUPT_CONTROL ipi = {0};
            uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
            uint32_t delivery_mode =
                (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
            int dest_shorthand =
                (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
            bool broadcast = false;
            bool include_self = false;
            uint32_t i;

            /* We only registered for INIT and SIPI exits. */
            if ((delivery_mode != APIC_DM_INIT) &&
                (delivery_mode != APIC_DM_SIPI)) {
                error_report(
                    "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
                break;
            }

            if (delivery_mode == APIC_DM_INIT) {
                ipi.Type = WHvX64InterruptTypeInit;
            } else {
                ipi.Type = WHvX64InterruptTypeSipi;
            }

            ipi.DestinationMode =
                ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
                    WHvX64InterruptDestinationModeLogical :
                    WHvX64InterruptDestinationModePhysical;

            ipi.TriggerMode =
                ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
                    WHvX64InterruptTriggerModeLevel :
                    WHvX64InterruptTriggerModeEdge;

            ipi.Vector = icr & APIC_VECTOR_MASK;
            switch (dest_shorthand) {
            /* no shorthand. Bits 56-63 contain the destination. */
            case 0:
                ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report("WHPX: Failed to request interrupt hr=%08lx",
                        hr);
                }

                break;

            /* self */
            case 1:
                include_self = true;
                break;

            /* broadcast, including self */
            case 2:
                broadcast = true;
                include_self = true;
                break;

            /* broadcast, excluding self */
            case 3:
                broadcast = true;
                break;
            }

            if (!broadcast && !include_self) {
                break;
            }

            for (i = 0; i <= max_vcpu_index; i++) {
                if (i == cpu->cpu_index && !include_self) {
                    continue;
                }

                /*
                 * Assuming that APIC Ids are identity mapped since
                 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
                 * are not handled yet and the hypervisor doesn't allow the
                 * guest to modify the APIC ID.
                 */
                ipi.Destination = i;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report(
                        "WHPX: Failed to request SIPI for %d, hr=%08lx",
                        i, hr);
                }
            }

            break;
        }

        case WHvRunVpExitReasonCanceled:
            if (exclusive_step_mode != WHPX_STEP_NONE) {
                /*
                 * We are trying to step over a single instruction, and
                 * likely got a request to stop from another thread.
                 * Delay it until we are done stepping
                 * over.
                 */
                ret = 0;
            } else {
                cpu->exception_index = EXCP_INTERRUPT;
                ret = 1;
            }
            break;
        case WHvRunVpExitReasonX64MsrAccess: {
            WHV_REGISTER_VALUE reg_values[3] = {0};
            WHV_REGISTER_NAME reg_names[3];
            UINT32 reg_count;

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRdx;

            /* Skip the MSR instruction itself. */
            reg_values[0].Reg64 =
                vcpu->exit_ctx.VpContext.Rip +
                vcpu->exit_ctx.VpContext.InstructionLength;

            /*
             * For all unsupported MSR access we:
             *     ignore writes
             *     return 0 on read.
             */
            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
                        1 : 3;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                reg_names, reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set MsrAccess state "
                             " registers, hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonX64Cpuid: {
            WHV_REGISTER_VALUE reg_values[5];
            WHV_REGISTER_NAME reg_names[5];
            UINT32 reg_count = 5;
            UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
            X86CPU *x86_cpu = X86_CPU(cpu);
            CPUX86State *env = &x86_cpu->env;

            memset(reg_values, 0, sizeof(reg_values));

            rip = vcpu->exit_ctx.VpContext.Rip +
                  vcpu->exit_ctx.VpContext.InstructionLength;
            cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;

            /*
             * Ideally, these should be supplied to the hypervisor during VCPU
             * initialization and it should be able to satisfy this request.
             * But, currently, WHPX doesn't support setting CPUID values in the
             * hypervisor once the partition has been setup, which is too late
             * since VCPUs are realized later. For now, use the values from
             * QEMU to satisfy these requests, until WHPX adds support for
             * being able to set these values in the hypervisor at runtime.
             */
            cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
                (UINT32 *)&rcx, (UINT32 *)&rdx);
            switch (cpuid_fn) {
            case 0x40000000:
                /* Expose the vmware cpu frequency cpuid leaf */
                rax = 0x40000010;
                rbx = rcx = rdx = 0;
                break;

            case 0x40000010:
                rax = env->tsc_khz;
                rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
                rcx = rdx = 0;
                break;

            case 0x80000001:
                /* Remove any support of OSVW */
                rcx &= ~CPUID_EXT3_OSVW;
                break;
            }

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRcx;
            reg_names[3] = WHvX64RegisterRdx;
            reg_names[4] = WHvX64RegisterRbx;

            reg_values[0].Reg64 = rip;
            reg_values[1].Reg64 = rax;
            reg_values[2].Reg64 = rcx;
            reg_values[3].Reg64 = rdx;
            reg_values[4].Reg64 = rbx;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition, cpu->cpu_index,
                reg_names,
                reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set CpuidAccess state registers,"
                             " hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonException:
            whpx_get_registers(cpu);

            if ((vcpu->exit_ctx.VpException.ExceptionType ==
                 WHvX64ExceptionTypeDebugTrapOrFault) &&
                (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
                (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
                 whpx_breakpoint_instruction)) {
                /* Stopped at a software breakpoint. */
                cpu->exception_index = EXCP_DEBUG;
            } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
                        WHvX64ExceptionTypeDebugTrapOrFault) &&
                       !cpu->singlestep_enabled) {
                /*
                 * Just finished stepping over a breakpoint, but the
                 * gdb does not expect us to do single-stepping.
                 * Don't do anything special.
                 */
                cpu->exception_index = EXCP_INTERRUPT;
            } else {
                /* Another exception or debug event. Report it to GDB. */
                cpu->exception_index = EXCP_DEBUG;
            }

            ret = 1;
            break;
        case WHvRunVpExitReasonNone:
        case WHvRunVpExitReasonUnrecoverableException:
        case WHvRunVpExitReasonInvalidVpRegisterValue:
        case WHvRunVpExitReasonUnsupportedFeature:
        default:
            error_report("WHPX: Unexpected VP exit code %d",
                         vcpu->exit_ctx.ExitReason);
            whpx_get_registers(cpu);
            qemu_mutex_lock_iothread();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            qemu_mutex_unlock_iothread();
            break;
        }

    } while (!ret);

    if (stepped_over_bp) {
        /* Restore the breakpoint we stepped over */
        cpu_memory_rw_debug(cpu,
            stepped_over_bp->address,
            (void *)&whpx_breakpoint_instruction,
            1,
            true);
    }

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        g_assert(cpu_in_exclusive_context(cpu));
        cpu->running = false;
        end_exclusive();

        exclusive_step_mode = WHPX_STEP_NONE;
    } else {
        cpu_exec_end(cpu);
    }

    qemu_mutex_lock_iothread();
    current_cpu = cpu;

    if (--whpx->running_cpus == 0) {
        whpx_last_vcpu_stopping(cpu);
    }

    qatomic_set(&cpu->exit_request, false);

    return ret < 0;
}

/* run_on_cpu callback: pull registers from WHPX and mark them dirty. */
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (!cpu->vcpu_dirty) {
        whpx_get_registers(cpu);
        cpu->vcpu_dirty = true;
    }
}

/* run_on_cpu callback: push the reset state back to WHPX. */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->vcpu_dirty = false;
}

/* run_on_cpu callback: push the full initial state back to WHPX. */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->vcpu_dirty = false;
}

static void
do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu, 2098 run_on_cpu_data arg) 2099 { 2100 cpu->vcpu_dirty = true; 2101 } 2102 2103 /* 2104 * CPU support. 2105 */ 2106 2107 void whpx_cpu_synchronize_state(CPUState *cpu) 2108 { 2109 if (!cpu->vcpu_dirty) { 2110 run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL); 2111 } 2112 } 2113 2114 void whpx_cpu_synchronize_post_reset(CPUState *cpu) 2115 { 2116 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2117 } 2118 2119 void whpx_cpu_synchronize_post_init(CPUState *cpu) 2120 { 2121 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2122 } 2123 2124 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu) 2125 { 2126 run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2127 } 2128 2129 void whpx_cpu_synchronize_pre_resume(bool step_pending) 2130 { 2131 whpx_global.step_pending = step_pending; 2132 } 2133 2134 /* 2135 * Vcpu support. 2136 */ 2137 2138 static Error *whpx_migration_blocker; 2139 2140 static void whpx_cpu_update_state(void *opaque, bool running, RunState state) 2141 { 2142 CPUX86State *env = opaque; 2143 2144 if (running) { 2145 env->tsc_valid = false; 2146 } 2147 } 2148 2149 int whpx_init_vcpu(CPUState *cpu) 2150 { 2151 HRESULT hr; 2152 struct whpx_state *whpx = &whpx_global; 2153 struct whpx_vcpu *vcpu = NULL; 2154 Error *local_error = NULL; 2155 CPUX86State *env = cpu->env_ptr; 2156 X86CPU *x86_cpu = X86_CPU(cpu); 2157 UINT64 freq = 0; 2158 int ret; 2159 2160 /* Add migration blockers for all unsupported features of the 2161 * Windows Hypervisor Platform 2162 */ 2163 if (whpx_migration_blocker == NULL) { 2164 error_setg(&whpx_migration_blocker, 2165 "State blocked due to non-migratable CPUID feature support," 2166 "dirty memory tracking support, and XSAVE/XRSTOR support"); 2167 2168 if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) { 2169 error_report_err(local_error); 2170 error_free(whpx_migration_blocker); 2171 ret = -EINVAL; 
2172 goto error; 2173 } 2174 } 2175 2176 vcpu = g_new0(struct whpx_vcpu, 1); 2177 2178 if (!vcpu) { 2179 error_report("WHPX: Failed to allocte VCPU context."); 2180 ret = -ENOMEM; 2181 goto error; 2182 } 2183 2184 hr = whp_dispatch.WHvEmulatorCreateEmulator( 2185 &whpx_emu_callbacks, 2186 &vcpu->emulator); 2187 if (FAILED(hr)) { 2188 error_report("WHPX: Failed to setup instruction completion support," 2189 " hr=%08lx", hr); 2190 ret = -EINVAL; 2191 goto error; 2192 } 2193 2194 hr = whp_dispatch.WHvCreateVirtualProcessor( 2195 whpx->partition, cpu->cpu_index, 0); 2196 if (FAILED(hr)) { 2197 error_report("WHPX: Failed to create a virtual processor," 2198 " hr=%08lx", hr); 2199 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2200 ret = -EINVAL; 2201 goto error; 2202 } 2203 2204 /* 2205 * vcpu's TSC frequency is either specified by user, or use the value 2206 * provided by Hyper-V if the former is not present. In the latter case, we 2207 * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC 2208 * frequency can be migrated later via this field. 2209 */ 2210 if (!env->tsc_khz) { 2211 hr = whp_dispatch.WHvGetCapability( 2212 WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq), 2213 NULL); 2214 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2215 if (FAILED(hr)) { 2216 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr); 2217 } else { 2218 env->tsc_khz = freq / 1000; /* Hz to KHz */ 2219 } 2220 } 2221 } 2222 2223 env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY; 2224 hr = whp_dispatch.WHvGetCapability( 2225 WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL); 2226 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2227 if (FAILED(hr)) { 2228 printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr); 2229 } else { 2230 env->apic_bus_freq = freq; 2231 } 2232 } 2233 2234 /* 2235 * If the vmware cpuid frequency leaf option is set, and we have a valid 2236 * tsc value, trap the corresponding cpuid's. 
2237 */ 2238 if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) { 2239 UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010}; 2240 2241 hr = whp_dispatch.WHvSetPartitionProperty( 2242 whpx->partition, 2243 WHvPartitionPropertyCodeCpuidExitList, 2244 cpuidExitList, 2245 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2246 2247 if (FAILED(hr)) { 2248 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2249 hr); 2250 ret = -EINVAL; 2251 goto error; 2252 } 2253 } 2254 2255 vcpu->interruptable = true; 2256 cpu->vcpu_dirty = true; 2257 cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu; 2258 max_vcpu_index = max(max_vcpu_index, cpu->cpu_index); 2259 qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr); 2260 2261 return 0; 2262 2263 error: 2264 g_free(vcpu); 2265 2266 return ret; 2267 } 2268 2269 int whpx_vcpu_exec(CPUState *cpu) 2270 { 2271 int ret; 2272 int fatal; 2273 2274 for (;;) { 2275 if (cpu->exception_index >= EXCP_INTERRUPT) { 2276 ret = cpu->exception_index; 2277 cpu->exception_index = -1; 2278 break; 2279 } 2280 2281 fatal = whpx_vcpu_run(cpu); 2282 2283 if (fatal) { 2284 error_report("WHPX: Failed to exec a virtual processor"); 2285 abort(); 2286 } 2287 } 2288 2289 return ret; 2290 } 2291 2292 void whpx_destroy_vcpu(CPUState *cpu) 2293 { 2294 struct whpx_state *whpx = &whpx_global; 2295 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 2296 2297 whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index); 2298 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2299 g_free(cpu->hax_vcpu); 2300 return; 2301 } 2302 2303 void whpx_vcpu_kick(CPUState *cpu) 2304 { 2305 struct whpx_state *whpx = &whpx_global; 2306 whp_dispatch.WHvCancelRunVirtualProcessor( 2307 whpx->partition, cpu->cpu_index, 0); 2308 } 2309 2310 /* 2311 * Memory support. 
 */

/*
 * Map (add != 0) or unmap a guest-physical range in the WHPX partition.
 * ROM ranges are mapped read/execute only (no WHvMapGpaRangeFlagWrite).
 * Failures are reported but not propagated to the caller.
 */
static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    /*
    if (add) {
        printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
               (void*)start_pa, (void*)size, host_va,
               (rom ? "ROM" : "RAM"), name);
    } else {
        printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n",
               (void*)start_pa, (void*)size, host_va, name);
    }
    */

    if (add) {
        hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
                                         host_va,
                                         start_pa,
                                         size,
                                         (WHvMapGpaRangeFlagRead |
                                          WHvMapGpaRangeFlagExecute |
                                          (rom ? 0 : WHvMapGpaRangeFlagWrite)));
    } else {
        hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
                                           start_pa,
                                           size);
    }

    if (FAILED(hr)) {
        error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                     " Host:%p, hr=%08lx",
                     (add ? "MAP" : "UNMAP"), name,
                     (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
    }
}

/*
 * Translate a memory-region section into a host-page-aligned GPA mapping
 * and apply it. Non-RAM sections and sections smaller than the alignment
 * slack are ignored.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int delta;
    uint64_t host_va;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    /* Bytes needed to round start_pa up to the next host-page boundary. */
    delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
    delta &= ~qemu_real_host_page_mask();
    if (delta > size) {
        return;
    }
    start_pa += delta;
    size -= delta;
    /* Truncate the tail so the length is a whole number of host pages. */
    size &= qemu_real_host_page_mask();
    if (!size || (start_pa & ~qemu_real_host_page_mask())) {
        return;
    }

    /* Host address advances by the same alignment slack as the GPA. */
    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + delta;

    whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}

static void whpx_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}

static void whpx_region_del(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}

/* No batching of mapping updates; transactions are no-ops. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}

static void whpx_transaction_commit(MemoryListener *listener)
{
}

/*
 * No dirty-page tracking is available here (see the migration blocker in
 * whpx_init_vcpu), so conservatively mark the entire RAM section dirty.
 */
static void whpx_log_sync(MemoryListener *listener,
                          MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    memory_region_set_dirty(mr, 0, int128_get64(section->size));
}

static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = 10,
};

/* Hook the listener into the system memory address space. */
static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}

/*
 * Load the functions from the given library, using the given handle. If a
 * handle is provided, it is used, otherwise the library is opened. The
 * handle will be updated on return with the opened one.
2438 */ 2439 static bool load_whp_dispatch_fns(HMODULE *handle, 2440 WHPFunctionList function_list) 2441 { 2442 HMODULE hLib = *handle; 2443 2444 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll" 2445 #define WINHV_EMULATION_DLL "WinHvEmulation.dll" 2446 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \ 2447 whp_dispatch.function_name = \ 2448 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2449 2450 #define WHP_LOAD_FIELD(return_type, function_name, signature) \ 2451 whp_dispatch.function_name = \ 2452 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2453 if (!whp_dispatch.function_name) { \ 2454 error_report("Could not load function %s", #function_name); \ 2455 goto error; \ 2456 } \ 2457 2458 #define WHP_LOAD_LIB(lib_name, handle_lib) \ 2459 if (!handle_lib) { \ 2460 handle_lib = LoadLibrary(lib_name); \ 2461 if (!handle_lib) { \ 2462 error_report("Could not load library %s.", lib_name); \ 2463 goto error; \ 2464 } \ 2465 } \ 2466 2467 switch (function_list) { 2468 case WINHV_PLATFORM_FNS_DEFAULT: 2469 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2470 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD) 2471 break; 2472 2473 case WINHV_EMULATION_FNS_DEFAULT: 2474 WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib) 2475 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD) 2476 break; 2477 2478 case WINHV_PLATFORM_FNS_SUPPLEMENTAL: 2479 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2480 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL) 2481 break; 2482 } 2483 2484 *handle = hLib; 2485 return true; 2486 2487 error: 2488 if (hLib) { 2489 FreeLibrary(hLib); 2490 } 2491 2492 return false; 2493 } 2494 2495 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v, 2496 const char *name, void *opaque, 2497 Error **errp) 2498 { 2499 struct whpx_state *whpx = &whpx_global; 2500 OnOffSplit mode; 2501 2502 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 2503 return; 2504 } 2505 2506 switch (mode) { 2507 case ON_OFF_SPLIT_ON: 2508 
whpx->kernel_irqchip_allowed = true; 2509 whpx->kernel_irqchip_required = true; 2510 break; 2511 2512 case ON_OFF_SPLIT_OFF: 2513 whpx->kernel_irqchip_allowed = false; 2514 whpx->kernel_irqchip_required = false; 2515 break; 2516 2517 case ON_OFF_SPLIT_SPLIT: 2518 error_setg(errp, "WHPX: split irqchip currently not supported"); 2519 error_append_hint(errp, 2520 "Try without kernel-irqchip or with kernel-irqchip=on|off"); 2521 break; 2522 2523 default: 2524 /* 2525 * The value was checked in visit_type_OnOffSplit() above. If 2526 * we get here, then something is wrong in QEMU. 2527 */ 2528 abort(); 2529 } 2530 } 2531 2532 /* 2533 * Partition support 2534 */ 2535 2536 static int whpx_accel_init(MachineState *ms) 2537 { 2538 struct whpx_state *whpx; 2539 int ret; 2540 HRESULT hr; 2541 WHV_CAPABILITY whpx_cap; 2542 UINT32 whpx_cap_size; 2543 WHV_PARTITION_PROPERTY prop; 2544 UINT32 cpuidExitList[] = {1, 0x80000001}; 2545 WHV_CAPABILITY_FEATURES features = {0}; 2546 2547 whpx = &whpx_global; 2548 2549 if (!init_whp_dispatch()) { 2550 ret = -ENOSYS; 2551 goto error; 2552 } 2553 2554 whpx->mem_quota = ms->ram_size; 2555 2556 hr = whp_dispatch.WHvGetCapability( 2557 WHvCapabilityCodeHypervisorPresent, &whpx_cap, 2558 sizeof(whpx_cap), &whpx_cap_size); 2559 if (FAILED(hr) || !whpx_cap.HypervisorPresent) { 2560 error_report("WHPX: No accelerator found, hr=%08lx", hr); 2561 ret = -ENOSPC; 2562 goto error; 2563 } 2564 2565 hr = whp_dispatch.WHvGetCapability( 2566 WHvCapabilityCodeFeatures, &features, sizeof(features), NULL); 2567 if (FAILED(hr)) { 2568 error_report("WHPX: Failed to query capabilities, hr=%08lx", hr); 2569 ret = -EINVAL; 2570 goto error; 2571 } 2572 2573 hr = whp_dispatch.WHvCreatePartition(&whpx->partition); 2574 if (FAILED(hr)) { 2575 error_report("WHPX: Failed to create partition, hr=%08lx", hr); 2576 ret = -EINVAL; 2577 goto error; 2578 } 2579 2580 /* 2581 * Query the XSAVE capability of the partition. Any error here is not 2582 * considered fatal. 
2583 */ 2584 hr = whp_dispatch.WHvGetPartitionProperty( 2585 whpx->partition, 2586 WHvPartitionPropertyCodeProcessorXsaveFeatures, 2587 &whpx_xsave_cap, 2588 sizeof(whpx_xsave_cap), 2589 &whpx_cap_size); 2590 2591 /* 2592 * Windows version which don't support this property will return with the 2593 * specific error code. 2594 */ 2595 if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) { 2596 error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr); 2597 } 2598 2599 if (!whpx_has_xsave()) { 2600 printf("WHPX: Partition is not XSAVE capable\n"); 2601 } 2602 2603 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2604 prop.ProcessorCount = ms->smp.cpus; 2605 hr = whp_dispatch.WHvSetPartitionProperty( 2606 whpx->partition, 2607 WHvPartitionPropertyCodeProcessorCount, 2608 &prop, 2609 sizeof(WHV_PARTITION_PROPERTY)); 2610 2611 if (FAILED(hr)) { 2612 error_report("WHPX: Failed to set partition core count to %d," 2613 " hr=%08lx", ms->smp.cores, hr); 2614 ret = -EINVAL; 2615 goto error; 2616 } 2617 2618 /* 2619 * Error out if WHP doesn't support apic emulation and user is requiring 2620 * it. 2621 */ 2622 if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation || 2623 !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) { 2624 error_report("WHPX: kernel irqchip requested, but unavailable. 
" 2625 "Try without kernel-irqchip or with kernel-irqchip=off"); 2626 ret = -EINVAL; 2627 goto error; 2628 } 2629 2630 if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation && 2631 whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) { 2632 WHV_X64_LOCAL_APIC_EMULATION_MODE mode = 2633 WHvX64LocalApicEmulationModeXApic; 2634 printf("WHPX: setting APIC emulation mode in the hypervisor\n"); 2635 hr = whp_dispatch.WHvSetPartitionProperty( 2636 whpx->partition, 2637 WHvPartitionPropertyCodeLocalApicEmulationMode, 2638 &mode, 2639 sizeof(mode)); 2640 if (FAILED(hr)) { 2641 error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr); 2642 if (whpx->kernel_irqchip_required) { 2643 error_report("WHPX: kernel irqchip requested, but unavailable"); 2644 ret = -EINVAL; 2645 goto error; 2646 } 2647 } else { 2648 whpx->apic_in_platform = true; 2649 } 2650 } 2651 2652 /* Register for MSR and CPUID exits */ 2653 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2654 prop.ExtendedVmExits.X64MsrExit = 1; 2655 prop.ExtendedVmExits.X64CpuidExit = 1; 2656 prop.ExtendedVmExits.ExceptionExit = 1; 2657 if (whpx_apic_in_platform()) { 2658 prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1; 2659 } 2660 2661 hr = whp_dispatch.WHvSetPartitionProperty( 2662 whpx->partition, 2663 WHvPartitionPropertyCodeExtendedVmExits, 2664 &prop, 2665 sizeof(WHV_PARTITION_PROPERTY)); 2666 if (FAILED(hr)) { 2667 error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr); 2668 ret = -EINVAL; 2669 goto error; 2670 } 2671 2672 hr = whp_dispatch.WHvSetPartitionProperty( 2673 whpx->partition, 2674 WHvPartitionPropertyCodeCpuidExitList, 2675 cpuidExitList, 2676 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2677 2678 if (FAILED(hr)) { 2679 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2680 hr); 2681 ret = -EINVAL; 2682 goto error; 2683 } 2684 2685 /* 2686 * We do not want to intercept any exceptions from the guest, 2687 * until we actually start debugging 
with gdb. 2688 */ 2689 whpx->exception_exit_bitmap = -1; 2690 hr = whpx_set_exception_exit_bitmap(0); 2691 2692 if (FAILED(hr)) { 2693 error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr); 2694 ret = -EINVAL; 2695 goto error; 2696 } 2697 2698 hr = whp_dispatch.WHvSetupPartition(whpx->partition); 2699 if (FAILED(hr)) { 2700 error_report("WHPX: Failed to setup partition, hr=%08lx", hr); 2701 ret = -EINVAL; 2702 goto error; 2703 } 2704 2705 whpx_memory_init(); 2706 2707 printf("Windows Hypervisor Platform accelerator is operational\n"); 2708 return 0; 2709 2710 error: 2711 2712 if (NULL != whpx->partition) { 2713 whp_dispatch.WHvDeletePartition(whpx->partition); 2714 whpx->partition = NULL; 2715 } 2716 2717 return ret; 2718 } 2719 2720 int whpx_enabled(void) 2721 { 2722 return whpx_allowed; 2723 } 2724 2725 bool whpx_apic_in_platform(void) { 2726 return whpx_global.apic_in_platform; 2727 } 2728 2729 static void whpx_accel_class_init(ObjectClass *oc, void *data) 2730 { 2731 AccelClass *ac = ACCEL_CLASS(oc); 2732 ac->name = "WHPX"; 2733 ac->init_machine = whpx_accel_init; 2734 ac->allowed = &whpx_allowed; 2735 2736 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 2737 NULL, whpx_set_kernel_irqchip, 2738 NULL, NULL); 2739 object_class_property_set_description(oc, "kernel-irqchip", 2740 "Configure WHPX in-kernel irqchip"); 2741 } 2742 2743 static void whpx_accel_instance_init(Object *obj) 2744 { 2745 struct whpx_state *whpx = &whpx_global; 2746 2747 memset(whpx, 0, sizeof(struct whpx_state)); 2748 /* Turn on kernel-irqchip, by default */ 2749 whpx->kernel_irqchip_allowed = true; 2750 } 2751 2752 static const TypeInfo whpx_accel_type = { 2753 .name = ACCEL_CLASS_NAME("whpx"), 2754 .parent = TYPE_ACCEL, 2755 .instance_init = whpx_accel_instance_init, 2756 .class_init = whpx_accel_class_init, 2757 }; 2758 2759 static void whpx_type_init(void) 2760 { 2761 type_register_static(&whpx_accel_type); 2762 } 2763 2764 bool init_whp_dispatch(void) 
2765 { 2766 if (whp_dispatch_initialized) { 2767 return true; 2768 } 2769 2770 if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) { 2771 goto error; 2772 } 2773 2774 if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) { 2775 goto error; 2776 } 2777 2778 assert(load_whp_dispatch_fns(&hWinHvPlatform, 2779 WINHV_PLATFORM_FNS_SUPPLEMENTAL)); 2780 whp_dispatch_initialized = true; 2781 2782 return true; 2783 error: 2784 if (hWinHvPlatform) { 2785 FreeLibrary(hWinHvPlatform); 2786 } 2787 2788 if (hWinHvEmulation) { 2789 FreeLibrary(hWinHvEmulation); 2790 } 2791 2792 return false; 2793 } 2794 2795 type_init(whpx_type_init); 2796