1 /* 2 * QEMU Windows Hypervisor Platform accelerator (WHPX) 3 * 4 * Copyright Microsoft Corp. 2017 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2 or later. 7 * See the COPYING file in the top-level directory. 8 * 9 */ 10 11 #include "qemu/osdep.h" 12 #include "cpu.h" 13 #include "exec/address-spaces.h" 14 #include "exec/ioport.h" 15 #include "gdbstub/helpers.h" 16 #include "qemu/accel.h" 17 #include "sysemu/whpx.h" 18 #include "sysemu/cpus.h" 19 #include "sysemu/runstate.h" 20 #include "qemu/main-loop.h" 21 #include "hw/boards.h" 22 #include "hw/intc/ioapic.h" 23 #include "hw/i386/apic_internal.h" 24 #include "qemu/error-report.h" 25 #include "qapi/error.h" 26 #include "qapi/qapi-types-common.h" 27 #include "qapi/qapi-visit-common.h" 28 #include "migration/blocker.h" 29 #include <winerror.h> 30 31 #include "whpx-internal.h" 32 #include "whpx-accel-ops.h" 33 34 #include <WinHvPlatform.h> 35 #include <WinHvEmulation.h> 36 37 #define HYPERV_APIC_BUS_FREQUENCY (200000000ULL) 38 39 static const WHV_REGISTER_NAME whpx_register_names[] = { 40 41 /* X64 General purpose registers */ 42 WHvX64RegisterRax, 43 WHvX64RegisterRcx, 44 WHvX64RegisterRdx, 45 WHvX64RegisterRbx, 46 WHvX64RegisterRsp, 47 WHvX64RegisterRbp, 48 WHvX64RegisterRsi, 49 WHvX64RegisterRdi, 50 WHvX64RegisterR8, 51 WHvX64RegisterR9, 52 WHvX64RegisterR10, 53 WHvX64RegisterR11, 54 WHvX64RegisterR12, 55 WHvX64RegisterR13, 56 WHvX64RegisterR14, 57 WHvX64RegisterR15, 58 WHvX64RegisterRip, 59 WHvX64RegisterRflags, 60 61 /* X64 Segment registers */ 62 WHvX64RegisterEs, 63 WHvX64RegisterCs, 64 WHvX64RegisterSs, 65 WHvX64RegisterDs, 66 WHvX64RegisterFs, 67 WHvX64RegisterGs, 68 WHvX64RegisterLdtr, 69 WHvX64RegisterTr, 70 71 /* X64 Table registers */ 72 WHvX64RegisterIdtr, 73 WHvX64RegisterGdtr, 74 75 /* X64 Control Registers */ 76 WHvX64RegisterCr0, 77 WHvX64RegisterCr2, 78 WHvX64RegisterCr3, 79 WHvX64RegisterCr4, 80 WHvX64RegisterCr8, 81 82 /* X64 Debug Registers */ 83 /* 84 * 
WHvX64RegisterDr0, 85 * WHvX64RegisterDr1, 86 * WHvX64RegisterDr2, 87 * WHvX64RegisterDr3, 88 * WHvX64RegisterDr6, 89 * WHvX64RegisterDr7, 90 */ 91 92 /* X64 Floating Point and Vector Registers */ 93 WHvX64RegisterXmm0, 94 WHvX64RegisterXmm1, 95 WHvX64RegisterXmm2, 96 WHvX64RegisterXmm3, 97 WHvX64RegisterXmm4, 98 WHvX64RegisterXmm5, 99 WHvX64RegisterXmm6, 100 WHvX64RegisterXmm7, 101 WHvX64RegisterXmm8, 102 WHvX64RegisterXmm9, 103 WHvX64RegisterXmm10, 104 WHvX64RegisterXmm11, 105 WHvX64RegisterXmm12, 106 WHvX64RegisterXmm13, 107 WHvX64RegisterXmm14, 108 WHvX64RegisterXmm15, 109 WHvX64RegisterFpMmx0, 110 WHvX64RegisterFpMmx1, 111 WHvX64RegisterFpMmx2, 112 WHvX64RegisterFpMmx3, 113 WHvX64RegisterFpMmx4, 114 WHvX64RegisterFpMmx5, 115 WHvX64RegisterFpMmx6, 116 WHvX64RegisterFpMmx7, 117 WHvX64RegisterFpControlStatus, 118 WHvX64RegisterXmmControlStatus, 119 120 /* X64 MSRs */ 121 WHvX64RegisterEfer, 122 #ifdef TARGET_X86_64 123 WHvX64RegisterKernelGsBase, 124 #endif 125 WHvX64RegisterApicBase, 126 /* WHvX64RegisterPat, */ 127 WHvX64RegisterSysenterCs, 128 WHvX64RegisterSysenterEip, 129 WHvX64RegisterSysenterEsp, 130 WHvX64RegisterStar, 131 #ifdef TARGET_X86_64 132 WHvX64RegisterLstar, 133 WHvX64RegisterCstar, 134 WHvX64RegisterSfmask, 135 #endif 136 137 /* Interrupt / Event Registers */ 138 /* 139 * WHvRegisterPendingInterruption, 140 * WHvRegisterInterruptState, 141 * WHvRegisterPendingEvent0, 142 * WHvRegisterPendingEvent1 143 * WHvX64RegisterDeliverabilityNotifications, 144 */ 145 }; 146 147 struct whpx_register_set { 148 WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)]; 149 }; 150 151 /* 152 * The current implementation of instruction stepping sets the TF flag 153 * in RFLAGS, causing the CPU to raise an INT1 after each instruction. 154 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception. 155 * 156 * This approach has a few limitations: 157 * 1. 
 *        Stepping over a PUSHF/SAHF instruction will save the TF flag
 *        along with the other flags, possibly restoring it later. It would
 *        result in another INT1 when the flags are restored, triggering
 *        a stop in gdb that could be cleared by doing another step.
 *
 *        Stepping over a POPF/LAHF instruction will let it overwrite the
 *        TF flag, ending the stepping mode.
 *
 *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
 *        or anything that could result in a page fault) will save the flags
 *        to the stack, clear the TF flag, and let the guest execute the
 *        handler. Normally, the guest will restore the original flags,
 *        which will resume single-stepping.
 *
 *     3. Debuggers running on the guest may wish to set TF to do instruction
 *        stepping. INT1 events generated by it would be intercepted by us,
 *        as long as gdb is connected to QEMU.
 *
 * In practice this means that:
 *     1. Stepping through flags-modifying instructions may cause gdb to
 *        continue or stop in unexpected places. This will be fully recoverable
 *        and will not crash the target.
 *
 *     2. Stepping over an instruction that triggers an exception will step
 *        over the exception handler, not into it.
 *
 *     3. Debugging the guest via gdb while running a debugger on the guest
 *        at the same time may lead to unexpected effects. Removing all
 *        breakpoints set via QEMU will prevent any further interference
 *        with the guest-level debuggers.
 *
 * The limitations can be addressed as shown below:
 *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
 *        stepping through them. The exact semantics of the instructions are
 *        defined in the "Combined Volume Set of Intel 64 and IA-32
 *        Architectures Software Developer's Manuals"; however, emulating them
 *        involves a fair amount of corner cases due to compatibility with
 *        real mode, virtual 8086 mode, and differences between 64-bit and
 *        32-bit modes.
 *
 *     2. We could step into the guest's exception handlers using the following
 *        sequence:
 *          a. Temporarily enable catching of all exception types via
 *             whpx_set_exception_exit_bitmap().
 *          b. Once an exception is intercepted, read the IDT/GDT and locate
 *             the original handler.
 *          c. Patch the original handler, injecting an INT3 at the beginning.
 *          d. Update the exception exit bitmap to only catch the
 *             WHvX64ExceptionTypeBreakpointTrap exception.
 *          e. Let the affected CPU run in the exclusive mode.
 *          f. Restore the original handler and the exception exit bitmap.
 *        Note that handling all corner cases related to IDT/GDT is harder
 *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
 *        rough idea.
 *
 *     3. In order to properly support guest-level debugging in parallel with
 *        the QEMU-level debugging, we would need to be able to pass some INT1
 *        events to the guest. This could be done via the following methods:
 *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
 *             it seems to only work for interrupts and not software
 *             exceptions.
 *          b. Locating and patching the original handler by parsing IDT/GDT.
 *             This involves relatively complex logic outlined in the previous
 *             paragraph.
 *          c. Emulating the exception invocation (i.e. manually updating RIP,
 *             RFLAGS, and pushing the old values to stack). This is even more
 *             complicated than the previous option, since it involves checking
 *             CPL, gate attributes, and doing various adjustments depending
 *             on the current CPU mode, whether the CPL is changing, etc.
225 */ 226 typedef enum WhpxStepMode { 227 WHPX_STEP_NONE = 0, 228 /* Halt other VCPUs */ 229 WHPX_STEP_EXCLUSIVE, 230 } WhpxStepMode; 231 232 struct whpx_vcpu { 233 WHV_EMULATOR_HANDLE emulator; 234 bool window_registered; 235 bool interruptable; 236 bool ready_for_pic_interrupt; 237 uint64_t tpr; 238 uint64_t apic_base; 239 bool interruption_pending; 240 241 /* Must be the last field as it may have a tail */ 242 WHV_RUN_VP_EXIT_CONTEXT exit_ctx; 243 }; 244 245 static bool whpx_allowed; 246 static bool whp_dispatch_initialized; 247 static HMODULE hWinHvPlatform, hWinHvEmulation; 248 static uint32_t max_vcpu_index; 249 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap; 250 251 struct whpx_state whpx_global; 252 struct WHPDispatch whp_dispatch; 253 254 static bool whpx_has_xsave(void) 255 { 256 return whpx_xsave_cap.XsaveSupport; 257 } 258 259 /* 260 * VP support 261 */ 262 263 static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu) 264 { 265 return (struct whpx_vcpu *)cpu->hax_vcpu; 266 } 267 268 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86, 269 int r86) 270 { 271 WHV_X64_SEGMENT_REGISTER hs; 272 unsigned flags = qs->flags; 273 274 hs.Base = qs->base; 275 hs.Limit = qs->limit; 276 hs.Selector = qs->selector; 277 278 if (v86) { 279 hs.Attributes = 0; 280 hs.SegmentType = 3; 281 hs.Present = 1; 282 hs.DescriptorPrivilegeLevel = 3; 283 hs.NonSystemSegment = 1; 284 285 } else { 286 hs.Attributes = (flags >> DESC_TYPE_SHIFT); 287 288 if (r86) { 289 /* hs.Base &= 0xfffff; */ 290 } 291 } 292 293 return hs; 294 } 295 296 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs) 297 { 298 SegmentCache qs; 299 300 qs.base = hs->Base; 301 qs.limit = hs->Limit; 302 qs.selector = hs->Selector; 303 304 qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT; 305 306 return qs; 307 } 308 309 /* X64 Extended Control Registers */ 310 static void whpx_set_xcrs(CPUState *cpu) 311 { 312 CPUX86State *env = cpu->env_ptr; 313 HRESULT hr; 314 
struct whpx_state *whpx = &whpx_global; 315 WHV_REGISTER_VALUE xcr0; 316 WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; 317 318 if (!whpx_has_xsave()) { 319 return; 320 } 321 322 /* Only xcr0 is supported by the hypervisor currently */ 323 xcr0.Reg64 = env->xcr0; 324 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 325 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); 326 if (FAILED(hr)) { 327 error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr); 328 } 329 } 330 331 static int whpx_set_tsc(CPUState *cpu) 332 { 333 CPUX86State *env = cpu->env_ptr; 334 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 335 WHV_REGISTER_VALUE tsc_val; 336 HRESULT hr; 337 struct whpx_state *whpx = &whpx_global; 338 339 /* 340 * Suspend the partition prior to setting the TSC to reduce the variance 341 * in TSC across vCPUs. When the first vCPU runs post suspend, the 342 * partition is automatically resumed. 343 */ 344 if (whp_dispatch.WHvSuspendPartitionTime) { 345 346 /* 347 * Unable to suspend partition while setting TSC is not a fatal 348 * error. It just increases the likelihood of TSC variance between 349 * vCPUs and some guest OS are able to handle that just fine. 350 */ 351 hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition); 352 if (FAILED(hr)) { 353 warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr); 354 } 355 } 356 357 tsc_val.Reg64 = env->tsc; 358 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 359 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 360 if (FAILED(hr)) { 361 error_report("WHPX: Failed to set TSC, hr=%08lx", hr); 362 return -1; 363 } 364 365 return 0; 366 } 367 368 /* 369 * The CR8 register in the CPU is mapped to the TPR register of the APIC, 370 * however, they use a slightly different encoding. Specifically: 371 * 372 * APIC.TPR[bits 7:4] = CR8[bits 3:0] 373 * 374 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64 375 * and IA-32 Architectures Software Developer's Manual. 
376 * 377 * The functions below translate the value of CR8 to TPR and vice versa. 378 */ 379 380 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr) 381 { 382 return tpr >> 4; 383 } 384 385 static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8) 386 { 387 return cr8 << 4; 388 } 389 390 static void whpx_set_registers(CPUState *cpu, int level) 391 { 392 struct whpx_state *whpx = &whpx_global; 393 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 394 CPUX86State *env = cpu->env_ptr; 395 X86CPU *x86_cpu = X86_CPU(cpu); 396 struct whpx_register_set vcxt; 397 HRESULT hr; 398 int idx; 399 int idx_next; 400 int i; 401 int v86, r86; 402 403 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 404 405 /* 406 * Following MSRs have side effects on the guest or are too heavy for 407 * runtime. Limit them to full state update. 408 */ 409 if (level >= WHPX_SET_RESET_STATE) { 410 whpx_set_tsc(cpu); 411 } 412 413 memset(&vcxt, 0, sizeof(struct whpx_register_set)); 414 415 v86 = (env->eflags & VM_MASK); 416 r86 = !(env->cr[0] & CR0_PE_MASK); 417 418 vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 419 vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state); 420 421 idx = 0; 422 423 /* Indexes for first 16 registers match between HV and QEMU definitions */ 424 idx_next = 16; 425 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 426 vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx]; 427 } 428 idx = idx_next; 429 430 /* Same goes for RIP and RFLAGS */ 431 assert(whpx_register_names[idx] == WHvX64RegisterRip); 432 vcxt.values[idx++].Reg64 = env->eip; 433 434 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 435 vcxt.values[idx++].Reg64 = env->eflags; 436 437 /* Translate 6+4 segment registers. 
HV and QEMU order matches */ 438 assert(idx == WHvX64RegisterEs); 439 for (i = 0; i < 6; i += 1, idx += 1) { 440 vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86); 441 } 442 443 assert(idx == WHvX64RegisterLdtr); 444 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0); 445 446 assert(idx == WHvX64RegisterTr); 447 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0); 448 449 assert(idx == WHvX64RegisterIdtr); 450 vcxt.values[idx].Table.Base = env->idt.base; 451 vcxt.values[idx].Table.Limit = env->idt.limit; 452 idx += 1; 453 454 assert(idx == WHvX64RegisterGdtr); 455 vcxt.values[idx].Table.Base = env->gdt.base; 456 vcxt.values[idx].Table.Limit = env->gdt.limit; 457 idx += 1; 458 459 /* CR0, 2, 3, 4, 8 */ 460 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 461 vcxt.values[idx++].Reg64 = env->cr[0]; 462 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 463 vcxt.values[idx++].Reg64 = env->cr[2]; 464 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 465 vcxt.values[idx++].Reg64 = env->cr[3]; 466 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 467 vcxt.values[idx++].Reg64 = env->cr[4]; 468 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 469 vcxt.values[idx++].Reg64 = vcpu->tpr; 470 471 /* 8 Debug Registers - Skipped */ 472 473 /* 474 * Extended control registers needs to be handled separately depending 475 * on whether xsave is supported/enabled or not. 
476 */ 477 whpx_set_xcrs(cpu); 478 479 /* 16 XMM registers */ 480 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 481 idx_next = idx + 16; 482 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 483 vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0); 484 vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1); 485 } 486 idx = idx_next; 487 488 /* 8 FP registers */ 489 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 490 for (i = 0; i < 8; i += 1, idx += 1) { 491 vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0); 492 /* vcxt.values[idx].Fp.AsUINT128.High64 = 493 env->fpregs[i].mmx.MMX_Q(1); 494 */ 495 } 496 497 /* FP control status register */ 498 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 499 vcxt.values[idx].FpControlStatus.FpControl = env->fpuc; 500 vcxt.values[idx].FpControlStatus.FpStatus = 501 (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; 502 vcxt.values[idx].FpControlStatus.FpTag = 0; 503 for (i = 0; i < 8; ++i) { 504 vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i; 505 } 506 vcxt.values[idx].FpControlStatus.Reserved = 0; 507 vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop; 508 vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip; 509 idx += 1; 510 511 /* XMM control status register */ 512 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 513 vcxt.values[idx].XmmControlStatus.LastFpRdp = 0; 514 vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr; 515 vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff; 516 idx += 1; 517 518 /* MSRs */ 519 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 520 vcxt.values[idx++].Reg64 = env->efer; 521 #ifdef TARGET_X86_64 522 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 523 vcxt.values[idx++].Reg64 = env->kernelgsbase; 524 #endif 525 526 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 527 vcxt.values[idx++].Reg64 = 
vcpu->apic_base; 528 529 /* WHvX64RegisterPat - Skipped */ 530 531 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 532 vcxt.values[idx++].Reg64 = env->sysenter_cs; 533 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 534 vcxt.values[idx++].Reg64 = env->sysenter_eip; 535 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 536 vcxt.values[idx++].Reg64 = env->sysenter_esp; 537 assert(whpx_register_names[idx] == WHvX64RegisterStar); 538 vcxt.values[idx++].Reg64 = env->star; 539 #ifdef TARGET_X86_64 540 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 541 vcxt.values[idx++].Reg64 = env->lstar; 542 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 543 vcxt.values[idx++].Reg64 = env->cstar; 544 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 545 vcxt.values[idx++].Reg64 = env->fmask; 546 #endif 547 548 /* Interrupt / Event Registers - Skipped */ 549 550 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 551 552 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 553 whpx->partition, cpu->cpu_index, 554 whpx_register_names, 555 RTL_NUMBER_OF(whpx_register_names), 556 &vcxt.values[0]); 557 558 if (FAILED(hr)) { 559 error_report("WHPX: Failed to set virtual processor context, hr=%08lx", 560 hr); 561 } 562 563 return; 564 } 565 566 static int whpx_get_tsc(CPUState *cpu) 567 { 568 CPUX86State *env = cpu->env_ptr; 569 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 570 WHV_REGISTER_VALUE tsc_val; 571 HRESULT hr; 572 struct whpx_state *whpx = &whpx_global; 573 574 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 575 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 576 if (FAILED(hr)) { 577 error_report("WHPX: Failed to get TSC, hr=%08lx", hr); 578 return -1; 579 } 580 581 env->tsc = tsc_val.Reg64; 582 return 0; 583 } 584 585 /* X64 Extended Control Registers */ 586 static void whpx_get_xcrs(CPUState *cpu) 587 { 588 CPUX86State *env = cpu->env_ptr; 589 HRESULT hr; 590 struct whpx_state *whpx = &whpx_global; 
591 WHV_REGISTER_VALUE xcr0; 592 WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; 593 594 if (!whpx_has_xsave()) { 595 return; 596 } 597 598 /* Only xcr0 is supported by the hypervisor currently */ 599 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 600 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); 601 if (FAILED(hr)) { 602 error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr); 603 return; 604 } 605 606 env->xcr0 = xcr0.Reg64; 607 } 608 609 static void whpx_get_registers(CPUState *cpu) 610 { 611 struct whpx_state *whpx = &whpx_global; 612 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 613 CPUX86State *env = cpu->env_ptr; 614 X86CPU *x86_cpu = X86_CPU(cpu); 615 struct whpx_register_set vcxt; 616 uint64_t tpr, apic_base; 617 HRESULT hr; 618 int idx; 619 int idx_next; 620 int i; 621 622 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 623 624 if (!env->tsc_valid) { 625 whpx_get_tsc(cpu); 626 env->tsc_valid = !runstate_is_running(); 627 } 628 629 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 630 whpx->partition, cpu->cpu_index, 631 whpx_register_names, 632 RTL_NUMBER_OF(whpx_register_names), 633 &vcxt.values[0]); 634 if (FAILED(hr)) { 635 error_report("WHPX: Failed to get virtual processor context, hr=%08lx", 636 hr); 637 } 638 639 if (whpx_apic_in_platform()) { 640 /* 641 * Fetch the TPR value from the emulated APIC. It may get overwritten 642 * below with the value from CR8 returned by 643 * WHvGetVirtualProcessorRegisters(). 
644 */ 645 whpx_apic_get(x86_cpu->apic_state); 646 vcpu->tpr = whpx_apic_tpr_to_cr8( 647 cpu_get_apic_tpr(x86_cpu->apic_state)); 648 } 649 650 idx = 0; 651 652 /* Indexes for first 16 registers match between HV and QEMU definitions */ 653 idx_next = 16; 654 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 655 env->regs[idx] = vcxt.values[idx].Reg64; 656 } 657 idx = idx_next; 658 659 /* Same goes for RIP and RFLAGS */ 660 assert(whpx_register_names[idx] == WHvX64RegisterRip); 661 env->eip = vcxt.values[idx++].Reg64; 662 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 663 env->eflags = vcxt.values[idx++].Reg64; 664 665 /* Translate 6+4 segment registers. HV and QEMU order matches */ 666 assert(idx == WHvX64RegisterEs); 667 for (i = 0; i < 6; i += 1, idx += 1) { 668 env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment); 669 } 670 671 assert(idx == WHvX64RegisterLdtr); 672 env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment); 673 assert(idx == WHvX64RegisterTr); 674 env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment); 675 assert(idx == WHvX64RegisterIdtr); 676 env->idt.base = vcxt.values[idx].Table.Base; 677 env->idt.limit = vcxt.values[idx].Table.Limit; 678 idx += 1; 679 assert(idx == WHvX64RegisterGdtr); 680 env->gdt.base = vcxt.values[idx].Table.Base; 681 env->gdt.limit = vcxt.values[idx].Table.Limit; 682 idx += 1; 683 684 /* CR0, 2, 3, 4, 8 */ 685 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 686 env->cr[0] = vcxt.values[idx++].Reg64; 687 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 688 env->cr[2] = vcxt.values[idx++].Reg64; 689 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 690 env->cr[3] = vcxt.values[idx++].Reg64; 691 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 692 env->cr[4] = vcxt.values[idx++].Reg64; 693 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 694 tpr = vcxt.values[idx++].Reg64; 695 if (tpr != vcpu->tpr) { 696 vcpu->tpr = tpr; 697 cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr)); 698 
} 699 700 /* 8 Debug Registers - Skipped */ 701 702 /* 703 * Extended control registers needs to be handled separately depending 704 * on whether xsave is supported/enabled or not. 705 */ 706 whpx_get_xcrs(cpu); 707 708 /* 16 XMM registers */ 709 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 710 idx_next = idx + 16; 711 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 712 env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64; 713 env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64; 714 } 715 idx = idx_next; 716 717 /* 8 FP registers */ 718 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 719 for (i = 0; i < 8; i += 1, idx += 1) { 720 env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64; 721 /* env->fpregs[i].mmx.MMX_Q(1) = 722 vcxt.values[idx].Fp.AsUINT128.High64; 723 */ 724 } 725 726 /* FP control status register */ 727 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 728 env->fpuc = vcxt.values[idx].FpControlStatus.FpControl; 729 env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7; 730 env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800; 731 for (i = 0; i < 8; ++i) { 732 env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1); 733 } 734 env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp; 735 env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip; 736 idx += 1; 737 738 /* XMM control status register */ 739 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 740 env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl; 741 idx += 1; 742 743 /* MSRs */ 744 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 745 env->efer = vcxt.values[idx++].Reg64; 746 #ifdef TARGET_X86_64 747 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 748 env->kernelgsbase = vcxt.values[idx++].Reg64; 749 #endif 750 751 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 752 apic_base = vcxt.values[idx++].Reg64; 
753 if (apic_base != vcpu->apic_base) { 754 vcpu->apic_base = apic_base; 755 cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base); 756 } 757 758 /* WHvX64RegisterPat - Skipped */ 759 760 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 761 env->sysenter_cs = vcxt.values[idx++].Reg64; 762 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 763 env->sysenter_eip = vcxt.values[idx++].Reg64; 764 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 765 env->sysenter_esp = vcxt.values[idx++].Reg64; 766 assert(whpx_register_names[idx] == WHvX64RegisterStar); 767 env->star = vcxt.values[idx++].Reg64; 768 #ifdef TARGET_X86_64 769 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 770 env->lstar = vcxt.values[idx++].Reg64; 771 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 772 env->cstar = vcxt.values[idx++].Reg64; 773 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 774 env->fmask = vcxt.values[idx++].Reg64; 775 #endif 776 777 /* Interrupt / Event Registers - Skipped */ 778 779 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 780 781 if (whpx_apic_in_platform()) { 782 whpx_apic_get(x86_cpu->apic_state); 783 } 784 785 x86_update_hflags(env); 786 787 return; 788 } 789 790 static HRESULT CALLBACK whpx_emu_ioport_callback( 791 void *ctx, 792 WHV_EMULATOR_IO_ACCESS_INFO *IoAccess) 793 { 794 MemTxAttrs attrs = { 0 }; 795 address_space_rw(&address_space_io, IoAccess->Port, attrs, 796 &IoAccess->Data, IoAccess->AccessSize, 797 IoAccess->Direction); 798 return S_OK; 799 } 800 801 static HRESULT CALLBACK whpx_emu_mmio_callback( 802 void *ctx, 803 WHV_EMULATOR_MEMORY_ACCESS_INFO *ma) 804 { 805 cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize, 806 ma->Direction); 807 return S_OK; 808 } 809 810 static HRESULT CALLBACK whpx_emu_getreg_callback( 811 void *ctx, 812 const WHV_REGISTER_NAME *RegisterNames, 813 UINT32 RegisterCount, 814 WHV_REGISTER_VALUE *RegisterValues) 815 { 816 HRESULT hr; 817 struct whpx_state 
*whpx = &whpx_global; 818 CPUState *cpu = (CPUState *)ctx; 819 820 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 821 whpx->partition, cpu->cpu_index, 822 RegisterNames, RegisterCount, 823 RegisterValues); 824 if (FAILED(hr)) { 825 error_report("WHPX: Failed to get virtual processor registers," 826 " hr=%08lx", hr); 827 } 828 829 return hr; 830 } 831 832 static HRESULT CALLBACK whpx_emu_setreg_callback( 833 void *ctx, 834 const WHV_REGISTER_NAME *RegisterNames, 835 UINT32 RegisterCount, 836 const WHV_REGISTER_VALUE *RegisterValues) 837 { 838 HRESULT hr; 839 struct whpx_state *whpx = &whpx_global; 840 CPUState *cpu = (CPUState *)ctx; 841 842 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 843 whpx->partition, cpu->cpu_index, 844 RegisterNames, RegisterCount, 845 RegisterValues); 846 if (FAILED(hr)) { 847 error_report("WHPX: Failed to set virtual processor registers," 848 " hr=%08lx", hr); 849 } 850 851 /* 852 * The emulator just successfully wrote the register state. We clear the 853 * dirty state so we avoid the double write on resume of the VP. 
854 */ 855 cpu->vcpu_dirty = false; 856 857 return hr; 858 } 859 860 static HRESULT CALLBACK whpx_emu_translate_callback( 861 void *ctx, 862 WHV_GUEST_VIRTUAL_ADDRESS Gva, 863 WHV_TRANSLATE_GVA_FLAGS TranslateFlags, 864 WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult, 865 WHV_GUEST_PHYSICAL_ADDRESS *Gpa) 866 { 867 HRESULT hr; 868 struct whpx_state *whpx = &whpx_global; 869 CPUState *cpu = (CPUState *)ctx; 870 WHV_TRANSLATE_GVA_RESULT res; 871 872 hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index, 873 Gva, TranslateFlags, &res, Gpa); 874 if (FAILED(hr)) { 875 error_report("WHPX: Failed to translate GVA, hr=%08lx", hr); 876 } else { 877 *TranslationResult = res.ResultCode; 878 } 879 880 return hr; 881 } 882 883 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = { 884 .Size = sizeof(WHV_EMULATOR_CALLBACKS), 885 .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback, 886 .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback, 887 .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback, 888 .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback, 889 .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback, 890 }; 891 892 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx) 893 { 894 HRESULT hr; 895 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 896 WHV_EMULATOR_STATUS emu_status; 897 898 hr = whp_dispatch.WHvEmulatorTryMmioEmulation( 899 vcpu->emulator, cpu, 900 &vcpu->exit_ctx.VpContext, ctx, 901 &emu_status); 902 if (FAILED(hr)) { 903 error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr); 904 return -1; 905 } 906 907 if (!emu_status.EmulationSuccessful) { 908 error_report("WHPX: Failed to emulate MMIO access with" 909 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 910 return -1; 911 } 912 913 return 0; 914 } 915 916 static int whpx_handle_portio(CPUState *cpu, 917 WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx) 918 { 919 HRESULT hr; 920 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 921 
WHV_EMULATOR_STATUS emu_status; 922 923 hr = whp_dispatch.WHvEmulatorTryIoEmulation( 924 vcpu->emulator, cpu, 925 &vcpu->exit_ctx.VpContext, ctx, 926 &emu_status); 927 if (FAILED(hr)) { 928 error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr); 929 return -1; 930 } 931 932 if (!emu_status.EmulationSuccessful) { 933 error_report("WHPX: Failed to emulate PortIO access with" 934 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 935 return -1; 936 } 937 938 return 0; 939 } 940 941 /* 942 * Controls whether we should intercept various exceptions on the guest, 943 * namely breakpoint/single-step events. 944 * 945 * The 'exceptions' argument accepts a bitmask, e.g: 946 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...) 947 */ 948 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions) 949 { 950 struct whpx_state *whpx = &whpx_global; 951 WHV_PARTITION_PROPERTY prop = { 0, }; 952 HRESULT hr; 953 954 if (exceptions == whpx->exception_exit_bitmap) { 955 return S_OK; 956 } 957 958 prop.ExceptionExitBitmap = exceptions; 959 960 hr = whp_dispatch.WHvSetPartitionProperty( 961 whpx->partition, 962 WHvPartitionPropertyCodeExceptionExitBitmap, 963 &prop, 964 sizeof(WHV_PARTITION_PROPERTY)); 965 966 if (SUCCEEDED(hr)) { 967 whpx->exception_exit_bitmap = exceptions; 968 } 969 970 return hr; 971 } 972 973 974 /* 975 * This function is called before/after stepping over a single instruction. 976 * It will update the CPU registers to arm/disarm the instruction stepping 977 * accordingly. 978 */ 979 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu, 980 bool set, 981 uint64_t *exit_context_rflags) 982 { 983 WHV_REGISTER_NAME reg_name; 984 WHV_REGISTER_VALUE reg_value; 985 HRESULT hr; 986 struct whpx_state *whpx = &whpx_global; 987 988 /* 989 * If we are trying to step over a single instruction, we need to set the 990 * TF bit in rflags. Otherwise, clear it. 
991 */ 992 reg_name = WHvX64RegisterRflags; 993 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 994 whpx->partition, 995 cpu->cpu_index, 996 ®_name, 997 1, 998 ®_value); 999 1000 if (FAILED(hr)) { 1001 error_report("WHPX: Failed to get rflags, hr=%08lx", hr); 1002 return hr; 1003 } 1004 1005 if (exit_context_rflags) { 1006 assert(*exit_context_rflags == reg_value.Reg64); 1007 } 1008 1009 if (set) { 1010 /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */ 1011 reg_value.Reg64 |= TF_MASK; 1012 } else { 1013 reg_value.Reg64 &= ~TF_MASK; 1014 } 1015 1016 if (exit_context_rflags) { 1017 *exit_context_rflags = reg_value.Reg64; 1018 } 1019 1020 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1021 whpx->partition, 1022 cpu->cpu_index, 1023 ®_name, 1024 1, 1025 ®_value); 1026 1027 if (FAILED(hr)) { 1028 error_report("WHPX: Failed to set rflags," 1029 " hr=%08lx", 1030 hr); 1031 return hr; 1032 } 1033 1034 reg_name = WHvRegisterInterruptState; 1035 reg_value.Reg64 = 0; 1036 1037 /* Suspend delivery of hardware interrupts during single-stepping. */ 1038 reg_value.InterruptState.InterruptShadow = set != 0; 1039 1040 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1041 whpx->partition, 1042 cpu->cpu_index, 1043 ®_name, 1044 1, 1045 ®_value); 1046 1047 if (FAILED(hr)) { 1048 error_report("WHPX: Failed to set InterruptState," 1049 " hr=%08lx", 1050 hr); 1051 return hr; 1052 } 1053 1054 if (!set) { 1055 /* 1056 * We have just finished stepping over a single instruction, 1057 * and intercepted the INT1 generated by it. 1058 * We need to now hide the INT1 from the guest, 1059 * as it would not be expecting it. 
1060 */ 1061 1062 reg_name = WHvX64RegisterPendingDebugException; 1063 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1064 whpx->partition, 1065 cpu->cpu_index, 1066 ®_name, 1067 1, 1068 ®_value); 1069 1070 if (FAILED(hr)) { 1071 error_report("WHPX: Failed to get pending debug exceptions," 1072 "hr=%08lx", hr); 1073 return hr; 1074 } 1075 1076 if (reg_value.PendingDebugException.SingleStep) { 1077 reg_value.PendingDebugException.SingleStep = 0; 1078 1079 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1080 whpx->partition, 1081 cpu->cpu_index, 1082 ®_name, 1083 1, 1084 ®_value); 1085 1086 if (FAILED(hr)) { 1087 error_report("WHPX: Failed to clear pending debug exceptions," 1088 "hr=%08lx", hr); 1089 return hr; 1090 } 1091 } 1092 1093 } 1094 1095 return S_OK; 1096 } 1097 1098 /* Tries to find a breakpoint at the specified address. */ 1099 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address) 1100 { 1101 struct whpx_state *whpx = &whpx_global; 1102 int i; 1103 1104 if (whpx->breakpoints.breakpoints) { 1105 for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) { 1106 if (address == whpx->breakpoints.breakpoints->data[i].address) { 1107 return &whpx->breakpoints.breakpoints->data[i]; 1108 } 1109 } 1110 } 1111 1112 return NULL; 1113 } 1114 1115 /* 1116 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for 1117 * debugging user-mode applications. Since the WHPX API does not offer 1118 * an easy way to pass the intercepted exception back to the guest, we 1119 * resort to using INT1 instead, and let the guest always handle INT3. 1120 */ 1121 static const uint8_t whpx_breakpoint_instruction = 0xF1; 1122 1123 /* 1124 * The WHPX QEMU backend implements breakpoints by writing the INT1 1125 * instruction into memory (ignoring the DRx registers). This raises a few 1126 * issues that need to be carefully handled: 1127 * 1128 * 1. 
Although unlikely, other parts of QEMU may set multiple breakpoints 1129 * at the same location, and later remove them in arbitrary order. 1130 * This should not cause memory corruption, and should only remove the 1131 * physical breakpoint instruction when the last QEMU breakpoint is gone. 1132 * 1133 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid 1134 * physical location. Hence, physically adding/removing a breakpoint can 1135 * theoretically fail at any time. We need to keep track of it. 1136 * 1137 * The function below rebuilds a list of low-level breakpoints (one per 1138 * address, tracking the original instruction and any errors) from the list of 1139 * high-level breakpoints (set via cpu_breakpoint_insert()). 1140 * 1141 * In order to optimize performance, this function stores the list of 1142 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the 1143 * low-level ones, so that it won't be re-invoked until these breakpoints 1144 * change. 1145 * 1146 * Note that this function decides which breakpoints should be inserted into, 1147 * memory, but doesn't actually do it. The memory accessing is done in 1148 * whpx_apply_breakpoints(). 1149 */ 1150 static void whpx_translate_cpu_breakpoints( 1151 struct whpx_breakpoints *breakpoints, 1152 CPUState *cpu, 1153 int cpu_breakpoint_count) 1154 { 1155 CPUBreakpoint *bp; 1156 int cpu_bp_index = 0; 1157 1158 breakpoints->original_addresses = 1159 g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count); 1160 1161 breakpoints->original_address_count = cpu_breakpoint_count; 1162 1163 int max_breakpoints = cpu_breakpoint_count + 1164 (breakpoints->breakpoints ? 
breakpoints->breakpoints->used : 0); 1165 1166 struct whpx_breakpoint_collection *new_breakpoints = 1167 g_malloc0(sizeof(struct whpx_breakpoint_collection) 1168 + max_breakpoints * sizeof(struct whpx_breakpoint)); 1169 1170 new_breakpoints->allocated = max_breakpoints; 1171 new_breakpoints->used = 0; 1172 1173 /* 1174 * 1. Preserve all old breakpoints that could not be automatically 1175 * cleared when the CPU got stopped. 1176 */ 1177 if (breakpoints->breakpoints) { 1178 int i; 1179 for (i = 0; i < breakpoints->breakpoints->used; i++) { 1180 if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) { 1181 new_breakpoints->data[new_breakpoints->used++] = 1182 breakpoints->breakpoints->data[i]; 1183 } 1184 } 1185 } 1186 1187 /* 2. Map all CPU breakpoints to WHPX breakpoints */ 1188 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1189 int i; 1190 bool found = false; 1191 1192 /* This will be used to detect changed CPU breakpoints later. */ 1193 breakpoints->original_addresses[cpu_bp_index++] = bp->pc; 1194 1195 for (i = 0; i < new_breakpoints->used; i++) { 1196 /* 1197 * WARNING: This loop has O(N^2) complexity, where N is the 1198 * number of breakpoints. It should not be a bottleneck in 1199 * real-world scenarios, since it only needs to run once after 1200 * the breakpoints have been modified. 1201 * If this ever becomes a concern, it can be optimized by storing 1202 * high-level breakpoint objects in a tree or hash map. 1203 */ 1204 1205 if (new_breakpoints->data[i].address == bp->pc) { 1206 /* There was already a breakpoint at this address. 
*/ 1207 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) { 1208 new_breakpoints->data[i].state = WHPX_BP_SET; 1209 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) { 1210 new_breakpoints->data[i].state = WHPX_BP_SET_PENDING; 1211 } 1212 1213 found = true; 1214 break; 1215 } 1216 } 1217 1218 if (!found && new_breakpoints->used < new_breakpoints->allocated) { 1219 /* No WHPX breakpoint at this address. Create one. */ 1220 new_breakpoints->data[new_breakpoints->used].address = bp->pc; 1221 new_breakpoints->data[new_breakpoints->used].state = 1222 WHPX_BP_SET_PENDING; 1223 new_breakpoints->used++; 1224 } 1225 } 1226 1227 /* 1228 * Free the previous breakpoint list. This can be optimized by keeping 1229 * it as shadow buffer for the next computation instead of freeing 1230 * it immediately. 1231 */ 1232 g_free(breakpoints->breakpoints); 1233 1234 breakpoints->breakpoints = new_breakpoints; 1235 } 1236 1237 /* 1238 * Physically inserts/removes the breakpoints by reading and writing the 1239 * physical memory, keeping a track of the failed attempts. 1240 * 1241 * Passing resuming=true will try to set all previously unset breakpoints. 1242 * Passing resuming=false will remove all inserted ones. 1243 */ 1244 static void whpx_apply_breakpoints( 1245 struct whpx_breakpoint_collection *breakpoints, 1246 CPUState *cpu, 1247 bool resuming) 1248 { 1249 int i, rc; 1250 if (!breakpoints) { 1251 return; 1252 } 1253 1254 for (i = 0; i < breakpoints->used; i++) { 1255 /* Decide what to do right now based on the last known state. 
*/ 1256 WhpxBreakpointState state = breakpoints->data[i].state; 1257 switch (state) { 1258 case WHPX_BP_CLEARED: 1259 if (resuming) { 1260 state = WHPX_BP_SET_PENDING; 1261 } 1262 break; 1263 case WHPX_BP_SET_PENDING: 1264 if (!resuming) { 1265 state = WHPX_BP_CLEARED; 1266 } 1267 break; 1268 case WHPX_BP_SET: 1269 if (!resuming) { 1270 state = WHPX_BP_CLEAR_PENDING; 1271 } 1272 break; 1273 case WHPX_BP_CLEAR_PENDING: 1274 if (resuming) { 1275 state = WHPX_BP_SET; 1276 } 1277 break; 1278 } 1279 1280 if (state == WHPX_BP_SET_PENDING) { 1281 /* Remember the original instruction. */ 1282 rc = cpu_memory_rw_debug(cpu, 1283 breakpoints->data[i].address, 1284 &breakpoints->data[i].original_instruction, 1285 1, 1286 false); 1287 1288 if (!rc) { 1289 /* Write the breakpoint instruction. */ 1290 rc = cpu_memory_rw_debug(cpu, 1291 breakpoints->data[i].address, 1292 (void *)&whpx_breakpoint_instruction, 1293 1, 1294 true); 1295 } 1296 1297 if (!rc) { 1298 state = WHPX_BP_SET; 1299 } 1300 1301 } 1302 1303 if (state == WHPX_BP_CLEAR_PENDING) { 1304 /* Restore the original instruction. */ 1305 rc = cpu_memory_rw_debug(cpu, 1306 breakpoints->data[i].address, 1307 &breakpoints->data[i].original_instruction, 1308 1, 1309 true); 1310 1311 if (!rc) { 1312 state = WHPX_BP_CLEARED; 1313 } 1314 } 1315 1316 breakpoints->data[i].state = state; 1317 } 1318 } 1319 1320 /* 1321 * This function is called when the a VCPU is about to start and no other 1322 * VCPUs have been started so far. Since the VCPU start order could be 1323 * arbitrary, it doesn't have to be VCPU#0. 1324 * 1325 * It is used to commit the breakpoints into memory, and configure WHPX 1326 * to intercept debug exceptions. 1327 * 1328 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or 1329 * more VCPUs are already running, so this is the best place to do it. 
1330 */ 1331 static int whpx_first_vcpu_starting(CPUState *cpu) 1332 { 1333 struct whpx_state *whpx = &whpx_global; 1334 HRESULT hr; 1335 1336 g_assert(qemu_mutex_iothread_locked()); 1337 1338 if (!QTAILQ_EMPTY(&cpu->breakpoints) || 1339 (whpx->breakpoints.breakpoints && 1340 whpx->breakpoints.breakpoints->used)) { 1341 CPUBreakpoint *bp; 1342 int i = 0; 1343 bool update_pending = false; 1344 1345 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1346 if (i >= whpx->breakpoints.original_address_count || 1347 bp->pc != whpx->breakpoints.original_addresses[i]) { 1348 update_pending = true; 1349 } 1350 1351 i++; 1352 } 1353 1354 if (i != whpx->breakpoints.original_address_count) { 1355 update_pending = true; 1356 } 1357 1358 if (update_pending) { 1359 /* 1360 * The CPU breakpoints have changed since the last call to 1361 * whpx_translate_cpu_breakpoints(). WHPX breakpoints must 1362 * now be recomputed. 1363 */ 1364 whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i); 1365 } 1366 1367 /* Actually insert the breakpoints into the memory. */ 1368 whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true); 1369 } 1370 1371 uint64_t exception_mask; 1372 if (whpx->step_pending || 1373 (whpx->breakpoints.breakpoints && 1374 whpx->breakpoints.breakpoints->used)) { 1375 /* 1376 * We are either attempting to single-step one or more CPUs, or 1377 * have one or more breakpoints enabled. Both require intercepting 1378 * the WHvX64ExceptionTypeBreakpointTrap exception. 1379 */ 1380 1381 exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault; 1382 } else { 1383 /* Let the guest handle all exceptions. */ 1384 exception_mask = 0; 1385 } 1386 1387 hr = whpx_set_exception_exit_bitmap(exception_mask); 1388 if (!SUCCEEDED(hr)) { 1389 error_report("WHPX: Failed to update exception exit mask," 1390 "hr=%08lx.", hr); 1391 return 1; 1392 } 1393 1394 return 0; 1395 } 1396 1397 /* 1398 * This function is called when the last VCPU has finished running. 
1399 * It is used to remove any previously set breakpoints from memory. 1400 */ 1401 static int whpx_last_vcpu_stopping(CPUState *cpu) 1402 { 1403 whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false); 1404 return 0; 1405 } 1406 1407 /* Returns the address of the next instruction that is about to be executed. */ 1408 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid) 1409 { 1410 if (cpu->vcpu_dirty) { 1411 /* The CPU registers have been modified by other parts of QEMU. */ 1412 CPUArchState *env = (CPUArchState *)(cpu->env_ptr); 1413 return env->eip; 1414 } else if (exit_context_valid) { 1415 /* 1416 * The CPU registers have not been modified by neither other parts 1417 * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters(). 1418 * This is the most common case. 1419 */ 1420 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1421 return vcpu->exit_ctx.VpContext.Rip; 1422 } else { 1423 /* 1424 * The CPU registers have been modified by a call to 1425 * WHvSetVirtualProcessorRegisters() and must be re-queried from 1426 * the target. 
1427 */ 1428 WHV_REGISTER_VALUE reg_value; 1429 WHV_REGISTER_NAME reg_name = WHvX64RegisterRip; 1430 HRESULT hr; 1431 struct whpx_state *whpx = &whpx_global; 1432 1433 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1434 whpx->partition, 1435 cpu->cpu_index, 1436 ®_name, 1437 1, 1438 ®_value); 1439 1440 if (FAILED(hr)) { 1441 error_report("WHPX: Failed to get PC, hr=%08lx", hr); 1442 return 0; 1443 } 1444 1445 return reg_value.Reg64; 1446 } 1447 } 1448 1449 static int whpx_handle_halt(CPUState *cpu) 1450 { 1451 CPUX86State *env = cpu->env_ptr; 1452 int ret = 0; 1453 1454 qemu_mutex_lock_iothread(); 1455 if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1456 (env->eflags & IF_MASK)) && 1457 !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1458 cpu->exception_index = EXCP_HLT; 1459 cpu->halted = true; 1460 ret = 1; 1461 } 1462 qemu_mutex_unlock_iothread(); 1463 1464 return ret; 1465 } 1466 1467 static void whpx_vcpu_pre_run(CPUState *cpu) 1468 { 1469 HRESULT hr; 1470 struct whpx_state *whpx = &whpx_global; 1471 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1472 CPUX86State *env = cpu->env_ptr; 1473 X86CPU *x86_cpu = X86_CPU(cpu); 1474 int irq; 1475 uint8_t tpr; 1476 WHV_X64_PENDING_INTERRUPTION_REGISTER new_int; 1477 UINT32 reg_count = 0; 1478 WHV_REGISTER_VALUE reg_values[3]; 1479 WHV_REGISTER_NAME reg_names[3]; 1480 1481 memset(&new_int, 0, sizeof(new_int)); 1482 memset(reg_values, 0, sizeof(reg_values)); 1483 1484 qemu_mutex_lock_iothread(); 1485 1486 /* Inject NMI */ 1487 if (!vcpu->interruption_pending && 1488 cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { 1489 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { 1490 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 1491 vcpu->interruptable = false; 1492 new_int.InterruptionType = WHvX64PendingNmi; 1493 new_int.InterruptionPending = 1; 1494 new_int.InterruptionVector = 2; 1495 } 1496 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 1497 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 1498 } 
1499 } 1500 1501 /* 1502 * Force the VCPU out of its inner loop to process any INIT requests or 1503 * commit pending TPR access. 1504 */ 1505 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { 1506 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1507 !(env->hflags & HF_SMM_MASK)) { 1508 cpu->exit_request = 1; 1509 } 1510 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1511 cpu->exit_request = 1; 1512 } 1513 } 1514 1515 /* Get pending hard interruption or replay one that was overwritten */ 1516 if (!whpx_apic_in_platform()) { 1517 if (!vcpu->interruption_pending && 1518 vcpu->interruptable && (env->eflags & IF_MASK)) { 1519 assert(!new_int.InterruptionPending); 1520 if (cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1521 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1522 irq = cpu_get_pic_interrupt(env); 1523 if (irq >= 0) { 1524 new_int.InterruptionType = WHvX64PendingInterrupt; 1525 new_int.InterruptionPending = 1; 1526 new_int.InterruptionVector = irq; 1527 } 1528 } 1529 } 1530 1531 /* Setup interrupt state if new one was prepared */ 1532 if (new_int.InterruptionPending) { 1533 reg_values[reg_count].PendingInterruption = new_int; 1534 reg_names[reg_count] = WHvRegisterPendingInterruption; 1535 reg_count += 1; 1536 } 1537 } else if (vcpu->ready_for_pic_interrupt && 1538 (cpu->interrupt_request & CPU_INTERRUPT_HARD)) { 1539 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1540 irq = cpu_get_pic_interrupt(env); 1541 if (irq >= 0) { 1542 reg_names[reg_count] = WHvRegisterPendingEvent; 1543 reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT) 1544 { 1545 .EventPending = 1, 1546 .EventType = WHvX64PendingEventExtInt, 1547 .Vector = irq, 1548 }; 1549 reg_count += 1; 1550 } 1551 } 1552 1553 /* Sync the TPR to the CR8 if was modified during the intercept */ 1554 tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 1555 if (tpr != vcpu->tpr) { 1556 vcpu->tpr = tpr; 1557 reg_values[reg_count].Reg64 = tpr; 1558 
cpu->exit_request = 1; 1559 reg_names[reg_count] = WHvX64RegisterCr8; 1560 reg_count += 1; 1561 } 1562 1563 /* Update the state of the interrupt delivery notification */ 1564 if (!vcpu->window_registered && 1565 cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1566 reg_values[reg_count].DeliverabilityNotifications = 1567 (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) { 1568 .InterruptNotification = 1 1569 }; 1570 vcpu->window_registered = 1; 1571 reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications; 1572 reg_count += 1; 1573 } 1574 1575 qemu_mutex_unlock_iothread(); 1576 vcpu->ready_for_pic_interrupt = false; 1577 1578 if (reg_count) { 1579 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1580 whpx->partition, cpu->cpu_index, 1581 reg_names, reg_count, reg_values); 1582 if (FAILED(hr)) { 1583 error_report("WHPX: Failed to set interrupt state registers," 1584 " hr=%08lx", hr); 1585 } 1586 } 1587 1588 return; 1589 } 1590 1591 static void whpx_vcpu_post_run(CPUState *cpu) 1592 { 1593 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1594 CPUX86State *env = cpu->env_ptr; 1595 X86CPU *x86_cpu = X86_CPU(cpu); 1596 1597 env->eflags = vcpu->exit_ctx.VpContext.Rflags; 1598 1599 uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8; 1600 if (vcpu->tpr != tpr) { 1601 vcpu->tpr = tpr; 1602 qemu_mutex_lock_iothread(); 1603 cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr)); 1604 qemu_mutex_unlock_iothread(); 1605 } 1606 1607 vcpu->interruption_pending = 1608 vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending; 1609 1610 vcpu->interruptable = 1611 !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow; 1612 1613 return; 1614 } 1615 1616 static void whpx_vcpu_process_async_events(CPUState *cpu) 1617 { 1618 CPUX86State *env = cpu->env_ptr; 1619 X86CPU *x86_cpu = X86_CPU(cpu); 1620 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1621 1622 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1623 !(env->hflags & HF_SMM_MASK)) { 1624 
whpx_cpu_synchronize_state(cpu); 1625 do_cpu_init(x86_cpu); 1626 vcpu->interruptable = true; 1627 } 1628 1629 if (cpu->interrupt_request & CPU_INTERRUPT_POLL) { 1630 cpu->interrupt_request &= ~CPU_INTERRUPT_POLL; 1631 apic_poll_irq(x86_cpu->apic_state); 1632 } 1633 1634 if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1635 (env->eflags & IF_MASK)) || 1636 (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1637 cpu->halted = false; 1638 } 1639 1640 if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) { 1641 whpx_cpu_synchronize_state(cpu); 1642 do_cpu_sipi(x86_cpu); 1643 } 1644 1645 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1646 cpu->interrupt_request &= ~CPU_INTERRUPT_TPR; 1647 whpx_cpu_synchronize_state(cpu); 1648 apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip, 1649 env->tpr_access_type); 1650 } 1651 1652 return; 1653 } 1654 1655 static int whpx_vcpu_run(CPUState *cpu) 1656 { 1657 HRESULT hr; 1658 struct whpx_state *whpx = &whpx_global; 1659 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1660 struct whpx_breakpoint *stepped_over_bp = NULL; 1661 WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE; 1662 int ret; 1663 1664 g_assert(qemu_mutex_iothread_locked()); 1665 1666 if (whpx->running_cpus++ == 0) { 1667 /* Insert breakpoints into memory, update exception exit bitmap. */ 1668 ret = whpx_first_vcpu_starting(cpu); 1669 if (ret != 0) { 1670 return ret; 1671 } 1672 } 1673 1674 if (whpx->breakpoints.breakpoints && 1675 whpx->breakpoints.breakpoints->used > 0) 1676 { 1677 uint64_t pc = whpx_vcpu_get_pc(cpu, true); 1678 stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc); 1679 if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) { 1680 stepped_over_bp = NULL; 1681 } 1682 1683 if (stepped_over_bp) { 1684 /* 1685 * We are trying to run the instruction overwritten by an active 1686 * breakpoint. We will temporarily disable the breakpoint, suspend 1687 * other CPUs, and step over the instruction. 
1688 */ 1689 exclusive_step_mode = WHPX_STEP_EXCLUSIVE; 1690 } 1691 } 1692 1693 if (exclusive_step_mode == WHPX_STEP_NONE) { 1694 whpx_vcpu_process_async_events(cpu); 1695 if (cpu->halted && !whpx_apic_in_platform()) { 1696 cpu->exception_index = EXCP_HLT; 1697 qatomic_set(&cpu->exit_request, false); 1698 return 0; 1699 } 1700 } 1701 1702 qemu_mutex_unlock_iothread(); 1703 1704 if (exclusive_step_mode != WHPX_STEP_NONE) { 1705 start_exclusive(); 1706 g_assert(cpu == current_cpu); 1707 g_assert(!cpu->running); 1708 cpu->running = true; 1709 1710 hr = whpx_set_exception_exit_bitmap( 1711 1UL << WHvX64ExceptionTypeDebugTrapOrFault); 1712 if (!SUCCEEDED(hr)) { 1713 error_report("WHPX: Failed to update exception exit mask, " 1714 "hr=%08lx.", hr); 1715 return 1; 1716 } 1717 1718 if (stepped_over_bp) { 1719 /* Temporarily disable the triggered breakpoint. */ 1720 cpu_memory_rw_debug(cpu, 1721 stepped_over_bp->address, 1722 &stepped_over_bp->original_instruction, 1723 1, 1724 true); 1725 } 1726 } else { 1727 cpu_exec_start(cpu); 1728 } 1729 1730 do { 1731 if (cpu->vcpu_dirty) { 1732 whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE); 1733 cpu->vcpu_dirty = false; 1734 } 1735 1736 if (exclusive_step_mode == WHPX_STEP_NONE) { 1737 whpx_vcpu_pre_run(cpu); 1738 1739 if (qatomic_read(&cpu->exit_request)) { 1740 whpx_vcpu_kick(cpu); 1741 } 1742 } 1743 1744 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1745 whpx_vcpu_configure_single_stepping(cpu, true, NULL); 1746 } 1747 1748 hr = whp_dispatch.WHvRunVirtualProcessor( 1749 whpx->partition, cpu->cpu_index, 1750 &vcpu->exit_ctx, sizeof(vcpu->exit_ctx)); 1751 1752 if (FAILED(hr)) { 1753 error_report("WHPX: Failed to exec a virtual processor," 1754 " hr=%08lx", hr); 1755 ret = -1; 1756 break; 1757 } 1758 1759 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1760 whpx_vcpu_configure_single_stepping(cpu, 1761 false, 1762 &vcpu->exit_ctx.VpContext.Rflags); 1763 } 1764 1765 
whpx_vcpu_post_run(cpu); 1766 1767 switch (vcpu->exit_ctx.ExitReason) { 1768 case WHvRunVpExitReasonMemoryAccess: 1769 ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess); 1770 break; 1771 1772 case WHvRunVpExitReasonX64IoPortAccess: 1773 ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess); 1774 break; 1775 1776 case WHvRunVpExitReasonX64InterruptWindow: 1777 vcpu->ready_for_pic_interrupt = 1; 1778 vcpu->window_registered = 0; 1779 ret = 0; 1780 break; 1781 1782 case WHvRunVpExitReasonX64ApicEoi: 1783 assert(whpx_apic_in_platform()); 1784 ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector); 1785 break; 1786 1787 case WHvRunVpExitReasonX64Halt: 1788 /* 1789 * WARNING: as of build 19043.1526 (21H1), this exit reason is no 1790 * longer used. 1791 */ 1792 ret = whpx_handle_halt(cpu); 1793 break; 1794 1795 case WHvRunVpExitReasonX64ApicInitSipiTrap: { 1796 WHV_INTERRUPT_CONTROL ipi = {0}; 1797 uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr; 1798 uint32_t delivery_mode = 1799 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT; 1800 int dest_shorthand = 1801 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT; 1802 bool broadcast = false; 1803 bool include_self = false; 1804 uint32_t i; 1805 1806 /* We only registered for INIT and SIPI exits. */ 1807 if ((delivery_mode != APIC_DM_INIT) && 1808 (delivery_mode != APIC_DM_SIPI)) { 1809 error_report( 1810 "WHPX: Unexpected APIC exit that is not a INIT or SIPI"); 1811 break; 1812 } 1813 1814 if (delivery_mode == APIC_DM_INIT) { 1815 ipi.Type = WHvX64InterruptTypeInit; 1816 } else { 1817 ipi.Type = WHvX64InterruptTypeSipi; 1818 } 1819 1820 ipi.DestinationMode = 1821 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ? 1822 WHvX64InterruptDestinationModeLogical : 1823 WHvX64InterruptDestinationModePhysical; 1824 1825 ipi.TriggerMode = 1826 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ? 
1827 WHvX64InterruptTriggerModeLevel : 1828 WHvX64InterruptTriggerModeEdge; 1829 1830 ipi.Vector = icr & APIC_VECTOR_MASK; 1831 switch (dest_shorthand) { 1832 /* no shorthand. Bits 56-63 contain the destination. */ 1833 case 0: 1834 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK; 1835 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1836 &ipi, sizeof(ipi)); 1837 if (FAILED(hr)) { 1838 error_report("WHPX: Failed to request interrupt hr=%08lx", 1839 hr); 1840 } 1841 1842 break; 1843 1844 /* self */ 1845 case 1: 1846 include_self = true; 1847 break; 1848 1849 /* broadcast, including self */ 1850 case 2: 1851 broadcast = true; 1852 include_self = true; 1853 break; 1854 1855 /* broadcast, excluding self */ 1856 case 3: 1857 broadcast = true; 1858 break; 1859 } 1860 1861 if (!broadcast && !include_self) { 1862 break; 1863 } 1864 1865 for (i = 0; i <= max_vcpu_index; i++) { 1866 if (i == cpu->cpu_index && !include_self) { 1867 continue; 1868 } 1869 1870 /* 1871 * Assuming that APIC Ids are identity mapped since 1872 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers 1873 * are not handled yet and the hypervisor doesn't allow the 1874 * guest to modify the APIC ID. 1875 */ 1876 ipi.Destination = i; 1877 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1878 &ipi, sizeof(ipi)); 1879 if (FAILED(hr)) { 1880 error_report( 1881 "WHPX: Failed to request SIPI for %d, hr=%08lx", 1882 i, hr); 1883 } 1884 } 1885 1886 break; 1887 } 1888 1889 case WHvRunVpExitReasonCanceled: 1890 if (exclusive_step_mode != WHPX_STEP_NONE) { 1891 /* 1892 * We are trying to step over a single instruction, and 1893 * likely got a request to stop from another thread. 1894 * Delay it until we are done stepping 1895 * over. 
1896 */ 1897 ret = 0; 1898 } else { 1899 cpu->exception_index = EXCP_INTERRUPT; 1900 ret = 1; 1901 } 1902 break; 1903 case WHvRunVpExitReasonX64MsrAccess: { 1904 WHV_REGISTER_VALUE reg_values[3] = {0}; 1905 WHV_REGISTER_NAME reg_names[3]; 1906 UINT32 reg_count; 1907 1908 reg_names[0] = WHvX64RegisterRip; 1909 reg_names[1] = WHvX64RegisterRax; 1910 reg_names[2] = WHvX64RegisterRdx; 1911 1912 reg_values[0].Reg64 = 1913 vcpu->exit_ctx.VpContext.Rip + 1914 vcpu->exit_ctx.VpContext.InstructionLength; 1915 1916 /* 1917 * For all unsupported MSR access we: 1918 * ignore writes 1919 * return 0 on read. 1920 */ 1921 reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ? 1922 1 : 3; 1923 1924 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1925 whpx->partition, 1926 cpu->cpu_index, 1927 reg_names, reg_count, 1928 reg_values); 1929 1930 if (FAILED(hr)) { 1931 error_report("WHPX: Failed to set MsrAccess state " 1932 " registers, hr=%08lx", hr); 1933 } 1934 ret = 0; 1935 break; 1936 } 1937 case WHvRunVpExitReasonX64Cpuid: { 1938 WHV_REGISTER_VALUE reg_values[5]; 1939 WHV_REGISTER_NAME reg_names[5]; 1940 UINT32 reg_count = 5; 1941 UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0; 1942 X86CPU *x86_cpu = X86_CPU(cpu); 1943 CPUX86State *env = &x86_cpu->env; 1944 1945 memset(reg_values, 0, sizeof(reg_values)); 1946 1947 rip = vcpu->exit_ctx.VpContext.Rip + 1948 vcpu->exit_ctx.VpContext.InstructionLength; 1949 cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax; 1950 1951 /* 1952 * Ideally, these should be supplied to the hypervisor during VCPU 1953 * initialization and it should be able to satisfy this request. 1954 * But, currently, WHPX doesn't support setting CPUID values in the 1955 * hypervisor once the partition has been setup, which is too late 1956 * since VCPUs are realized later. For now, use the values from 1957 * QEMU to satisfy these requests, until WHPX adds support for 1958 * being able to set these values in the hypervisor at runtime. 
1959 */ 1960 cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx, 1961 (UINT32 *)&rcx, (UINT32 *)&rdx); 1962 switch (cpuid_fn) { 1963 case 0x40000000: 1964 /* Expose the vmware cpu frequency cpuid leaf */ 1965 rax = 0x40000010; 1966 rbx = rcx = rdx = 0; 1967 break; 1968 1969 case 0x40000010: 1970 rax = env->tsc_khz; 1971 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */ 1972 rcx = rdx = 0; 1973 break; 1974 1975 case 0x80000001: 1976 /* Remove any support of OSVW */ 1977 rcx &= ~CPUID_EXT3_OSVW; 1978 break; 1979 } 1980 1981 reg_names[0] = WHvX64RegisterRip; 1982 reg_names[1] = WHvX64RegisterRax; 1983 reg_names[2] = WHvX64RegisterRcx; 1984 reg_names[3] = WHvX64RegisterRdx; 1985 reg_names[4] = WHvX64RegisterRbx; 1986 1987 reg_values[0].Reg64 = rip; 1988 reg_values[1].Reg64 = rax; 1989 reg_values[2].Reg64 = rcx; 1990 reg_values[3].Reg64 = rdx; 1991 reg_values[4].Reg64 = rbx; 1992 1993 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1994 whpx->partition, cpu->cpu_index, 1995 reg_names, 1996 reg_count, 1997 reg_values); 1998 1999 if (FAILED(hr)) { 2000 error_report("WHPX: Failed to set CpuidAccess state registers," 2001 " hr=%08lx", hr); 2002 } 2003 ret = 0; 2004 break; 2005 } 2006 case WHvRunVpExitReasonException: 2007 whpx_get_registers(cpu); 2008 2009 if ((vcpu->exit_ctx.VpException.ExceptionType == 2010 WHvX64ExceptionTypeDebugTrapOrFault) && 2011 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) && 2012 (vcpu->exit_ctx.VpException.InstructionBytes[0] == 2013 whpx_breakpoint_instruction)) { 2014 /* Stopped at a software breakpoint. */ 2015 cpu->exception_index = EXCP_DEBUG; 2016 } else if ((vcpu->exit_ctx.VpException.ExceptionType == 2017 WHvX64ExceptionTypeDebugTrapOrFault) && 2018 !cpu->singlestep_enabled) { 2019 /* 2020 * Just finished stepping over a breakpoint, but the 2021 * gdb does not expect us to do single-stepping. 2022 * Don't do anything special. 
2023 */ 2024 cpu->exception_index = EXCP_INTERRUPT; 2025 } else { 2026 /* Another exception or debug event. Report it to GDB. */ 2027 cpu->exception_index = EXCP_DEBUG; 2028 } 2029 2030 ret = 1; 2031 break; 2032 case WHvRunVpExitReasonNone: 2033 case WHvRunVpExitReasonUnrecoverableException: 2034 case WHvRunVpExitReasonInvalidVpRegisterValue: 2035 case WHvRunVpExitReasonUnsupportedFeature: 2036 default: 2037 error_report("WHPX: Unexpected VP exit code %d", 2038 vcpu->exit_ctx.ExitReason); 2039 whpx_get_registers(cpu); 2040 qemu_mutex_lock_iothread(); 2041 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 2042 qemu_mutex_unlock_iothread(); 2043 break; 2044 } 2045 2046 } while (!ret); 2047 2048 if (stepped_over_bp) { 2049 /* Restore the breakpoint we stepped over */ 2050 cpu_memory_rw_debug(cpu, 2051 stepped_over_bp->address, 2052 (void *)&whpx_breakpoint_instruction, 2053 1, 2054 true); 2055 } 2056 2057 if (exclusive_step_mode != WHPX_STEP_NONE) { 2058 g_assert(cpu_in_exclusive_context(cpu)); 2059 cpu->running = false; 2060 end_exclusive(); 2061 2062 exclusive_step_mode = WHPX_STEP_NONE; 2063 } else { 2064 cpu_exec_end(cpu); 2065 } 2066 2067 qemu_mutex_lock_iothread(); 2068 current_cpu = cpu; 2069 2070 if (--whpx->running_cpus == 0) { 2071 whpx_last_vcpu_stopping(cpu); 2072 } 2073 2074 qatomic_set(&cpu->exit_request, false); 2075 2076 return ret < 0; 2077 } 2078 2079 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 2080 { 2081 if (!cpu->vcpu_dirty) { 2082 whpx_get_registers(cpu); 2083 cpu->vcpu_dirty = true; 2084 } 2085 } 2086 2087 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu, 2088 run_on_cpu_data arg) 2089 { 2090 whpx_set_registers(cpu, WHPX_SET_RESET_STATE); 2091 cpu->vcpu_dirty = false; 2092 } 2093 2094 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu, 2095 run_on_cpu_data arg) 2096 { 2097 whpx_set_registers(cpu, WHPX_SET_FULL_STATE); 2098 cpu->vcpu_dirty = false; 2099 } 2100 2101 static void 
do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu, 2102 run_on_cpu_data arg) 2103 { 2104 cpu->vcpu_dirty = true; 2105 } 2106 2107 /* 2108 * CPU support. 2109 */ 2110 2111 void whpx_cpu_synchronize_state(CPUState *cpu) 2112 { 2113 if (!cpu->vcpu_dirty) { 2114 run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL); 2115 } 2116 } 2117 2118 void whpx_cpu_synchronize_post_reset(CPUState *cpu) 2119 { 2120 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2121 } 2122 2123 void whpx_cpu_synchronize_post_init(CPUState *cpu) 2124 { 2125 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2126 } 2127 2128 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu) 2129 { 2130 run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2131 } 2132 2133 void whpx_cpu_synchronize_pre_resume(bool step_pending) 2134 { 2135 whpx_global.step_pending = step_pending; 2136 } 2137 2138 /* 2139 * Vcpu support. 2140 */ 2141 2142 static Error *whpx_migration_blocker; 2143 2144 static void whpx_cpu_update_state(void *opaque, bool running, RunState state) 2145 { 2146 CPUX86State *env = opaque; 2147 2148 if (running) { 2149 env->tsc_valid = false; 2150 } 2151 } 2152 2153 int whpx_init_vcpu(CPUState *cpu) 2154 { 2155 HRESULT hr; 2156 struct whpx_state *whpx = &whpx_global; 2157 struct whpx_vcpu *vcpu = NULL; 2158 Error *local_error = NULL; 2159 CPUX86State *env = cpu->env_ptr; 2160 X86CPU *x86_cpu = X86_CPU(cpu); 2161 UINT64 freq = 0; 2162 int ret; 2163 2164 /* Add migration blockers for all unsupported features of the 2165 * Windows Hypervisor Platform 2166 */ 2167 if (whpx_migration_blocker == NULL) { 2168 error_setg(&whpx_migration_blocker, 2169 "State blocked due to non-migratable CPUID feature support," 2170 "dirty memory tracking support, and XSAVE/XRSTOR support"); 2171 2172 if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) { 2173 error_report_err(local_error); 2174 error_free(whpx_migration_blocker); 2175 ret = -EINVAL; 
2176 goto error; 2177 } 2178 } 2179 2180 vcpu = g_new0(struct whpx_vcpu, 1); 2181 2182 if (!vcpu) { 2183 error_report("WHPX: Failed to allocte VCPU context."); 2184 ret = -ENOMEM; 2185 goto error; 2186 } 2187 2188 hr = whp_dispatch.WHvEmulatorCreateEmulator( 2189 &whpx_emu_callbacks, 2190 &vcpu->emulator); 2191 if (FAILED(hr)) { 2192 error_report("WHPX: Failed to setup instruction completion support," 2193 " hr=%08lx", hr); 2194 ret = -EINVAL; 2195 goto error; 2196 } 2197 2198 hr = whp_dispatch.WHvCreateVirtualProcessor( 2199 whpx->partition, cpu->cpu_index, 0); 2200 if (FAILED(hr)) { 2201 error_report("WHPX: Failed to create a virtual processor," 2202 " hr=%08lx", hr); 2203 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2204 ret = -EINVAL; 2205 goto error; 2206 } 2207 2208 /* 2209 * vcpu's TSC frequency is either specified by user, or use the value 2210 * provided by Hyper-V if the former is not present. In the latter case, we 2211 * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC 2212 * frequency can be migrated later via this field. 2213 */ 2214 if (!env->tsc_khz) { 2215 hr = whp_dispatch.WHvGetCapability( 2216 WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq), 2217 NULL); 2218 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2219 if (FAILED(hr)) { 2220 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr); 2221 } else { 2222 env->tsc_khz = freq / 1000; /* Hz to KHz */ 2223 } 2224 } 2225 } 2226 2227 env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY; 2228 hr = whp_dispatch.WHvGetCapability( 2229 WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL); 2230 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2231 if (FAILED(hr)) { 2232 printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr); 2233 } else { 2234 env->apic_bus_freq = freq; 2235 } 2236 } 2237 2238 /* 2239 * If the vmware cpuid frequency leaf option is set, and we have a valid 2240 * tsc value, trap the corresponding cpuid's. 
2241 */ 2242 if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) { 2243 UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010}; 2244 2245 hr = whp_dispatch.WHvSetPartitionProperty( 2246 whpx->partition, 2247 WHvPartitionPropertyCodeCpuidExitList, 2248 cpuidExitList, 2249 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2250 2251 if (FAILED(hr)) { 2252 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2253 hr); 2254 ret = -EINVAL; 2255 goto error; 2256 } 2257 } 2258 2259 vcpu->interruptable = true; 2260 cpu->vcpu_dirty = true; 2261 cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu; 2262 max_vcpu_index = max(max_vcpu_index, cpu->cpu_index); 2263 qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr); 2264 2265 return 0; 2266 2267 error: 2268 g_free(vcpu); 2269 2270 return ret; 2271 } 2272 2273 int whpx_vcpu_exec(CPUState *cpu) 2274 { 2275 int ret; 2276 int fatal; 2277 2278 for (;;) { 2279 if (cpu->exception_index >= EXCP_INTERRUPT) { 2280 ret = cpu->exception_index; 2281 cpu->exception_index = -1; 2282 break; 2283 } 2284 2285 fatal = whpx_vcpu_run(cpu); 2286 2287 if (fatal) { 2288 error_report("WHPX: Failed to exec a virtual processor"); 2289 abort(); 2290 } 2291 } 2292 2293 return ret; 2294 } 2295 2296 void whpx_destroy_vcpu(CPUState *cpu) 2297 { 2298 struct whpx_state *whpx = &whpx_global; 2299 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 2300 2301 whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index); 2302 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2303 g_free(cpu->hax_vcpu); 2304 return; 2305 } 2306 2307 void whpx_vcpu_kick(CPUState *cpu) 2308 { 2309 struct whpx_state *whpx = &whpx_global; 2310 whp_dispatch.WHvCancelRunVirtualProcessor( 2311 whpx->partition, cpu->cpu_index, 0); 2312 } 2313 2314 /* 2315 * Memory support. 
2316 */ 2317 2318 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size, 2319 void *host_va, int add, int rom, 2320 const char *name) 2321 { 2322 struct whpx_state *whpx = &whpx_global; 2323 HRESULT hr; 2324 2325 /* 2326 if (add) { 2327 printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n", 2328 (void*)start_pa, (void*)size, host_va, 2329 (rom ? "ROM" : "RAM"), name); 2330 } else { 2331 printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n", 2332 (void*)start_pa, (void*)size, host_va, name); 2333 } 2334 */ 2335 2336 if (add) { 2337 hr = whp_dispatch.WHvMapGpaRange(whpx->partition, 2338 host_va, 2339 start_pa, 2340 size, 2341 (WHvMapGpaRangeFlagRead | 2342 WHvMapGpaRangeFlagExecute | 2343 (rom ? 0 : WHvMapGpaRangeFlagWrite))); 2344 } else { 2345 hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition, 2346 start_pa, 2347 size); 2348 } 2349 2350 if (FAILED(hr)) { 2351 error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes," 2352 " Host:%p, hr=%08lx", 2353 (add ? "MAP" : "UNMAP"), name, 2354 (void *)(uintptr_t)start_pa, (void *)size, host_va, hr); 2355 } 2356 } 2357 2358 static void whpx_process_section(MemoryRegionSection *section, int add) 2359 { 2360 MemoryRegion *mr = section->mr; 2361 hwaddr start_pa = section->offset_within_address_space; 2362 ram_addr_t size = int128_get64(section->size); 2363 unsigned int delta; 2364 uint64_t host_va; 2365 2366 if (!memory_region_is_ram(mr)) { 2367 return; 2368 } 2369 2370 delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask()); 2371 delta &= ~qemu_real_host_page_mask(); 2372 if (delta > size) { 2373 return; 2374 } 2375 start_pa += delta; 2376 size -= delta; 2377 size &= qemu_real_host_page_mask(); 2378 if (!size || (start_pa & ~qemu_real_host_page_mask())) { 2379 return; 2380 } 2381 2382 host_va = (uintptr_t)memory_region_get_ram_ptr(mr) 2383 + section->offset_within_region + delta; 2384 2385 whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add, 2386 
memory_region_is_rom(mr), mr->name); 2387 } 2388 2389 static void whpx_region_add(MemoryListener *listener, 2390 MemoryRegionSection *section) 2391 { 2392 memory_region_ref(section->mr); 2393 whpx_process_section(section, 1); 2394 } 2395 2396 static void whpx_region_del(MemoryListener *listener, 2397 MemoryRegionSection *section) 2398 { 2399 whpx_process_section(section, 0); 2400 memory_region_unref(section->mr); 2401 } 2402 2403 static void whpx_transaction_begin(MemoryListener *listener) 2404 { 2405 } 2406 2407 static void whpx_transaction_commit(MemoryListener *listener) 2408 { 2409 } 2410 2411 static void whpx_log_sync(MemoryListener *listener, 2412 MemoryRegionSection *section) 2413 { 2414 MemoryRegion *mr = section->mr; 2415 2416 if (!memory_region_is_ram(mr)) { 2417 return; 2418 } 2419 2420 memory_region_set_dirty(mr, 0, int128_get64(section->size)); 2421 } 2422 2423 static MemoryListener whpx_memory_listener = { 2424 .name = "whpx", 2425 .begin = whpx_transaction_begin, 2426 .commit = whpx_transaction_commit, 2427 .region_add = whpx_region_add, 2428 .region_del = whpx_region_del, 2429 .log_sync = whpx_log_sync, 2430 .priority = 10, 2431 }; 2432 2433 static void whpx_memory_init(void) 2434 { 2435 memory_listener_register(&whpx_memory_listener, &address_space_memory); 2436 } 2437 2438 /* 2439 * Load the functions from the given library, using the given handle. If a 2440 * handle is provided, it is used, otherwise the library is opened. The 2441 * handle will be updated on return with the opened one. 
2442 */ 2443 static bool load_whp_dispatch_fns(HMODULE *handle, 2444 WHPFunctionList function_list) 2445 { 2446 HMODULE hLib = *handle; 2447 2448 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll" 2449 #define WINHV_EMULATION_DLL "WinHvEmulation.dll" 2450 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \ 2451 whp_dispatch.function_name = \ 2452 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2453 2454 #define WHP_LOAD_FIELD(return_type, function_name, signature) \ 2455 whp_dispatch.function_name = \ 2456 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2457 if (!whp_dispatch.function_name) { \ 2458 error_report("Could not load function %s", #function_name); \ 2459 goto error; \ 2460 } \ 2461 2462 #define WHP_LOAD_LIB(lib_name, handle_lib) \ 2463 if (!handle_lib) { \ 2464 handle_lib = LoadLibrary(lib_name); \ 2465 if (!handle_lib) { \ 2466 error_report("Could not load library %s.", lib_name); \ 2467 goto error; \ 2468 } \ 2469 } \ 2470 2471 switch (function_list) { 2472 case WINHV_PLATFORM_FNS_DEFAULT: 2473 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2474 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD) 2475 break; 2476 2477 case WINHV_EMULATION_FNS_DEFAULT: 2478 WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib) 2479 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD) 2480 break; 2481 2482 case WINHV_PLATFORM_FNS_SUPPLEMENTAL: 2483 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2484 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL) 2485 break; 2486 } 2487 2488 *handle = hLib; 2489 return true; 2490 2491 error: 2492 if (hLib) { 2493 FreeLibrary(hLib); 2494 } 2495 2496 return false; 2497 } 2498 2499 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v, 2500 const char *name, void *opaque, 2501 Error **errp) 2502 { 2503 struct whpx_state *whpx = &whpx_global; 2504 OnOffSplit mode; 2505 2506 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 2507 return; 2508 } 2509 2510 switch (mode) { 2511 case ON_OFF_SPLIT_ON: 2512 
whpx->kernel_irqchip_allowed = true; 2513 whpx->kernel_irqchip_required = true; 2514 break; 2515 2516 case ON_OFF_SPLIT_OFF: 2517 whpx->kernel_irqchip_allowed = false; 2518 whpx->kernel_irqchip_required = false; 2519 break; 2520 2521 case ON_OFF_SPLIT_SPLIT: 2522 error_setg(errp, "WHPX: split irqchip currently not supported"); 2523 error_append_hint(errp, 2524 "Try without kernel-irqchip or with kernel-irqchip=on|off"); 2525 break; 2526 2527 default: 2528 /* 2529 * The value was checked in visit_type_OnOffSplit() above. If 2530 * we get here, then something is wrong in QEMU. 2531 */ 2532 abort(); 2533 } 2534 } 2535 2536 /* 2537 * Partition support 2538 */ 2539 2540 static int whpx_accel_init(MachineState *ms) 2541 { 2542 struct whpx_state *whpx; 2543 int ret; 2544 HRESULT hr; 2545 WHV_CAPABILITY whpx_cap; 2546 UINT32 whpx_cap_size; 2547 WHV_PARTITION_PROPERTY prop; 2548 UINT32 cpuidExitList[] = {1, 0x80000001}; 2549 WHV_CAPABILITY_FEATURES features = {0}; 2550 2551 whpx = &whpx_global; 2552 2553 if (!init_whp_dispatch()) { 2554 ret = -ENOSYS; 2555 goto error; 2556 } 2557 2558 whpx->mem_quota = ms->ram_size; 2559 2560 hr = whp_dispatch.WHvGetCapability( 2561 WHvCapabilityCodeHypervisorPresent, &whpx_cap, 2562 sizeof(whpx_cap), &whpx_cap_size); 2563 if (FAILED(hr) || !whpx_cap.HypervisorPresent) { 2564 error_report("WHPX: No accelerator found, hr=%08lx", hr); 2565 ret = -ENOSPC; 2566 goto error; 2567 } 2568 2569 hr = whp_dispatch.WHvGetCapability( 2570 WHvCapabilityCodeFeatures, &features, sizeof(features), NULL); 2571 if (FAILED(hr)) { 2572 error_report("WHPX: Failed to query capabilities, hr=%08lx", hr); 2573 ret = -EINVAL; 2574 goto error; 2575 } 2576 2577 hr = whp_dispatch.WHvCreatePartition(&whpx->partition); 2578 if (FAILED(hr)) { 2579 error_report("WHPX: Failed to create partition, hr=%08lx", hr); 2580 ret = -EINVAL; 2581 goto error; 2582 } 2583 2584 /* 2585 * Query the XSAVE capability of the partition. Any error here is not 2586 * considered fatal. 
2587 */ 2588 hr = whp_dispatch.WHvGetPartitionProperty( 2589 whpx->partition, 2590 WHvPartitionPropertyCodeProcessorXsaveFeatures, 2591 &whpx_xsave_cap, 2592 sizeof(whpx_xsave_cap), 2593 &whpx_cap_size); 2594 2595 /* 2596 * Windows version which don't support this property will return with the 2597 * specific error code. 2598 */ 2599 if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) { 2600 error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr); 2601 } 2602 2603 if (!whpx_has_xsave()) { 2604 printf("WHPX: Partition is not XSAVE capable\n"); 2605 } 2606 2607 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2608 prop.ProcessorCount = ms->smp.cpus; 2609 hr = whp_dispatch.WHvSetPartitionProperty( 2610 whpx->partition, 2611 WHvPartitionPropertyCodeProcessorCount, 2612 &prop, 2613 sizeof(WHV_PARTITION_PROPERTY)); 2614 2615 if (FAILED(hr)) { 2616 error_report("WHPX: Failed to set partition core count to %d," 2617 " hr=%08lx", ms->smp.cores, hr); 2618 ret = -EINVAL; 2619 goto error; 2620 } 2621 2622 /* 2623 * Error out if WHP doesn't support apic emulation and user is requiring 2624 * it. 2625 */ 2626 if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation || 2627 !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) { 2628 error_report("WHPX: kernel irqchip requested, but unavailable. 
" 2629 "Try without kernel-irqchip or with kernel-irqchip=off"); 2630 ret = -EINVAL; 2631 goto error; 2632 } 2633 2634 if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation && 2635 whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) { 2636 WHV_X64_LOCAL_APIC_EMULATION_MODE mode = 2637 WHvX64LocalApicEmulationModeXApic; 2638 printf("WHPX: setting APIC emulation mode in the hypervisor\n"); 2639 hr = whp_dispatch.WHvSetPartitionProperty( 2640 whpx->partition, 2641 WHvPartitionPropertyCodeLocalApicEmulationMode, 2642 &mode, 2643 sizeof(mode)); 2644 if (FAILED(hr)) { 2645 error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr); 2646 if (whpx->kernel_irqchip_required) { 2647 error_report("WHPX: kernel irqchip requested, but unavailable"); 2648 ret = -EINVAL; 2649 goto error; 2650 } 2651 } else { 2652 whpx->apic_in_platform = true; 2653 } 2654 } 2655 2656 /* Register for MSR and CPUID exits */ 2657 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2658 prop.ExtendedVmExits.X64MsrExit = 1; 2659 prop.ExtendedVmExits.X64CpuidExit = 1; 2660 prop.ExtendedVmExits.ExceptionExit = 1; 2661 if (whpx_apic_in_platform()) { 2662 prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1; 2663 } 2664 2665 hr = whp_dispatch.WHvSetPartitionProperty( 2666 whpx->partition, 2667 WHvPartitionPropertyCodeExtendedVmExits, 2668 &prop, 2669 sizeof(WHV_PARTITION_PROPERTY)); 2670 if (FAILED(hr)) { 2671 error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr); 2672 ret = -EINVAL; 2673 goto error; 2674 } 2675 2676 hr = whp_dispatch.WHvSetPartitionProperty( 2677 whpx->partition, 2678 WHvPartitionPropertyCodeCpuidExitList, 2679 cpuidExitList, 2680 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2681 2682 if (FAILED(hr)) { 2683 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2684 hr); 2685 ret = -EINVAL; 2686 goto error; 2687 } 2688 2689 /* 2690 * We do not want to intercept any exceptions from the guest, 2691 * until we actually start debugging 
with gdb. 2692 */ 2693 whpx->exception_exit_bitmap = -1; 2694 hr = whpx_set_exception_exit_bitmap(0); 2695 2696 if (FAILED(hr)) { 2697 error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr); 2698 ret = -EINVAL; 2699 goto error; 2700 } 2701 2702 hr = whp_dispatch.WHvSetupPartition(whpx->partition); 2703 if (FAILED(hr)) { 2704 error_report("WHPX: Failed to setup partition, hr=%08lx", hr); 2705 ret = -EINVAL; 2706 goto error; 2707 } 2708 2709 whpx_memory_init(); 2710 2711 printf("Windows Hypervisor Platform accelerator is operational\n"); 2712 return 0; 2713 2714 error: 2715 2716 if (NULL != whpx->partition) { 2717 whp_dispatch.WHvDeletePartition(whpx->partition); 2718 whpx->partition = NULL; 2719 } 2720 2721 return ret; 2722 } 2723 2724 int whpx_enabled(void) 2725 { 2726 return whpx_allowed; 2727 } 2728 2729 bool whpx_apic_in_platform(void) { 2730 return whpx_global.apic_in_platform; 2731 } 2732 2733 static void whpx_accel_class_init(ObjectClass *oc, void *data) 2734 { 2735 AccelClass *ac = ACCEL_CLASS(oc); 2736 ac->name = "WHPX"; 2737 ac->init_machine = whpx_accel_init; 2738 ac->allowed = &whpx_allowed; 2739 2740 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 2741 NULL, whpx_set_kernel_irqchip, 2742 NULL, NULL); 2743 object_class_property_set_description(oc, "kernel-irqchip", 2744 "Configure WHPX in-kernel irqchip"); 2745 } 2746 2747 static void whpx_accel_instance_init(Object *obj) 2748 { 2749 struct whpx_state *whpx = &whpx_global; 2750 2751 memset(whpx, 0, sizeof(struct whpx_state)); 2752 /* Turn on kernel-irqchip, by default */ 2753 whpx->kernel_irqchip_allowed = true; 2754 } 2755 2756 static const TypeInfo whpx_accel_type = { 2757 .name = ACCEL_CLASS_NAME("whpx"), 2758 .parent = TYPE_ACCEL, 2759 .instance_init = whpx_accel_instance_init, 2760 .class_init = whpx_accel_class_init, 2761 }; 2762 2763 static void whpx_type_init(void) 2764 { 2765 type_register_static(&whpx_accel_type); 2766 } 2767 2768 bool init_whp_dispatch(void) 
2769 { 2770 if (whp_dispatch_initialized) { 2771 return true; 2772 } 2773 2774 if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) { 2775 goto error; 2776 } 2777 2778 if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) { 2779 goto error; 2780 } 2781 2782 assert(load_whp_dispatch_fns(&hWinHvPlatform, 2783 WINHV_PLATFORM_FNS_SUPPLEMENTAL)); 2784 whp_dispatch_initialized = true; 2785 2786 return true; 2787 error: 2788 if (hWinHvPlatform) { 2789 FreeLibrary(hWinHvPlatform); 2790 } 2791 2792 if (hWinHvEmulation) { 2793 FreeLibrary(hWinHvEmulation); 2794 } 2795 2796 return false; 2797 } 2798 2799 type_init(whpx_type_init); 2800