1 /* 2 * QEMU Windows Hypervisor Platform accelerator (WHPX) 3 * 4 * Copyright Microsoft Corp. 2017 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2 or later. 7 * See the COPYING file in the top-level directory. 8 * 9 */ 10 11 #include "qemu/osdep.h" 12 #include "cpu.h" 13 #include "exec/address-spaces.h" 14 #include "exec/ioport.h" 15 #include "gdbstub/helpers.h" 16 #include "qemu/accel.h" 17 #include "sysemu/whpx.h" 18 #include "sysemu/cpus.h" 19 #include "sysemu/runstate.h" 20 #include "qemu/main-loop.h" 21 #include "hw/boards.h" 22 #include "hw/intc/ioapic.h" 23 #include "hw/i386/apic_internal.h" 24 #include "qemu/error-report.h" 25 #include "qapi/error.h" 26 #include "qapi/qapi-types-common.h" 27 #include "qapi/qapi-visit-common.h" 28 #include "migration/blocker.h" 29 #include <winerror.h> 30 31 #include "whpx-internal.h" 32 #include "whpx-accel-ops.h" 33 34 #include <winhvplatform.h> 35 #include <winhvemulation.h> 36 37 #define HYPERV_APIC_BUS_FREQUENCY (200000000ULL) 38 39 static const WHV_REGISTER_NAME whpx_register_names[] = { 40 41 /* X64 General purpose registers */ 42 WHvX64RegisterRax, 43 WHvX64RegisterRcx, 44 WHvX64RegisterRdx, 45 WHvX64RegisterRbx, 46 WHvX64RegisterRsp, 47 WHvX64RegisterRbp, 48 WHvX64RegisterRsi, 49 WHvX64RegisterRdi, 50 WHvX64RegisterR8, 51 WHvX64RegisterR9, 52 WHvX64RegisterR10, 53 WHvX64RegisterR11, 54 WHvX64RegisterR12, 55 WHvX64RegisterR13, 56 WHvX64RegisterR14, 57 WHvX64RegisterR15, 58 WHvX64RegisterRip, 59 WHvX64RegisterRflags, 60 61 /* X64 Segment registers */ 62 WHvX64RegisterEs, 63 WHvX64RegisterCs, 64 WHvX64RegisterSs, 65 WHvX64RegisterDs, 66 WHvX64RegisterFs, 67 WHvX64RegisterGs, 68 WHvX64RegisterLdtr, 69 WHvX64RegisterTr, 70 71 /* X64 Table registers */ 72 WHvX64RegisterIdtr, 73 WHvX64RegisterGdtr, 74 75 /* X64 Control Registers */ 76 WHvX64RegisterCr0, 77 WHvX64RegisterCr2, 78 WHvX64RegisterCr3, 79 WHvX64RegisterCr4, 80 WHvX64RegisterCr8, 81 82 /* X64 Debug Registers */ 83 /* 84 * WHvX64RegisterDr0, 85 * WHvX64RegisterDr1, 86 * WHvX64RegisterDr2, 87 * WHvX64RegisterDr3, 88 * WHvX64RegisterDr6, 89 * WHvX64RegisterDr7, 90 */ 91 92 /* X64 Floating Point and Vector Registers */ 93 WHvX64RegisterXmm0, 94 WHvX64RegisterXmm1, 95 WHvX64RegisterXmm2, 96 WHvX64RegisterXmm3, 97 WHvX64RegisterXmm4, 98 WHvX64RegisterXmm5, 99 WHvX64RegisterXmm6, 100 WHvX64RegisterXmm7, 101 WHvX64RegisterXmm8, 102 WHvX64RegisterXmm9, 103 WHvX64RegisterXmm10, 104 WHvX64RegisterXmm11, 105 WHvX64RegisterXmm12, 106 WHvX64RegisterXmm13, 107 WHvX64RegisterXmm14, 108 WHvX64RegisterXmm15, 109 WHvX64RegisterFpMmx0, 110 WHvX64RegisterFpMmx1, 111 WHvX64RegisterFpMmx2, 112 WHvX64RegisterFpMmx3, 113 WHvX64RegisterFpMmx4, 114 WHvX64RegisterFpMmx5, 115 WHvX64RegisterFpMmx6, 116 WHvX64RegisterFpMmx7, 117 WHvX64RegisterFpControlStatus, 118 WHvX64RegisterXmmControlStatus, 119 120 /* X64 MSRs */ 121 WHvX64RegisterEfer, 122 #ifdef TARGET_X86_64 123 WHvX64RegisterKernelGsBase, 124 #endif 125 WHvX64RegisterApicBase, 126 /* WHvX64RegisterPat, */ 127 WHvX64RegisterSysenterCs, 128 WHvX64RegisterSysenterEip, 129 WHvX64RegisterSysenterEsp, 130 WHvX64RegisterStar, 131 #ifdef TARGET_X86_64 132 WHvX64RegisterLstar, 133 WHvX64RegisterCstar, 134 WHvX64RegisterSfmask, 135 #endif 136 137 /* Interrupt / Event Registers */ 138 /* 139 * WHvRegisterPendingInterruption, 140 * WHvRegisterInterruptState, 141 * WHvRegisterPendingEvent0, 142 * WHvRegisterPendingEvent1 143 * WHvX64RegisterDeliverabilityNotifications, 144 */ 145 }; 146 147 struct whpx_register_set { 148 WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)]; 149 }; 150 151 /* 152 * The current implementation of instruction stepping sets the TF flag 153 * in RFLAGS, causing the CPU to raise an INT1 after each instruction. 154 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception. 155 * 156 * This approach has a few limitations: 157 * 1. Stepping over a PUSHF/SAHF instruction will save the TF flag 158 * along with the other flags, possibly restoring it later. It would 159 * result in another INT1 when the flags are restored, triggering 160 * a stop in gdb that could be cleared by doing another step. 161 * 162 * Stepping over a POPF/LAHF instruction will let it overwrite the 163 * TF flags, ending the stepping mode. 164 * 165 * 2. Stepping over an instruction raising an exception (e.g. INT, DIV, 166 * or anything that could result in a page fault) will save the flags 167 * to the stack, clear the TF flag, and let the guest execute the 168 * handler. Normally, the guest will restore the original flags, 169 * that will continue single-stepping. 170 * 171 * 3. Debuggers running on the guest may wish to set TF to do instruction 172 * stepping. INT1 events generated by it would be intercepted by us, 173 * as long as the gdb is connected to QEMU. 174 * 175 * In practice this means that: 176 * 1. Stepping through flags-modifying instructions may cause gdb to 177 * continue or stop in unexpected places. This will be fully recoverable 178 * and will not crash the target. 179 * 180 * 2. Stepping over an instruction that triggers an exception will step 181 * over the exception handler, not into it. 182 * 183 * 3. Debugging the guest via gdb, while running debugger on the guest 184 * at the same time may lead to unexpected effects. Removing all 185 * breakpoints set via QEMU will prevent any further interference 186 * with the guest-level debuggers. 187 * 188 * The limitations can be addressed as shown below: 189 * 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of 190 * stepping through them. The exact semantics of the instructions is 191 * defined in the "Combined Volume Set of Intel 64 and IA-32 192 * Architectures Software Developer's Manuals", however it involves a 193 * fair amount of corner cases due to compatibility with real mode, 194 * virtual 8086 mode, and differences between 64-bit and 32-bit modes. 195 * 196 * 2. We could step into the guest's exception handlers using the following 197 * sequence: 198 * a. Temporarily enable catching of all exception types via 199 * whpx_set_exception_exit_bitmap(). 200 * b. Once an exception is intercepted, read the IDT/GDT and locate 201 * the original handler. 202 * c. Patch the original handler, injecting an INT3 at the beginning. 203 * d. Update the exception exit bitmap to only catch the 204 * WHvX64ExceptionTypeBreakpointTrap exception. 205 * e. Let the affected CPU run in the exclusive mode. 206 * f. Restore the original handler and the exception exit bitmap. 207 * Note that handling all corner cases related to IDT/GDT is harder 208 * than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a 209 * rough idea. 210 * 211 * 3. In order to properly support guest-level debugging in parallel with 212 * the QEMU-level debugging, we would need to be able to pass some INT1 213 * events to the guest. This could be done via the following methods: 214 * a. Using the WHvRegisterPendingEvent register. As of Windows 21H1, 215 * it seems to only work for interrupts and not software 216 * exceptions. 217 * b. Locating and patching the original handler by parsing IDT/GDT. 218 * This involves relatively complex logic outlined in the previous 219 * paragraph. 220 * c. Emulating the exception invocation (i.e. manually updating RIP, 221 * RFLAGS, and pushing the old values to stack). This is even more 222 * complicated than the previous option, since it involves checking 223 * CPL, gate attributes, and doing various adjustments depending 224 * on the current CPU mode, whether the CPL is changing, etc. 225 */ 226 typedef enum WhpxStepMode { 227 WHPX_STEP_NONE = 0, 228 /* Halt other VCPUs */ 229 WHPX_STEP_EXCLUSIVE, 230 } WhpxStepMode; 231 232 struct AccelCPUState { 233 WHV_EMULATOR_HANDLE emulator; 234 bool window_registered; 235 bool interruptable; 236 bool ready_for_pic_interrupt; 237 uint64_t tpr; 238 uint64_t apic_base; 239 bool interruption_pending; 240 241 /* Must be the last field as it may have a tail */ 242 WHV_RUN_VP_EXIT_CONTEXT exit_ctx; 243 }; 244 245 static bool whpx_allowed; 246 static bool whp_dispatch_initialized; 247 static HMODULE hWinHvPlatform, hWinHvEmulation; 248 static uint32_t max_vcpu_index; 249 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap; 250 251 struct whpx_state whpx_global; 252 struct WHPDispatch whp_dispatch; 253 254 static bool whpx_has_xsave(void) 255 { 256 return whpx_xsave_cap.XsaveSupport; 257 } 258 259 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86, 260 int r86) 261 { 262 WHV_X64_SEGMENT_REGISTER hs; 263 unsigned flags = qs->flags; 264 265 hs.Base = qs->base; 266 hs.Limit = qs->limit; 267 hs.Selector = qs->selector; 268 269 if (v86) { 270 hs.Attributes = 0; 271 hs.SegmentType = 3; 272 hs.Present = 1; 273 hs.DescriptorPrivilegeLevel = 3; 274 hs.NonSystemSegment = 1; 275 276 } else { 277 hs.Attributes = (flags >> DESC_TYPE_SHIFT); 278 279 if (r86) { 280 /* hs.Base &= 0xfffff; */ 281 } 282 } 283 284 return hs; 285 } 286 287 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs) 288 { 289 SegmentCache qs; 290 291 qs.base = hs->Base; 292 qs.limit = hs->Limit; 293 qs.selector = hs->Selector; 294 295 qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT; 296 297 return qs; 298 } 299 300 /* X64 Extended Control Registers */ 301 static void whpx_set_xcrs(CPUState *cpu) 302 { 303 CPUX86State *env = cpu_env(cpu); 304 HRESULT hr; 305 struct whpx_state *whpx = &whpx_global; 306 WHV_REGISTER_VALUE xcr0; 307 WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; 308 309 if (!whpx_has_xsave()) { 310 return; 311 } 312 313 /* Only xcr0 is supported by the hypervisor currently */ 314 xcr0.Reg64 = env->xcr0; 315 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 316 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); 317 if (FAILED(hr)) { 318 error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr); 319 } 320 } 321 322 static int whpx_set_tsc(CPUState *cpu) 323 { 324 CPUX86State *env = cpu_env(cpu); 325 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 326 WHV_REGISTER_VALUE tsc_val; 327 HRESULT hr; 328 struct whpx_state *whpx = &whpx_global; 329 330 /* 331 * Suspend the partition prior to setting the TSC to reduce the variance 332 * in TSC across vCPUs. When the first vCPU runs post suspend, the 333 * partition is automatically resumed. 334 */ 335 if (whp_dispatch.WHvSuspendPartitionTime) { 336 337 /* 338 * Unable to suspend partition while setting TSC is not a fatal 339 * error. It just increases the likelihood of TSC variance between 340 * vCPUs and some guest OS are able to handle that just fine. 341 */ 342 hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition); 343 if (FAILED(hr)) { 344 warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr); 345 } 346 } 347 348 tsc_val.Reg64 = env->tsc; 349 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 350 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 351 if (FAILED(hr)) { 352 error_report("WHPX: Failed to set TSC, hr=%08lx", hr); 353 return -1; 354 } 355 356 return 0; 357 } 358 359 /* 360 * The CR8 register in the CPU is mapped to the TPR register of the APIC, 361 * however, they use a slightly different encoding. Specifically: 362 * 363 * APIC.TPR[bits 7:4] = CR8[bits 3:0] 364 * 365 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64 366 * and IA-32 Architectures Software Developer's Manual. 367 * 368 * The functions below translate the value of CR8 to TPR and vice versa. 369 */ 370 371 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr) 372 { 373 return tpr >> 4; 374 } 375 376 static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8) 377 { 378 return cr8 << 4; 379 } 380 381 static void whpx_set_registers(CPUState *cpu, int level) 382 { 383 struct whpx_state *whpx = &whpx_global; 384 AccelCPUState *vcpu = cpu->accel; 385 X86CPU *x86_cpu = X86_CPU(cpu); 386 CPUX86State *env = &x86_cpu->env; 387 struct whpx_register_set vcxt; 388 HRESULT hr; 389 int idx; 390 int idx_next; 391 int i; 392 int v86, r86; 393 394 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 395 396 /* 397 * Following MSRs have side effects on the guest or are too heavy for 398 * runtime. Limit them to full state update. 399 */ 400 if (level >= WHPX_SET_RESET_STATE) { 401 whpx_set_tsc(cpu); 402 } 403 404 memset(&vcxt, 0, sizeof(struct whpx_register_set)); 405 406 v86 = (env->eflags & VM_MASK); 407 r86 = !(env->cr[0] & CR0_PE_MASK); 408 409 vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 410 vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state); 411 412 idx = 0; 413 414 /* Indexes for first 16 registers match between HV and QEMU definitions */ 415 idx_next = 16; 416 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 417 vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx]; 418 } 419 idx = idx_next; 420 421 /* Same goes for RIP and RFLAGS */ 422 assert(whpx_register_names[idx] == WHvX64RegisterRip); 423 vcxt.values[idx++].Reg64 = env->eip; 424 425 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 426 vcxt.values[idx++].Reg64 = env->eflags; 427 428 /* Translate 6+4 segment registers. HV and QEMU order matches */ 429 assert(idx == WHvX64RegisterEs); 430 for (i = 0; i < 6; i += 1, idx += 1) { 431 vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86); 432 } 433 434 assert(idx == WHvX64RegisterLdtr); 435 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0); 436 437 assert(idx == WHvX64RegisterTr); 438 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0); 439 440 assert(idx == WHvX64RegisterIdtr); 441 vcxt.values[idx].Table.Base = env->idt.base; 442 vcxt.values[idx].Table.Limit = env->idt.limit; 443 idx += 1; 444 445 assert(idx == WHvX64RegisterGdtr); 446 vcxt.values[idx].Table.Base = env->gdt.base; 447 vcxt.values[idx].Table.Limit = env->gdt.limit; 448 idx += 1; 449 450 /* CR0, 2, 3, 4, 8 */ 451 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 452 vcxt.values[idx++].Reg64 = env->cr[0]; 453 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 454 vcxt.values[idx++].Reg64 = env->cr[2]; 455 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 456 vcxt.values[idx++].Reg64 = env->cr[3]; 457 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 458 vcxt.values[idx++].Reg64 = env->cr[4]; 459 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 460 vcxt.values[idx++].Reg64 = vcpu->tpr; 461 462 /* 8 Debug Registers - Skipped */ 463 464 /* 465 * Extended control registers needs to be handled separately depending 466 * on whether xsave is supported/enabled or not. 467 */ 468 whpx_set_xcrs(cpu); 469 470 /* 16 XMM registers */ 471 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 472 idx_next = idx + 16; 473 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 474 vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0); 475 vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1); 476 } 477 idx = idx_next; 478 479 /* 8 FP registers */ 480 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 481 for (i = 0; i < 8; i += 1, idx += 1) { 482 vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0); 483 /* vcxt.values[idx].Fp.AsUINT128.High64 = 484 env->fpregs[i].mmx.MMX_Q(1); 485 */ 486 } 487 488 /* FP control status register */ 489 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 490 vcxt.values[idx].FpControlStatus.FpControl = env->fpuc; 491 vcxt.values[idx].FpControlStatus.FpStatus = 492 (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; 493 vcxt.values[idx].FpControlStatus.FpTag = 0; 494 for (i = 0; i < 8; ++i) { 495 vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i; 496 } 497 vcxt.values[idx].FpControlStatus.Reserved = 0; 498 vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop; 499 vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip; 500 idx += 1; 501 502 /* XMM control status register */ 503 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 504 vcxt.values[idx].XmmControlStatus.LastFpRdp = 0; 505 vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr; 506 vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff; 507 idx += 1; 508 509 /* MSRs */ 510 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 511 vcxt.values[idx++].Reg64 = env->efer; 512 #ifdef TARGET_X86_64 513 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 514 vcxt.values[idx++].Reg64 = env->kernelgsbase; 515 #endif 516 517 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 518 vcxt.values[idx++].Reg64 = vcpu->apic_base; 519 520 /* WHvX64RegisterPat - Skipped */ 521 522 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 523 vcxt.values[idx++].Reg64 = env->sysenter_cs; 524 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 525 vcxt.values[idx++].Reg64 = env->sysenter_eip; 526 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 527 vcxt.values[idx++].Reg64 = env->sysenter_esp; 528 assert(whpx_register_names[idx] == WHvX64RegisterStar); 529 vcxt.values[idx++].Reg64 = env->star; 530 #ifdef TARGET_X86_64 531 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 532 vcxt.values[idx++].Reg64 = env->lstar; 533 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 534 vcxt.values[idx++].Reg64 = env->cstar; 535 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 536 vcxt.values[idx++].Reg64 = env->fmask; 537 #endif 538 539 /* Interrupt / Event Registers - Skipped */ 540 541 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 542 543 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 544 whpx->partition, cpu->cpu_index, 545 whpx_register_names, 546 RTL_NUMBER_OF(whpx_register_names), 547 &vcxt.values[0]); 548 549 if (FAILED(hr)) { 550 error_report("WHPX: Failed to set virtual processor context, hr=%08lx", 551 hr); 552 } 553 554 return; 555 } 556 557 static int whpx_get_tsc(CPUState *cpu) 558 { 559 CPUX86State *env = cpu_env(cpu); 560 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 561 WHV_REGISTER_VALUE tsc_val; 562 HRESULT hr; 563 struct whpx_state *whpx = &whpx_global; 564 565 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 566 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 567 if (FAILED(hr)) { 568 error_report("WHPX: Failed to get TSC, hr=%08lx", hr); 569 return -1; 570 } 571 572 env->tsc = tsc_val.Reg64; 573 return 0; 574 } 575 576 /* X64 Extended Control Registers */ 577 static void whpx_get_xcrs(CPUState *cpu) 578 { 579 CPUX86State *env = cpu_env(cpu); 580 HRESULT hr; 581 struct whpx_state *whpx = &whpx_global; 582 WHV_REGISTER_VALUE xcr0; 583 WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; 584 585 if (!whpx_has_xsave()) { 586 return; 587 } 588 589 /* Only xcr0 is supported by the hypervisor currently */ 590 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 591 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); 592 if (FAILED(hr)) { 593 error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr); 594 return; 595 } 596 597 env->xcr0 = xcr0.Reg64; 598 } 599 600 static void whpx_get_registers(CPUState *cpu) 601 { 602 struct whpx_state *whpx = &whpx_global; 603 AccelCPUState *vcpu = cpu->accel; 604 X86CPU *x86_cpu = X86_CPU(cpu); 605 CPUX86State *env = &x86_cpu->env; 606 struct whpx_register_set vcxt; 607 uint64_t tpr, apic_base; 608 HRESULT hr; 609 int idx; 610 int idx_next; 611 int i; 612 613 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 614 615 if (!env->tsc_valid) { 616 whpx_get_tsc(cpu); 617 env->tsc_valid = !runstate_is_running(); 618 } 619 620 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 621 whpx->partition, cpu->cpu_index, 622 whpx_register_names, 623 RTL_NUMBER_OF(whpx_register_names), 624 &vcxt.values[0]); 625 if (FAILED(hr)) { 626 error_report("WHPX: Failed to get virtual processor context, hr=%08lx", 627 hr); 628 } 629 630 if (whpx_apic_in_platform()) { 631 /* 632 * Fetch the TPR value from the emulated APIC. It may get overwritten 633 * below with the value from CR8 returned by 634 * WHvGetVirtualProcessorRegisters(). 635 */ 636 whpx_apic_get(x86_cpu->apic_state); 637 vcpu->tpr = whpx_apic_tpr_to_cr8( 638 cpu_get_apic_tpr(x86_cpu->apic_state)); 639 } 640 641 idx = 0; 642 643 /* Indexes for first 16 registers match between HV and QEMU definitions */ 644 idx_next = 16; 645 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 646 env->regs[idx] = vcxt.values[idx].Reg64; 647 } 648 idx = idx_next; 649 650 /* Same goes for RIP and RFLAGS */ 651 assert(whpx_register_names[idx] == WHvX64RegisterRip); 652 env->eip = vcxt.values[idx++].Reg64; 653 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 654 env->eflags = vcxt.values[idx++].Reg64; 655 656 /* Translate 6+4 segment registers. HV and QEMU order matches */ 657 assert(idx == WHvX64RegisterEs); 658 for (i = 0; i < 6; i += 1, idx += 1) { 659 env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment); 660 } 661 662 assert(idx == WHvX64RegisterLdtr); 663 env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment); 664 assert(idx == WHvX64RegisterTr); 665 env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment); 666 assert(idx == WHvX64RegisterIdtr); 667 env->idt.base = vcxt.values[idx].Table.Base; 668 env->idt.limit = vcxt.values[idx].Table.Limit; 669 idx += 1; 670 assert(idx == WHvX64RegisterGdtr); 671 env->gdt.base = vcxt.values[idx].Table.Base; 672 env->gdt.limit = vcxt.values[idx].Table.Limit; 673 idx += 1; 674 675 /* CR0, 2, 3, 4, 8 */ 676 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 677 env->cr[0] = vcxt.values[idx++].Reg64; 678 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 679 env->cr[2] = vcxt.values[idx++].Reg64; 680 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 681 env->cr[3] = vcxt.values[idx++].Reg64; 682 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 683 env->cr[4] = vcxt.values[idx++].Reg64; 684 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 685 tpr = vcxt.values[idx++].Reg64; 686 if (tpr != vcpu->tpr) { 687 vcpu->tpr = tpr; 688 cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr)); 689 } 690 691 /* 8 Debug Registers - Skipped */ 692 693 /* 694 * Extended control registers needs to be handled separately depending 695 * on whether xsave is supported/enabled or not. 696 */ 697 whpx_get_xcrs(cpu); 698 699 /* 16 XMM registers */ 700 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 701 idx_next = idx + 16; 702 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 703 env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64; 704 env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64; 705 } 706 idx = idx_next; 707 708 /* 8 FP registers */ 709 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 710 for (i = 0; i < 8; i += 1, idx += 1) { 711 env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64; 712 /* env->fpregs[i].mmx.MMX_Q(1) = 713 vcxt.values[idx].Fp.AsUINT128.High64; 714 */ 715 } 716 717 /* FP control status register */ 718 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 719 env->fpuc = vcxt.values[idx].FpControlStatus.FpControl; 720 env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7; 721 env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800; 722 for (i = 0; i < 8; ++i) { 723 env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1); 724 } 725 env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp; 726 env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip; 727 idx += 1; 728 729 /* XMM control status register */ 730 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 731 env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl; 732 idx += 1; 733 734 /* MSRs */ 735 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 736 env->efer = vcxt.values[idx++].Reg64; 737 #ifdef TARGET_X86_64 738 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 739 env->kernelgsbase = vcxt.values[idx++].Reg64; 740 #endif 741 742 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 743 apic_base = vcxt.values[idx++].Reg64; 744 if (apic_base != vcpu->apic_base) { 745 vcpu->apic_base = apic_base; 746 cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base); 747 } 748 749 /* WHvX64RegisterPat - Skipped */ 750 751 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 752 env->sysenter_cs = vcxt.values[idx++].Reg64; 753 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 754 env->sysenter_eip = vcxt.values[idx++].Reg64; 755 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 756 env->sysenter_esp = vcxt.values[idx++].Reg64; 757 assert(whpx_register_names[idx] == WHvX64RegisterStar); 758 env->star = vcxt.values[idx++].Reg64; 759 #ifdef TARGET_X86_64 760 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 761 env->lstar = vcxt.values[idx++].Reg64; 762 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 763 env->cstar = vcxt.values[idx++].Reg64; 764 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 765 env->fmask = vcxt.values[idx++].Reg64; 766 #endif 767 768 /* Interrupt / Event Registers - Skipped */ 769 770 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 771 772 if (whpx_apic_in_platform()) { 773 whpx_apic_get(x86_cpu->apic_state); 774 } 775 776 x86_update_hflags(env); 777 778 return; 779 } 780 781 static HRESULT CALLBACK whpx_emu_ioport_callback( 782 void *ctx, 783 WHV_EMULATOR_IO_ACCESS_INFO *IoAccess) 784 { 785 MemTxAttrs attrs = { 0 }; 786 address_space_rw(&address_space_io, IoAccess->Port, attrs, 787 &IoAccess->Data, IoAccess->AccessSize, 788 IoAccess->Direction); 789 return S_OK; 790 } 791 792 static HRESULT CALLBACK whpx_emu_mmio_callback( 793 void *ctx, 794 WHV_EMULATOR_MEMORY_ACCESS_INFO *ma) 795 { 796 cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize, 797 ma->Direction); 798 return S_OK; 799 } 800 801 static HRESULT CALLBACK whpx_emu_getreg_callback( 802 void *ctx, 803 const WHV_REGISTER_NAME *RegisterNames, 804 UINT32 RegisterCount, 805 WHV_REGISTER_VALUE *RegisterValues) 806 { 807 HRESULT hr; 808 struct whpx_state *whpx = &whpx_global; 809 CPUState *cpu = (CPUState *)ctx; 810 811 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 812 whpx->partition, cpu->cpu_index, 813 RegisterNames, RegisterCount, 814 RegisterValues); 815 if (FAILED(hr)) { 816 error_report("WHPX: Failed to get virtual processor registers," 817 " hr=%08lx", hr); 818 } 819 820 return hr; 821 } 822 823 static HRESULT CALLBACK whpx_emu_setreg_callback( 824 void *ctx, 825 const WHV_REGISTER_NAME *RegisterNames, 826 UINT32 RegisterCount, 827 const WHV_REGISTER_VALUE *RegisterValues) 828 { 829 HRESULT hr; 830 struct whpx_state *whpx = &whpx_global; 831 CPUState *cpu = (CPUState *)ctx; 832 833 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 834 whpx->partition, cpu->cpu_index, 835 RegisterNames, RegisterCount, 836 RegisterValues); 837 if (FAILED(hr)) { 838 error_report("WHPX: Failed to set virtual processor registers," 839 " hr=%08lx", hr); 840 } 841 842 /* 843 * The emulator just successfully wrote the register state. We clear the 844 * dirty state so we avoid the double write on resume of the VP. 845 */ 846 cpu->vcpu_dirty = false; 847 848 return hr; 849 } 850 851 static HRESULT CALLBACK whpx_emu_translate_callback( 852 void *ctx, 853 WHV_GUEST_VIRTUAL_ADDRESS Gva, 854 WHV_TRANSLATE_GVA_FLAGS TranslateFlags, 855 WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult, 856 WHV_GUEST_PHYSICAL_ADDRESS *Gpa) 857 { 858 HRESULT hr; 859 struct whpx_state *whpx = &whpx_global; 860 CPUState *cpu = (CPUState *)ctx; 861 WHV_TRANSLATE_GVA_RESULT res; 862 863 hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index, 864 Gva, TranslateFlags, &res, Gpa); 865 if (FAILED(hr)) { 866 error_report("WHPX: Failed to translate GVA, hr=%08lx", hr); 867 } else { 868 *TranslationResult = res.ResultCode; 869 } 870 871 return hr; 872 } 873 874 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = { 875 .Size = sizeof(WHV_EMULATOR_CALLBACKS), 876 .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback, 877 .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback, 878 .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback, 879 .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback, 880 .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback, 881 }; 882 883 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx) 884 { 885 HRESULT hr; 886 AccelCPUState *vcpu = cpu->accel; 887 WHV_EMULATOR_STATUS emu_status; 888 889 hr = whp_dispatch.WHvEmulatorTryMmioEmulation( 890 vcpu->emulator, cpu, 891 &vcpu->exit_ctx.VpContext, ctx, 892 &emu_status); 893 if (FAILED(hr)) { 894 error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr); 895 return -1; 896 } 897 898 if (!emu_status.EmulationSuccessful) { 899 error_report("WHPX: Failed to emulate MMIO access with" 900 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 901 return -1; 902 } 903 904 return 0; 905 } 906 907 static int whpx_handle_portio(CPUState *cpu, 908 WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx) 909 { 910 HRESULT hr; 911 AccelCPUState *vcpu = cpu->accel; 912 WHV_EMULATOR_STATUS emu_status; 913 914 hr = whp_dispatch.WHvEmulatorTryIoEmulation( 915 vcpu->emulator, cpu, 916 &vcpu->exit_ctx.VpContext, ctx, 917 &emu_status); 918 if (FAILED(hr)) { 919 error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr); 920 return -1; 921 } 922 923 if (!emu_status.EmulationSuccessful) { 924 error_report("WHPX: Failed to emulate PortIO access with" 925 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 926 return -1; 927 } 928 929 return 0; 930 } 931 932 /* 933 * Controls whether we should intercept various exceptions on the guest, 934 * namely breakpoint/single-step events. 935 * 936 * The 'exceptions' argument accepts a bitmask, e.g: 937 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...) 938 */ 939 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions) 940 { 941 struct whpx_state *whpx = &whpx_global; 942 WHV_PARTITION_PROPERTY prop = { 0, }; 943 HRESULT hr; 944 945 if (exceptions == whpx->exception_exit_bitmap) { 946 return S_OK; 947 } 948 949 prop.ExceptionExitBitmap = exceptions; 950 951 hr = whp_dispatch.WHvSetPartitionProperty( 952 whpx->partition, 953 WHvPartitionPropertyCodeExceptionExitBitmap, 954 &prop, 955 sizeof(WHV_PARTITION_PROPERTY)); 956 957 if (SUCCEEDED(hr)) { 958 whpx->exception_exit_bitmap = exceptions; 959 } 960 961 return hr; 962 } 963 964 965 /* 966 * This function is called before/after stepping over a single instruction. 967 * It will update the CPU registers to arm/disarm the instruction stepping 968 * accordingly. 969 */ 970 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu, 971 bool set, 972 uint64_t *exit_context_rflags) 973 { 974 WHV_REGISTER_NAME reg_name; 975 WHV_REGISTER_VALUE reg_value; 976 HRESULT hr; 977 struct whpx_state *whpx = &whpx_global; 978 979 /* 980 * If we are trying to step over a single instruction, we need to set the 981 * TF bit in rflags. Otherwise, clear it. 982 */ 983 reg_name = WHvX64RegisterRflags; 984 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 985 whpx->partition, 986 cpu->cpu_index, 987 ®_name, 988 1, 989 ®_value); 990 991 if (FAILED(hr)) { 992 error_report("WHPX: Failed to get rflags, hr=%08lx", hr); 993 return hr; 994 } 995 996 if (exit_context_rflags) { 997 assert(*exit_context_rflags == reg_value.Reg64); 998 } 999 1000 if (set) { 1001 /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */ 1002 reg_value.Reg64 |= TF_MASK; 1003 } else { 1004 reg_value.Reg64 &= ~TF_MASK; 1005 } 1006 1007 if (exit_context_rflags) { 1008 *exit_context_rflags = reg_value.Reg64; 1009 } 1010 1011 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1012 whpx->partition, 1013 cpu->cpu_index, 1014 ®_name, 1015 1, 1016 ®_value); 1017 1018 if (FAILED(hr)) { 1019 error_report("WHPX: Failed to set rflags," 1020 " hr=%08lx", 1021 hr); 1022 return hr; 1023 } 1024 1025 reg_name = WHvRegisterInterruptState; 1026 reg_value.Reg64 = 0; 1027 1028 /* Suspend delivery of hardware interrupts during single-stepping. */ 1029 reg_value.InterruptState.InterruptShadow = set != 0; 1030 1031 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1032 whpx->partition, 1033 cpu->cpu_index, 1034 ®_name, 1035 1, 1036 ®_value); 1037 1038 if (FAILED(hr)) { 1039 error_report("WHPX: Failed to set InterruptState," 1040 " hr=%08lx", 1041 hr); 1042 return hr; 1043 } 1044 1045 if (!set) { 1046 /* 1047 * We have just finished stepping over a single instruction, 1048 * and intercepted the INT1 generated by it. 1049 * We need to now hide the INT1 from the guest, 1050 * as it would not be expecting it. 1051 */ 1052 1053 reg_name = WHvX64RegisterPendingDebugException; 1054 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1055 whpx->partition, 1056 cpu->cpu_index, 1057 ®_name, 1058 1, 1059 ®_value); 1060 1061 if (FAILED(hr)) { 1062 error_report("WHPX: Failed to get pending debug exceptions," 1063 "hr=%08lx", hr); 1064 return hr; 1065 } 1066 1067 if (reg_value.PendingDebugException.SingleStep) { 1068 reg_value.PendingDebugException.SingleStep = 0; 1069 1070 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1071 whpx->partition, 1072 cpu->cpu_index, 1073 ®_name, 1074 1, 1075 ®_value); 1076 1077 if (FAILED(hr)) { 1078 error_report("WHPX: Failed to clear pending debug exceptions," 1079 "hr=%08lx", hr); 1080 return hr; 1081 } 1082 } 1083 1084 } 1085 1086 return S_OK; 1087 } 1088 1089 /* Tries to find a breakpoint at the specified address. */ 1090 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address) 1091 { 1092 struct whpx_state *whpx = &whpx_global; 1093 int i; 1094 1095 if (whpx->breakpoints.breakpoints) { 1096 for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) { 1097 if (address == whpx->breakpoints.breakpoints->data[i].address) { 1098 return &whpx->breakpoints.breakpoints->data[i]; 1099 } 1100 } 1101 } 1102 1103 return NULL; 1104 } 1105 1106 /* 1107 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for 1108 * debugging user-mode applications. Since the WHPX API does not offer 1109 * an easy way to pass the intercepted exception back to the guest, we 1110 * resort to using INT1 instead, and let the guest always handle INT3. 1111 */ 1112 static const uint8_t whpx_breakpoint_instruction = 0xF1; 1113 1114 /* 1115 * The WHPX QEMU backend implements breakpoints by writing the INT1 1116 * instruction into memory (ignoring the DRx registers). This raises a few 1117 * issues that need to be carefully handled: 1118 * 1119 * 1. Although unlikely, other parts of QEMU may set multiple breakpoints 1120 * at the same location, and later remove them in arbitrary order. 1121 * This should not cause memory corruption, and should only remove the 1122 * physical breakpoint instruction when the last QEMU breakpoint is gone. 1123 * 1124 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid 1125 * physical location. Hence, physically adding/removing a breakpoint can 1126 * theoretically fail at any time. We need to keep track of it. 1127 * 1128 * The function below rebuilds a list of low-level breakpoints (one per 1129 * address, tracking the original instruction and any errors) from the list of 1130 * high-level breakpoints (set via cpu_breakpoint_insert()). 1131 * 1132 * In order to optimize performance, this function stores the list of 1133 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the 1134 * low-level ones, so that it won't be re-invoked until these breakpoints 1135 * change. 1136 * 1137 * Note that this function decides which breakpoints should be inserted into, 1138 * memory, but doesn't actually do it. The memory accessing is done in 1139 * whpx_apply_breakpoints(). 1140 */ 1141 static void whpx_translate_cpu_breakpoints( 1142 struct whpx_breakpoints *breakpoints, 1143 CPUState *cpu, 1144 int cpu_breakpoint_count) 1145 { 1146 CPUBreakpoint *bp; 1147 int cpu_bp_index = 0; 1148 1149 breakpoints->original_addresses = 1150 g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count); 1151 1152 breakpoints->original_address_count = cpu_breakpoint_count; 1153 1154 int max_breakpoints = cpu_breakpoint_count + 1155 (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0); 1156 1157 struct whpx_breakpoint_collection *new_breakpoints = 1158 g_malloc0(sizeof(struct whpx_breakpoint_collection) 1159 + max_breakpoints * sizeof(struct whpx_breakpoint)); 1160 1161 new_breakpoints->allocated = max_breakpoints; 1162 new_breakpoints->used = 0; 1163 1164 /* 1165 * 1. Preserve all old breakpoints that could not be automatically 1166 * cleared when the CPU got stopped. 1167 */ 1168 if (breakpoints->breakpoints) { 1169 int i; 1170 for (i = 0; i < breakpoints->breakpoints->used; i++) { 1171 if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) { 1172 new_breakpoints->data[new_breakpoints->used++] = 1173 breakpoints->breakpoints->data[i]; 1174 } 1175 } 1176 } 1177 1178 /* 2. Map all CPU breakpoints to WHPX breakpoints */ 1179 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1180 int i; 1181 bool found = false; 1182 1183 /* This will be used to detect changed CPU breakpoints later. */ 1184 breakpoints->original_addresses[cpu_bp_index++] = bp->pc; 1185 1186 for (i = 0; i < new_breakpoints->used; i++) { 1187 /* 1188 * WARNING: This loop has O(N^2) complexity, where N is the 1189 * number of breakpoints. It should not be a bottleneck in 1190 * real-world scenarios, since it only needs to run once after 1191 * the breakpoints have been modified. 1192 * If this ever becomes a concern, it can be optimized by storing 1193 * high-level breakpoint objects in a tree or hash map. 1194 */ 1195 1196 if (new_breakpoints->data[i].address == bp->pc) { 1197 /* There was already a breakpoint at this address. */ 1198 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) { 1199 new_breakpoints->data[i].state = WHPX_BP_SET; 1200 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) { 1201 new_breakpoints->data[i].state = WHPX_BP_SET_PENDING; 1202 } 1203 1204 found = true; 1205 break; 1206 } 1207 } 1208 1209 if (!found && new_breakpoints->used < new_breakpoints->allocated) { 1210 /* No WHPX breakpoint at this address. Create one. */ 1211 new_breakpoints->data[new_breakpoints->used].address = bp->pc; 1212 new_breakpoints->data[new_breakpoints->used].state = 1213 WHPX_BP_SET_PENDING; 1214 new_breakpoints->used++; 1215 } 1216 } 1217 1218 /* 1219 * Free the previous breakpoint list. This can be optimized by keeping 1220 * it as shadow buffer for the next computation instead of freeing 1221 * it immediately. 1222 */ 1223 g_free(breakpoints->breakpoints); 1224 1225 breakpoints->breakpoints = new_breakpoints; 1226 } 1227 1228 /* 1229 * Physically inserts/removes the breakpoints by reading and writing the 1230 * physical memory, keeping a track of the failed attempts. 1231 * 1232 * Passing resuming=true will try to set all previously unset breakpoints. 1233 * Passing resuming=false will remove all inserted ones. 1234 */ 1235 static void whpx_apply_breakpoints( 1236 struct whpx_breakpoint_collection *breakpoints, 1237 CPUState *cpu, 1238 bool resuming) 1239 { 1240 int i, rc; 1241 if (!breakpoints) { 1242 return; 1243 } 1244 1245 for (i = 0; i < breakpoints->used; i++) { 1246 /* Decide what to do right now based on the last known state. */ 1247 WhpxBreakpointState state = breakpoints->data[i].state; 1248 switch (state) { 1249 case WHPX_BP_CLEARED: 1250 if (resuming) { 1251 state = WHPX_BP_SET_PENDING; 1252 } 1253 break; 1254 case WHPX_BP_SET_PENDING: 1255 if (!resuming) { 1256 state = WHPX_BP_CLEARED; 1257 } 1258 break; 1259 case WHPX_BP_SET: 1260 if (!resuming) { 1261 state = WHPX_BP_CLEAR_PENDING; 1262 } 1263 break; 1264 case WHPX_BP_CLEAR_PENDING: 1265 if (resuming) { 1266 state = WHPX_BP_SET; 1267 } 1268 break; 1269 } 1270 1271 if (state == WHPX_BP_SET_PENDING) { 1272 /* Remember the original instruction. */ 1273 rc = cpu_memory_rw_debug(cpu, 1274 breakpoints->data[i].address, 1275 &breakpoints->data[i].original_instruction, 1276 1, 1277 false); 1278 1279 if (!rc) { 1280 /* Write the breakpoint instruction. */ 1281 rc = cpu_memory_rw_debug(cpu, 1282 breakpoints->data[i].address, 1283 (void *)&whpx_breakpoint_instruction, 1284 1, 1285 true); 1286 } 1287 1288 if (!rc) { 1289 state = WHPX_BP_SET; 1290 } 1291 1292 } 1293 1294 if (state == WHPX_BP_CLEAR_PENDING) { 1295 /* Restore the original instruction. */ 1296 rc = cpu_memory_rw_debug(cpu, 1297 breakpoints->data[i].address, 1298 &breakpoints->data[i].original_instruction, 1299 1, 1300 true); 1301 1302 if (!rc) { 1303 state = WHPX_BP_CLEARED; 1304 } 1305 } 1306 1307 breakpoints->data[i].state = state; 1308 } 1309 } 1310 1311 /* 1312 * This function is called when the a VCPU is about to start and no other 1313 * VCPUs have been started so far. Since the VCPU start order could be 1314 * arbitrary, it doesn't have to be VCPU#0. 1315 * 1316 * It is used to commit the breakpoints into memory, and configure WHPX 1317 * to intercept debug exceptions. 1318 * 1319 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or 1320 * more VCPUs are already running, so this is the best place to do it. 1321 */ 1322 static int whpx_first_vcpu_starting(CPUState *cpu) 1323 { 1324 struct whpx_state *whpx = &whpx_global; 1325 HRESULT hr; 1326 1327 g_assert(bql_locked()); 1328 1329 if (!QTAILQ_EMPTY(&cpu->breakpoints) || 1330 (whpx->breakpoints.breakpoints && 1331 whpx->breakpoints.breakpoints->used)) { 1332 CPUBreakpoint *bp; 1333 int i = 0; 1334 bool update_pending = false; 1335 1336 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1337 if (i >= whpx->breakpoints.original_address_count || 1338 bp->pc != whpx->breakpoints.original_addresses[i]) { 1339 update_pending = true; 1340 } 1341 1342 i++; 1343 } 1344 1345 if (i != whpx->breakpoints.original_address_count) { 1346 update_pending = true; 1347 } 1348 1349 if (update_pending) { 1350 /* 1351 * The CPU breakpoints have changed since the last call to 1352 * whpx_translate_cpu_breakpoints(). WHPX breakpoints must 1353 * now be recomputed. 1354 */ 1355 whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i); 1356 } 1357 1358 /* Actually insert the breakpoints into the memory. */ 1359 whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true); 1360 } 1361 1362 uint64_t exception_mask; 1363 if (whpx->step_pending || 1364 (whpx->breakpoints.breakpoints && 1365 whpx->breakpoints.breakpoints->used)) { 1366 /* 1367 * We are either attempting to single-step one or more CPUs, or 1368 * have one or more breakpoints enabled. Both require intercepting 1369 * the WHvX64ExceptionTypeBreakpointTrap exception. 1370 */ 1371 1372 exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault; 1373 } else { 1374 /* Let the guest handle all exceptions. */ 1375 exception_mask = 0; 1376 } 1377 1378 hr = whpx_set_exception_exit_bitmap(exception_mask); 1379 if (!SUCCEEDED(hr)) { 1380 error_report("WHPX: Failed to update exception exit mask," 1381 "hr=%08lx.", hr); 1382 return 1; 1383 } 1384 1385 return 0; 1386 } 1387 1388 /* 1389 * This function is called when the last VCPU has finished running. 1390 * It is used to remove any previously set breakpoints from memory. 1391 */ 1392 static int whpx_last_vcpu_stopping(CPUState *cpu) 1393 { 1394 whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false); 1395 return 0; 1396 } 1397 1398 /* Returns the address of the next instruction that is about to be executed. */ 1399 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid) 1400 { 1401 if (cpu->vcpu_dirty) { 1402 /* The CPU registers have been modified by other parts of QEMU. */ 1403 CPUArchState *env = cpu_env(cpu); 1404 return env->eip; 1405 } else if (exit_context_valid) { 1406 /* 1407 * The CPU registers have not been modified by neither other parts 1408 * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters(). 1409 * This is the most common case. 1410 */ 1411 AccelCPUState *vcpu = cpu->accel; 1412 return vcpu->exit_ctx.VpContext.Rip; 1413 } else { 1414 /* 1415 * The CPU registers have been modified by a call to 1416 * WHvSetVirtualProcessorRegisters() and must be re-queried from 1417 * the target. 1418 */ 1419 WHV_REGISTER_VALUE reg_value; 1420 WHV_REGISTER_NAME reg_name = WHvX64RegisterRip; 1421 HRESULT hr; 1422 struct whpx_state *whpx = &whpx_global; 1423 1424 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1425 whpx->partition, 1426 cpu->cpu_index, 1427 ®_name, 1428 1, 1429 ®_value); 1430 1431 if (FAILED(hr)) { 1432 error_report("WHPX: Failed to get PC, hr=%08lx", hr); 1433 return 0; 1434 } 1435 1436 return reg_value.Reg64; 1437 } 1438 } 1439 1440 static int whpx_handle_halt(CPUState *cpu) 1441 { 1442 CPUX86State *env = cpu_env(cpu); 1443 int ret = 0; 1444 1445 bql_lock(); 1446 if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1447 (env->eflags & IF_MASK)) && 1448 !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1449 cpu->exception_index = EXCP_HLT; 1450 cpu->halted = true; 1451 ret = 1; 1452 } 1453 bql_unlock(); 1454 1455 return ret; 1456 } 1457 1458 static void whpx_vcpu_pre_run(CPUState *cpu) 1459 { 1460 HRESULT hr; 1461 struct whpx_state *whpx = &whpx_global; 1462 AccelCPUState *vcpu = cpu->accel; 1463 X86CPU *x86_cpu = X86_CPU(cpu); 1464 CPUX86State *env = &x86_cpu->env; 1465 int irq; 1466 uint8_t tpr; 1467 WHV_X64_PENDING_INTERRUPTION_REGISTER new_int; 1468 UINT32 reg_count = 0; 1469 WHV_REGISTER_VALUE reg_values[3]; 1470 WHV_REGISTER_NAME reg_names[3]; 1471 1472 memset(&new_int, 0, sizeof(new_int)); 1473 memset(reg_values, 0, sizeof(reg_values)); 1474 1475 bql_lock(); 1476 1477 /* Inject NMI */ 1478 if (!vcpu->interruption_pending && 1479 cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { 1480 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { 1481 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 1482 vcpu->interruptable = false; 1483 new_int.InterruptionType = WHvX64PendingNmi; 1484 new_int.InterruptionPending = 1; 1485 new_int.InterruptionVector = 2; 1486 } 1487 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 1488 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 1489 } 1490 } 1491 1492 /* 1493 * Force the VCPU out of its inner loop to process any INIT requests or 1494 * commit pending TPR access. 1495 */ 1496 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { 1497 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1498 !(env->hflags & HF_SMM_MASK)) { 1499 cpu->exit_request = 1; 1500 } 1501 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1502 cpu->exit_request = 1; 1503 } 1504 } 1505 1506 /* Get pending hard interruption or replay one that was overwritten */ 1507 if (!whpx_apic_in_platform()) { 1508 if (!vcpu->interruption_pending && 1509 vcpu->interruptable && (env->eflags & IF_MASK)) { 1510 assert(!new_int.InterruptionPending); 1511 if (cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1512 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1513 irq = cpu_get_pic_interrupt(env); 1514 if (irq >= 0) { 1515 new_int.InterruptionType = WHvX64PendingInterrupt; 1516 new_int.InterruptionPending = 1; 1517 new_int.InterruptionVector = irq; 1518 } 1519 } 1520 } 1521 1522 /* Setup interrupt state if new one was prepared */ 1523 if (new_int.InterruptionPending) { 1524 reg_values[reg_count].PendingInterruption = new_int; 1525 reg_names[reg_count] = WHvRegisterPendingInterruption; 1526 reg_count += 1; 1527 } 1528 } else if (vcpu->ready_for_pic_interrupt && 1529 (cpu->interrupt_request & CPU_INTERRUPT_HARD)) { 1530 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1531 irq = cpu_get_pic_interrupt(env); 1532 if (irq >= 0) { 1533 reg_names[reg_count] = WHvRegisterPendingEvent; 1534 reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT) 1535 { 1536 .EventPending = 1, 1537 .EventType = WHvX64PendingEventExtInt, 1538 .Vector = irq, 1539 }; 1540 reg_count += 1; 1541 } 1542 } 1543 1544 /* Sync the TPR to the CR8 if was modified during the intercept */ 1545 tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 1546 if (tpr != vcpu->tpr) { 1547 vcpu->tpr = tpr; 1548 reg_values[reg_count].Reg64 = tpr; 1549 cpu->exit_request = 1; 1550 reg_names[reg_count] = WHvX64RegisterCr8; 1551 reg_count += 1; 1552 } 1553 1554 /* Update the state of the interrupt delivery notification */ 1555 if (!vcpu->window_registered && 1556 cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1557 reg_values[reg_count].DeliverabilityNotifications = 1558 (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) { 1559 .InterruptNotification = 1 1560 }; 1561 vcpu->window_registered = 1; 1562 reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications; 1563 reg_count += 1; 1564 } 1565 1566 bql_unlock(); 1567 vcpu->ready_for_pic_interrupt = false; 1568 1569 if (reg_count) { 1570 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1571 whpx->partition, cpu->cpu_index, 1572 reg_names, reg_count, reg_values); 1573 if (FAILED(hr)) { 1574 error_report("WHPX: Failed to set interrupt state registers," 1575 " hr=%08lx", hr); 1576 } 1577 } 1578 1579 return; 1580 } 1581 1582 static void whpx_vcpu_post_run(CPUState *cpu) 1583 { 1584 AccelCPUState *vcpu = cpu->accel; 1585 X86CPU *x86_cpu = X86_CPU(cpu); 1586 CPUX86State *env = &x86_cpu->env; 1587 1588 env->eflags = vcpu->exit_ctx.VpContext.Rflags; 1589 1590 uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8; 1591 if (vcpu->tpr != tpr) { 1592 vcpu->tpr = tpr; 1593 bql_lock(); 1594 cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr)); 1595 bql_unlock(); 1596 } 1597 1598 vcpu->interruption_pending = 1599 vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending; 1600 1601 vcpu->interruptable = 1602 !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow; 1603 1604 return; 1605 } 1606 1607 static void whpx_vcpu_process_async_events(CPUState *cpu) 1608 { 1609 X86CPU *x86_cpu = X86_CPU(cpu); 1610 CPUX86State *env = &x86_cpu->env; 1611 AccelCPUState *vcpu = cpu->accel; 1612 1613 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1614 !(env->hflags & HF_SMM_MASK)) { 1615 whpx_cpu_synchronize_state(cpu); 1616 do_cpu_init(x86_cpu); 1617 vcpu->interruptable = true; 1618 } 1619 1620 if (cpu->interrupt_request & CPU_INTERRUPT_POLL) { 1621 cpu->interrupt_request &= ~CPU_INTERRUPT_POLL; 1622 apic_poll_irq(x86_cpu->apic_state); 1623 } 1624 1625 if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1626 (env->eflags & IF_MASK)) || 1627 (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1628 cpu->halted = false; 1629 } 1630 1631 if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) { 1632 whpx_cpu_synchronize_state(cpu); 1633 do_cpu_sipi(x86_cpu); 1634 } 1635 1636 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1637 cpu->interrupt_request &= ~CPU_INTERRUPT_TPR; 1638 whpx_cpu_synchronize_state(cpu); 1639 apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip, 1640 env->tpr_access_type); 1641 } 1642 1643 return; 1644 } 1645 1646 static int whpx_vcpu_run(CPUState *cpu) 1647 { 1648 HRESULT hr; 1649 struct whpx_state *whpx = &whpx_global; 1650 AccelCPUState *vcpu = cpu->accel; 1651 struct whpx_breakpoint *stepped_over_bp = NULL; 1652 WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE; 1653 int ret; 1654 1655 g_assert(bql_locked()); 1656 1657 if (whpx->running_cpus++ == 0) { 1658 /* Insert breakpoints into memory, update exception exit bitmap. */ 1659 ret = whpx_first_vcpu_starting(cpu); 1660 if (ret != 0) { 1661 return ret; 1662 } 1663 } 1664 1665 if (whpx->breakpoints.breakpoints && 1666 whpx->breakpoints.breakpoints->used > 0) 1667 { 1668 uint64_t pc = whpx_vcpu_get_pc(cpu, true); 1669 stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc); 1670 if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) { 1671 stepped_over_bp = NULL; 1672 } 1673 1674 if (stepped_over_bp) { 1675 /* 1676 * We are trying to run the instruction overwritten by an active 1677 * breakpoint. We will temporarily disable the breakpoint, suspend 1678 * other CPUs, and step over the instruction. 1679 */ 1680 exclusive_step_mode = WHPX_STEP_EXCLUSIVE; 1681 } 1682 } 1683 1684 if (exclusive_step_mode == WHPX_STEP_NONE) { 1685 whpx_vcpu_process_async_events(cpu); 1686 if (cpu->halted && !whpx_apic_in_platform()) { 1687 cpu->exception_index = EXCP_HLT; 1688 qatomic_set(&cpu->exit_request, false); 1689 return 0; 1690 } 1691 } 1692 1693 bql_unlock(); 1694 1695 if (exclusive_step_mode != WHPX_STEP_NONE) { 1696 start_exclusive(); 1697 g_assert(cpu == current_cpu); 1698 g_assert(!cpu->running); 1699 cpu->running = true; 1700 1701 hr = whpx_set_exception_exit_bitmap( 1702 1UL << WHvX64ExceptionTypeDebugTrapOrFault); 1703 if (!SUCCEEDED(hr)) { 1704 error_report("WHPX: Failed to update exception exit mask, " 1705 "hr=%08lx.", hr); 1706 return 1; 1707 } 1708 1709 if (stepped_over_bp) { 1710 /* Temporarily disable the triggered breakpoint. */ 1711 cpu_memory_rw_debug(cpu, 1712 stepped_over_bp->address, 1713 &stepped_over_bp->original_instruction, 1714 1, 1715 true); 1716 } 1717 } else { 1718 cpu_exec_start(cpu); 1719 } 1720 1721 do { 1722 if (cpu->vcpu_dirty) { 1723 whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE); 1724 cpu->vcpu_dirty = false; 1725 } 1726 1727 if (exclusive_step_mode == WHPX_STEP_NONE) { 1728 whpx_vcpu_pre_run(cpu); 1729 1730 if (qatomic_read(&cpu->exit_request)) { 1731 whpx_vcpu_kick(cpu); 1732 } 1733 } 1734 1735 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1736 whpx_vcpu_configure_single_stepping(cpu, true, NULL); 1737 } 1738 1739 hr = whp_dispatch.WHvRunVirtualProcessor( 1740 whpx->partition, cpu->cpu_index, 1741 &vcpu->exit_ctx, sizeof(vcpu->exit_ctx)); 1742 1743 if (FAILED(hr)) { 1744 error_report("WHPX: Failed to exec a virtual processor," 1745 " hr=%08lx", hr); 1746 ret = -1; 1747 break; 1748 } 1749 1750 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1751 whpx_vcpu_configure_single_stepping(cpu, 1752 false, 1753 &vcpu->exit_ctx.VpContext.Rflags); 1754 } 1755 1756 whpx_vcpu_post_run(cpu); 1757 1758 switch (vcpu->exit_ctx.ExitReason) { 1759 case WHvRunVpExitReasonMemoryAccess: 1760 ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess); 1761 break; 1762 1763 case WHvRunVpExitReasonX64IoPortAccess: 1764 ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess); 1765 break; 1766 1767 case WHvRunVpExitReasonX64InterruptWindow: 1768 vcpu->ready_for_pic_interrupt = 1; 1769 vcpu->window_registered = 0; 1770 ret = 0; 1771 break; 1772 1773 case WHvRunVpExitReasonX64ApicEoi: 1774 assert(whpx_apic_in_platform()); 1775 ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector); 1776 break; 1777 1778 case WHvRunVpExitReasonX64Halt: 1779 /* 1780 * WARNING: as of build 19043.1526 (21H1), this exit reason is no 1781 * longer used. 1782 */ 1783 ret = whpx_handle_halt(cpu); 1784 break; 1785 1786 case WHvRunVpExitReasonX64ApicInitSipiTrap: { 1787 WHV_INTERRUPT_CONTROL ipi = {0}; 1788 uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr; 1789 uint32_t delivery_mode = 1790 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT; 1791 int dest_shorthand = 1792 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT; 1793 bool broadcast = false; 1794 bool include_self = false; 1795 uint32_t i; 1796 1797 /* We only registered for INIT and SIPI exits. */ 1798 if ((delivery_mode != APIC_DM_INIT) && 1799 (delivery_mode != APIC_DM_SIPI)) { 1800 error_report( 1801 "WHPX: Unexpected APIC exit that is not a INIT or SIPI"); 1802 break; 1803 } 1804 1805 if (delivery_mode == APIC_DM_INIT) { 1806 ipi.Type = WHvX64InterruptTypeInit; 1807 } else { 1808 ipi.Type = WHvX64InterruptTypeSipi; 1809 } 1810 1811 ipi.DestinationMode = 1812 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ? 1813 WHvX64InterruptDestinationModeLogical : 1814 WHvX64InterruptDestinationModePhysical; 1815 1816 ipi.TriggerMode = 1817 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ? 1818 WHvX64InterruptTriggerModeLevel : 1819 WHvX64InterruptTriggerModeEdge; 1820 1821 ipi.Vector = icr & APIC_VECTOR_MASK; 1822 switch (dest_shorthand) { 1823 /* no shorthand. Bits 56-63 contain the destination. */ 1824 case 0: 1825 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK; 1826 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1827 &ipi, sizeof(ipi)); 1828 if (FAILED(hr)) { 1829 error_report("WHPX: Failed to request interrupt hr=%08lx", 1830 hr); 1831 } 1832 1833 break; 1834 1835 /* self */ 1836 case 1: 1837 include_self = true; 1838 break; 1839 1840 /* broadcast, including self */ 1841 case 2: 1842 broadcast = true; 1843 include_self = true; 1844 break; 1845 1846 /* broadcast, excluding self */ 1847 case 3: 1848 broadcast = true; 1849 break; 1850 } 1851 1852 if (!broadcast && !include_self) { 1853 break; 1854 } 1855 1856 for (i = 0; i <= max_vcpu_index; i++) { 1857 if (i == cpu->cpu_index && !include_self) { 1858 continue; 1859 } 1860 1861 /* 1862 * Assuming that APIC Ids are identity mapped since 1863 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers 1864 * are not handled yet and the hypervisor doesn't allow the 1865 * guest to modify the APIC ID. 1866 */ 1867 ipi.Destination = i; 1868 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1869 &ipi, sizeof(ipi)); 1870 if (FAILED(hr)) { 1871 error_report( 1872 "WHPX: Failed to request SIPI for %d, hr=%08lx", 1873 i, hr); 1874 } 1875 } 1876 1877 break; 1878 } 1879 1880 case WHvRunVpExitReasonCanceled: 1881 if (exclusive_step_mode != WHPX_STEP_NONE) { 1882 /* 1883 * We are trying to step over a single instruction, and 1884 * likely got a request to stop from another thread. 1885 * Delay it until we are done stepping 1886 * over. 1887 */ 1888 ret = 0; 1889 } else { 1890 cpu->exception_index = EXCP_INTERRUPT; 1891 ret = 1; 1892 } 1893 break; 1894 case WHvRunVpExitReasonX64MsrAccess: { 1895 WHV_REGISTER_VALUE reg_values[3] = {0}; 1896 WHV_REGISTER_NAME reg_names[3]; 1897 UINT32 reg_count; 1898 1899 reg_names[0] = WHvX64RegisterRip; 1900 reg_names[1] = WHvX64RegisterRax; 1901 reg_names[2] = WHvX64RegisterRdx; 1902 1903 reg_values[0].Reg64 = 1904 vcpu->exit_ctx.VpContext.Rip + 1905 vcpu->exit_ctx.VpContext.InstructionLength; 1906 1907 /* 1908 * For all unsupported MSR access we: 1909 * ignore writes 1910 * return 0 on read. 1911 */ 1912 reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ? 1913 1 : 3; 1914 1915 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1916 whpx->partition, 1917 cpu->cpu_index, 1918 reg_names, reg_count, 1919 reg_values); 1920 1921 if (FAILED(hr)) { 1922 error_report("WHPX: Failed to set MsrAccess state " 1923 " registers, hr=%08lx", hr); 1924 } 1925 ret = 0; 1926 break; 1927 } 1928 case WHvRunVpExitReasonX64Cpuid: { 1929 WHV_REGISTER_VALUE reg_values[5]; 1930 WHV_REGISTER_NAME reg_names[5]; 1931 UINT32 reg_count = 5; 1932 UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0; 1933 X86CPU *x86_cpu = X86_CPU(cpu); 1934 CPUX86State *env = &x86_cpu->env; 1935 1936 memset(reg_values, 0, sizeof(reg_values)); 1937 1938 rip = vcpu->exit_ctx.VpContext.Rip + 1939 vcpu->exit_ctx.VpContext.InstructionLength; 1940 cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax; 1941 1942 /* 1943 * Ideally, these should be supplied to the hypervisor during VCPU 1944 * initialization and it should be able to satisfy this request. 1945 * But, currently, WHPX doesn't support setting CPUID values in the 1946 * hypervisor once the partition has been setup, which is too late 1947 * since VCPUs are realized later. For now, use the values from 1948 * QEMU to satisfy these requests, until WHPX adds support for 1949 * being able to set these values in the hypervisor at runtime. 1950 */ 1951 cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx, 1952 (UINT32 *)&rcx, (UINT32 *)&rdx); 1953 switch (cpuid_fn) { 1954 case 0x40000000: 1955 /* Expose the vmware cpu frequency cpuid leaf */ 1956 rax = 0x40000010; 1957 rbx = rcx = rdx = 0; 1958 break; 1959 1960 case 0x40000010: 1961 rax = env->tsc_khz; 1962 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */ 1963 rcx = rdx = 0; 1964 break; 1965 1966 case 0x80000001: 1967 /* Remove any support of OSVW */ 1968 rcx &= ~CPUID_EXT3_OSVW; 1969 break; 1970 } 1971 1972 reg_names[0] = WHvX64RegisterRip; 1973 reg_names[1] = WHvX64RegisterRax; 1974 reg_names[2] = WHvX64RegisterRcx; 1975 reg_names[3] = WHvX64RegisterRdx; 1976 reg_names[4] = WHvX64RegisterRbx; 1977 1978 reg_values[0].Reg64 = rip; 1979 reg_values[1].Reg64 = rax; 1980 reg_values[2].Reg64 = rcx; 1981 reg_values[3].Reg64 = rdx; 1982 reg_values[4].Reg64 = rbx; 1983 1984 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1985 whpx->partition, cpu->cpu_index, 1986 reg_names, 1987 reg_count, 1988 reg_values); 1989 1990 if (FAILED(hr)) { 1991 error_report("WHPX: Failed to set CpuidAccess state registers," 1992 " hr=%08lx", hr); 1993 } 1994 ret = 0; 1995 break; 1996 } 1997 case WHvRunVpExitReasonException: 1998 whpx_get_registers(cpu); 1999 2000 if ((vcpu->exit_ctx.VpException.ExceptionType == 2001 WHvX64ExceptionTypeDebugTrapOrFault) && 2002 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) && 2003 (vcpu->exit_ctx.VpException.InstructionBytes[0] == 2004 whpx_breakpoint_instruction)) { 2005 /* Stopped at a software breakpoint. */ 2006 cpu->exception_index = EXCP_DEBUG; 2007 } else if ((vcpu->exit_ctx.VpException.ExceptionType == 2008 WHvX64ExceptionTypeDebugTrapOrFault) && 2009 !cpu->singlestep_enabled) { 2010 /* 2011 * Just finished stepping over a breakpoint, but the 2012 * gdb does not expect us to do single-stepping. 2013 * Don't do anything special. 2014 */ 2015 cpu->exception_index = EXCP_INTERRUPT; 2016 } else { 2017 /* Another exception or debug event. Report it to GDB. */ 2018 cpu->exception_index = EXCP_DEBUG; 2019 } 2020 2021 ret = 1; 2022 break; 2023 case WHvRunVpExitReasonNone: 2024 case WHvRunVpExitReasonUnrecoverableException: 2025 case WHvRunVpExitReasonInvalidVpRegisterValue: 2026 case WHvRunVpExitReasonUnsupportedFeature: 2027 default: 2028 error_report("WHPX: Unexpected VP exit code %d", 2029 vcpu->exit_ctx.ExitReason); 2030 whpx_get_registers(cpu); 2031 bql_lock(); 2032 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 2033 bql_unlock(); 2034 break; 2035 } 2036 2037 } while (!ret); 2038 2039 if (stepped_over_bp) { 2040 /* Restore the breakpoint we stepped over */ 2041 cpu_memory_rw_debug(cpu, 2042 stepped_over_bp->address, 2043 (void *)&whpx_breakpoint_instruction, 2044 1, 2045 true); 2046 } 2047 2048 if (exclusive_step_mode != WHPX_STEP_NONE) { 2049 g_assert(cpu_in_exclusive_context(cpu)); 2050 cpu->running = false; 2051 end_exclusive(); 2052 2053 exclusive_step_mode = WHPX_STEP_NONE; 2054 } else { 2055 cpu_exec_end(cpu); 2056 } 2057 2058 bql_lock(); 2059 current_cpu = cpu; 2060 2061 if (--whpx->running_cpus == 0) { 2062 whpx_last_vcpu_stopping(cpu); 2063 } 2064 2065 qatomic_set(&cpu->exit_request, false); 2066 2067 return ret < 0; 2068 } 2069 2070 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 2071 { 2072 if (!cpu->vcpu_dirty) { 2073 whpx_get_registers(cpu); 2074 cpu->vcpu_dirty = true; 2075 } 2076 } 2077 2078 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu, 2079 run_on_cpu_data arg) 2080 { 2081 whpx_set_registers(cpu, WHPX_SET_RESET_STATE); 2082 cpu->vcpu_dirty = false; 2083 } 2084 2085 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu, 2086 run_on_cpu_data arg) 2087 { 2088 whpx_set_registers(cpu, WHPX_SET_FULL_STATE); 2089 cpu->vcpu_dirty = false; 2090 } 2091 2092 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu, 2093 run_on_cpu_data arg) 2094 { 2095 cpu->vcpu_dirty = true; 2096 } 2097 2098 /* 2099 * CPU support. 2100 */ 2101 2102 void whpx_cpu_synchronize_state(CPUState *cpu) 2103 { 2104 if (!cpu->vcpu_dirty) { 2105 run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL); 2106 } 2107 } 2108 2109 void whpx_cpu_synchronize_post_reset(CPUState *cpu) 2110 { 2111 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2112 } 2113 2114 void whpx_cpu_synchronize_post_init(CPUState *cpu) 2115 { 2116 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2117 } 2118 2119 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu) 2120 { 2121 run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2122 } 2123 2124 void whpx_cpu_synchronize_pre_resume(bool step_pending) 2125 { 2126 whpx_global.step_pending = step_pending; 2127 } 2128 2129 /* 2130 * Vcpu support. 2131 */ 2132 2133 static Error *whpx_migration_blocker; 2134 2135 static void whpx_cpu_update_state(void *opaque, bool running, RunState state) 2136 { 2137 CPUX86State *env = opaque; 2138 2139 if (running) { 2140 env->tsc_valid = false; 2141 } 2142 } 2143 2144 int whpx_init_vcpu(CPUState *cpu) 2145 { 2146 HRESULT hr; 2147 struct whpx_state *whpx = &whpx_global; 2148 AccelCPUState *vcpu = NULL; 2149 Error *local_error = NULL; 2150 X86CPU *x86_cpu = X86_CPU(cpu); 2151 CPUX86State *env = &x86_cpu->env; 2152 UINT64 freq = 0; 2153 int ret; 2154 2155 /* Add migration blockers for all unsupported features of the 2156 * Windows Hypervisor Platform 2157 */ 2158 if (whpx_migration_blocker == NULL) { 2159 error_setg(&whpx_migration_blocker, 2160 "State blocked due to non-migratable CPUID feature support," 2161 "dirty memory tracking support, and XSAVE/XRSTOR support"); 2162 2163 if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) { 2164 error_report_err(local_error); 2165 ret = -EINVAL; 2166 goto error; 2167 } 2168 } 2169 2170 vcpu = g_new0(AccelCPUState, 1); 2171 2172 hr = whp_dispatch.WHvEmulatorCreateEmulator( 2173 &whpx_emu_callbacks, 2174 &vcpu->emulator); 2175 if (FAILED(hr)) { 2176 error_report("WHPX: Failed to setup instruction completion support," 2177 " hr=%08lx", hr); 2178 ret = -EINVAL; 2179 goto error; 2180 } 2181 2182 hr = whp_dispatch.WHvCreateVirtualProcessor( 2183 whpx->partition, cpu->cpu_index, 0); 2184 if (FAILED(hr)) { 2185 error_report("WHPX: Failed to create a virtual processor," 2186 " hr=%08lx", hr); 2187 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2188 ret = -EINVAL; 2189 goto error; 2190 } 2191 2192 /* 2193 * vcpu's TSC frequency is either specified by user, or use the value 2194 * provided by Hyper-V if the former is not present. In the latter case, we 2195 * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC 2196 * frequency can be migrated later via this field. 2197 */ 2198 if (!env->tsc_khz) { 2199 hr = whp_dispatch.WHvGetCapability( 2200 WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq), 2201 NULL); 2202 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2203 if (FAILED(hr)) { 2204 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr); 2205 } else { 2206 env->tsc_khz = freq / 1000; /* Hz to KHz */ 2207 } 2208 } 2209 } 2210 2211 env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY; 2212 hr = whp_dispatch.WHvGetCapability( 2213 WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL); 2214 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2215 if (FAILED(hr)) { 2216 printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr); 2217 } else { 2218 env->apic_bus_freq = freq; 2219 } 2220 } 2221 2222 /* 2223 * If the vmware cpuid frequency leaf option is set, and we have a valid 2224 * tsc value, trap the corresponding cpuid's. 2225 */ 2226 if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) { 2227 UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010}; 2228 2229 hr = whp_dispatch.WHvSetPartitionProperty( 2230 whpx->partition, 2231 WHvPartitionPropertyCodeCpuidExitList, 2232 cpuidExitList, 2233 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2234 2235 if (FAILED(hr)) { 2236 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2237 hr); 2238 ret = -EINVAL; 2239 goto error; 2240 } 2241 } 2242 2243 vcpu->interruptable = true; 2244 cpu->vcpu_dirty = true; 2245 cpu->accel = vcpu; 2246 max_vcpu_index = max(max_vcpu_index, cpu->cpu_index); 2247 qemu_add_vm_change_state_handler(whpx_cpu_update_state, env); 2248 2249 return 0; 2250 2251 error: 2252 g_free(vcpu); 2253 2254 return ret; 2255 } 2256 2257 int whpx_vcpu_exec(CPUState *cpu) 2258 { 2259 int ret; 2260 int fatal; 2261 2262 for (;;) { 2263 if (cpu->exception_index >= EXCP_INTERRUPT) { 2264 ret = cpu->exception_index; 2265 cpu->exception_index = -1; 2266 break; 2267 } 2268 2269 fatal = whpx_vcpu_run(cpu); 2270 2271 if (fatal) { 2272 error_report("WHPX: Failed to exec a virtual processor"); 2273 abort(); 2274 } 2275 } 2276 2277 return ret; 2278 } 2279 2280 void whpx_destroy_vcpu(CPUState *cpu) 2281 { 2282 struct whpx_state *whpx = &whpx_global; 2283 AccelCPUState *vcpu = cpu->accel; 2284 2285 whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index); 2286 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2287 g_free(cpu->accel); 2288 return; 2289 } 2290 2291 void whpx_vcpu_kick(CPUState *cpu) 2292 { 2293 struct whpx_state *whpx = &whpx_global; 2294 whp_dispatch.WHvCancelRunVirtualProcessor( 2295 whpx->partition, cpu->cpu_index, 0); 2296 } 2297 2298 /* 2299 * Memory support. 2300 */ 2301 2302 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size, 2303 void *host_va, int add, int rom, 2304 const char *name) 2305 { 2306 struct whpx_state *whpx = &whpx_global; 2307 HRESULT hr; 2308 2309 /* 2310 if (add) { 2311 printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n", 2312 (void*)start_pa, (void*)size, host_va, 2313 (rom ? "ROM" : "RAM"), name); 2314 } else { 2315 printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n", 2316 (void*)start_pa, (void*)size, host_va, name); 2317 } 2318 */ 2319 2320 if (add) { 2321 hr = whp_dispatch.WHvMapGpaRange(whpx->partition, 2322 host_va, 2323 start_pa, 2324 size, 2325 (WHvMapGpaRangeFlagRead | 2326 WHvMapGpaRangeFlagExecute | 2327 (rom ? 0 : WHvMapGpaRangeFlagWrite))); 2328 } else { 2329 hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition, 2330 start_pa, 2331 size); 2332 } 2333 2334 if (FAILED(hr)) { 2335 error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes," 2336 " Host:%p, hr=%08lx", 2337 (add ? "MAP" : "UNMAP"), name, 2338 (void *)(uintptr_t)start_pa, (void *)size, host_va, hr); 2339 } 2340 } 2341 2342 static void whpx_process_section(MemoryRegionSection *section, int add) 2343 { 2344 MemoryRegion *mr = section->mr; 2345 hwaddr start_pa = section->offset_within_address_space; 2346 ram_addr_t size = int128_get64(section->size); 2347 unsigned int delta; 2348 uint64_t host_va; 2349 2350 if (!memory_region_is_ram(mr)) { 2351 return; 2352 } 2353 2354 delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask()); 2355 delta &= ~qemu_real_host_page_mask(); 2356 if (delta > size) { 2357 return; 2358 } 2359 start_pa += delta; 2360 size -= delta; 2361 size &= qemu_real_host_page_mask(); 2362 if (!size || (start_pa & ~qemu_real_host_page_mask())) { 2363 return; 2364 } 2365 2366 host_va = (uintptr_t)memory_region_get_ram_ptr(mr) 2367 + section->offset_within_region + delta; 2368 2369 whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add, 2370 memory_region_is_rom(mr), mr->name); 2371 } 2372 2373 static void whpx_region_add(MemoryListener *listener, 2374 MemoryRegionSection *section) 2375 { 2376 memory_region_ref(section->mr); 2377 whpx_process_section(section, 1); 2378 } 2379 2380 static void whpx_region_del(MemoryListener *listener, 2381 MemoryRegionSection *section) 2382 { 2383 whpx_process_section(section, 0); 2384 memory_region_unref(section->mr); 2385 } 2386 2387 static void whpx_transaction_begin(MemoryListener *listener) 2388 { 2389 } 2390 2391 static void whpx_transaction_commit(MemoryListener *listener) 2392 { 2393 } 2394 2395 static void whpx_log_sync(MemoryListener *listener, 2396 MemoryRegionSection *section) 2397 { 2398 MemoryRegion *mr = section->mr; 2399 2400 if (!memory_region_is_ram(mr)) { 2401 return; 2402 } 2403 2404 memory_region_set_dirty(mr, 0, int128_get64(section->size)); 2405 } 2406 2407 static MemoryListener whpx_memory_listener = { 2408 .name = "whpx", 2409 .begin = whpx_transaction_begin, 2410 .commit = whpx_transaction_commit, 2411 .region_add = whpx_region_add, 2412 .region_del = whpx_region_del, 2413 .log_sync = whpx_log_sync, 2414 .priority = MEMORY_LISTENER_PRIORITY_ACCEL, 2415 }; 2416 2417 static void whpx_memory_init(void) 2418 { 2419 memory_listener_register(&whpx_memory_listener, &address_space_memory); 2420 } 2421 2422 /* 2423 * Load the functions from the given library, using the given handle. If a 2424 * handle is provided, it is used, otherwise the library is opened. The 2425 * handle will be updated on return with the opened one. 2426 */ 2427 static bool load_whp_dispatch_fns(HMODULE *handle, 2428 WHPFunctionList function_list) 2429 { 2430 HMODULE hLib = *handle; 2431 2432 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll" 2433 #define WINHV_EMULATION_DLL "WinHvEmulation.dll" 2434 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \ 2435 whp_dispatch.function_name = \ 2436 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2437 2438 #define WHP_LOAD_FIELD(return_type, function_name, signature) \ 2439 whp_dispatch.function_name = \ 2440 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2441 if (!whp_dispatch.function_name) { \ 2442 error_report("Could not load function %s", #function_name); \ 2443 goto error; \ 2444 } \ 2445 2446 #define WHP_LOAD_LIB(lib_name, handle_lib) \ 2447 if (!handle_lib) { \ 2448 handle_lib = LoadLibrary(lib_name); \ 2449 if (!handle_lib) { \ 2450 error_report("Could not load library %s.", lib_name); \ 2451 goto error; \ 2452 } \ 2453 } \ 2454 2455 switch (function_list) { 2456 case WINHV_PLATFORM_FNS_DEFAULT: 2457 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2458 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD) 2459 break; 2460 2461 case WINHV_EMULATION_FNS_DEFAULT: 2462 WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib) 2463 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD) 2464 break; 2465 2466 case WINHV_PLATFORM_FNS_SUPPLEMENTAL: 2467 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2468 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL) 2469 break; 2470 } 2471 2472 *handle = hLib; 2473 return true; 2474 2475 error: 2476 if (hLib) { 2477 FreeLibrary(hLib); 2478 } 2479 2480 return false; 2481 } 2482 2483 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v, 2484 const char *name, void *opaque, 2485 Error **errp) 2486 { 2487 struct whpx_state *whpx = &whpx_global; 2488 OnOffSplit mode; 2489 2490 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 2491 return; 2492 } 2493 2494 switch (mode) { 2495 case ON_OFF_SPLIT_ON: 2496 whpx->kernel_irqchip_allowed = true; 2497 whpx->kernel_irqchip_required = true; 2498 break; 2499 2500 case ON_OFF_SPLIT_OFF: 2501 whpx->kernel_irqchip_allowed = false; 2502 whpx->kernel_irqchip_required = false; 2503 break; 2504 2505 case ON_OFF_SPLIT_SPLIT: 2506 error_setg(errp, "WHPX: split irqchip currently not supported"); 2507 error_append_hint(errp, 2508 "Try without kernel-irqchip or with kernel-irqchip=on|off"); 2509 break; 2510 2511 default: 2512 /* 2513 * The value was checked in visit_type_OnOffSplit() above. If 2514 * we get here, then something is wrong in QEMU. 2515 */ 2516 abort(); 2517 } 2518 } 2519 2520 /* 2521 * Partition support 2522 */ 2523 2524 static int whpx_accel_init(MachineState *ms) 2525 { 2526 struct whpx_state *whpx; 2527 int ret; 2528 HRESULT hr; 2529 WHV_CAPABILITY whpx_cap; 2530 UINT32 whpx_cap_size; 2531 WHV_PARTITION_PROPERTY prop; 2532 UINT32 cpuidExitList[] = {1, 0x80000001}; 2533 WHV_CAPABILITY_FEATURES features = {0}; 2534 2535 whpx = &whpx_global; 2536 2537 if (!init_whp_dispatch()) { 2538 ret = -ENOSYS; 2539 goto error; 2540 } 2541 2542 whpx->mem_quota = ms->ram_size; 2543 2544 hr = whp_dispatch.WHvGetCapability( 2545 WHvCapabilityCodeHypervisorPresent, &whpx_cap, 2546 sizeof(whpx_cap), &whpx_cap_size); 2547 if (FAILED(hr) || !whpx_cap.HypervisorPresent) { 2548 error_report("WHPX: No accelerator found, hr=%08lx", hr); 2549 ret = -ENOSPC; 2550 goto error; 2551 } 2552 2553 hr = whp_dispatch.WHvGetCapability( 2554 WHvCapabilityCodeFeatures, &features, sizeof(features), NULL); 2555 if (FAILED(hr)) { 2556 error_report("WHPX: Failed to query capabilities, hr=%08lx", hr); 2557 ret = -EINVAL; 2558 goto error; 2559 } 2560 2561 hr = whp_dispatch.WHvCreatePartition(&whpx->partition); 2562 if (FAILED(hr)) { 2563 error_report("WHPX: Failed to create partition, hr=%08lx", hr); 2564 ret = -EINVAL; 2565 goto error; 2566 } 2567 2568 /* 2569 * Query the XSAVE capability of the partition. Any error here is not 2570 * considered fatal. 2571 */ 2572 hr = whp_dispatch.WHvGetPartitionProperty( 2573 whpx->partition, 2574 WHvPartitionPropertyCodeProcessorXsaveFeatures, 2575 &whpx_xsave_cap, 2576 sizeof(whpx_xsave_cap), 2577 &whpx_cap_size); 2578 2579 /* 2580 * Windows version which don't support this property will return with the 2581 * specific error code. 2582 */ 2583 if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) { 2584 error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr); 2585 } 2586 2587 if (!whpx_has_xsave()) { 2588 printf("WHPX: Partition is not XSAVE capable\n"); 2589 } 2590 2591 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2592 prop.ProcessorCount = ms->smp.cpus; 2593 hr = whp_dispatch.WHvSetPartitionProperty( 2594 whpx->partition, 2595 WHvPartitionPropertyCodeProcessorCount, 2596 &prop, 2597 sizeof(WHV_PARTITION_PROPERTY)); 2598 2599 if (FAILED(hr)) { 2600 error_report("WHPX: Failed to set partition processor count to %u," 2601 " hr=%08lx", prop.ProcessorCount, hr); 2602 ret = -EINVAL; 2603 goto error; 2604 } 2605 2606 /* 2607 * Error out if WHP doesn't support apic emulation and user is requiring 2608 * it. 2609 */ 2610 if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation || 2611 !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) { 2612 error_report("WHPX: kernel irqchip requested, but unavailable. " 2613 "Try without kernel-irqchip or with kernel-irqchip=off"); 2614 ret = -EINVAL; 2615 goto error; 2616 } 2617 2618 if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation && 2619 whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) { 2620 WHV_X64_LOCAL_APIC_EMULATION_MODE mode = 2621 WHvX64LocalApicEmulationModeXApic; 2622 printf("WHPX: setting APIC emulation mode in the hypervisor\n"); 2623 hr = whp_dispatch.WHvSetPartitionProperty( 2624 whpx->partition, 2625 WHvPartitionPropertyCodeLocalApicEmulationMode, 2626 &mode, 2627 sizeof(mode)); 2628 if (FAILED(hr)) { 2629 error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr); 2630 if (whpx->kernel_irqchip_required) { 2631 error_report("WHPX: kernel irqchip requested, but unavailable"); 2632 ret = -EINVAL; 2633 goto error; 2634 } 2635 } else { 2636 whpx->apic_in_platform = true; 2637 } 2638 } 2639 2640 /* Register for MSR and CPUID exits */ 2641 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2642 prop.ExtendedVmExits.X64MsrExit = 1; 2643 prop.ExtendedVmExits.X64CpuidExit = 1; 2644 prop.ExtendedVmExits.ExceptionExit = 1; 2645 if (whpx_apic_in_platform()) { 2646 prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1; 2647 } 2648 2649 hr = whp_dispatch.WHvSetPartitionProperty( 2650 whpx->partition, 2651 WHvPartitionPropertyCodeExtendedVmExits, 2652 &prop, 2653 sizeof(WHV_PARTITION_PROPERTY)); 2654 if (FAILED(hr)) { 2655 error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr); 2656 ret = -EINVAL; 2657 goto error; 2658 } 2659 2660 hr = whp_dispatch.WHvSetPartitionProperty( 2661 whpx->partition, 2662 WHvPartitionPropertyCodeCpuidExitList, 2663 cpuidExitList, 2664 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2665 2666 if (FAILED(hr)) { 2667 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2668 hr); 2669 ret = -EINVAL; 2670 goto error; 2671 } 2672 2673 /* 2674 * We do not want to intercept any exceptions from the guest, 2675 * until we actually start debugging with gdb. 2676 */ 2677 whpx->exception_exit_bitmap = -1; 2678 hr = whpx_set_exception_exit_bitmap(0); 2679 2680 if (FAILED(hr)) { 2681 error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr); 2682 ret = -EINVAL; 2683 goto error; 2684 } 2685 2686 hr = whp_dispatch.WHvSetupPartition(whpx->partition); 2687 if (FAILED(hr)) { 2688 error_report("WHPX: Failed to setup partition, hr=%08lx", hr); 2689 ret = -EINVAL; 2690 goto error; 2691 } 2692 2693 whpx_memory_init(); 2694 2695 printf("Windows Hypervisor Platform accelerator is operational\n"); 2696 return 0; 2697 2698 error: 2699 2700 if (NULL != whpx->partition) { 2701 whp_dispatch.WHvDeletePartition(whpx->partition); 2702 whpx->partition = NULL; 2703 } 2704 2705 return ret; 2706 } 2707 2708 int whpx_enabled(void) 2709 { 2710 return whpx_allowed; 2711 } 2712 2713 bool whpx_apic_in_platform(void) { 2714 return whpx_global.apic_in_platform; 2715 } 2716 2717 static void whpx_accel_class_init(ObjectClass *oc, void *data) 2718 { 2719 AccelClass *ac = ACCEL_CLASS(oc); 2720 ac->name = "WHPX"; 2721 ac->init_machine = whpx_accel_init; 2722 ac->allowed = &whpx_allowed; 2723 2724 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 2725 NULL, whpx_set_kernel_irqchip, 2726 NULL, NULL); 2727 object_class_property_set_description(oc, "kernel-irqchip", 2728 "Configure WHPX in-kernel irqchip"); 2729 } 2730 2731 static void whpx_accel_instance_init(Object *obj) 2732 { 2733 struct whpx_state *whpx = &whpx_global; 2734 2735 memset(whpx, 0, sizeof(struct whpx_state)); 2736 /* Turn on kernel-irqchip, by default */ 2737 whpx->kernel_irqchip_allowed = true; 2738 } 2739 2740 static const TypeInfo whpx_accel_type = { 2741 .name = ACCEL_CLASS_NAME("whpx"), 2742 .parent = TYPE_ACCEL, 2743 .instance_init = whpx_accel_instance_init, 2744 .class_init = whpx_accel_class_init, 2745 }; 2746 2747 static void whpx_type_init(void) 2748 { 2749 type_register_static(&whpx_accel_type); 2750 } 2751 2752 bool init_whp_dispatch(void) 2753 { 2754 if (whp_dispatch_initialized) { 2755 return true; 2756 } 2757 2758 if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) { 2759 goto error; 2760 } 2761 2762 if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) { 2763 goto error; 2764 } 2765 2766 assert(load_whp_dispatch_fns(&hWinHvPlatform, 2767 WINHV_PLATFORM_FNS_SUPPLEMENTAL)); 2768 whp_dispatch_initialized = true; 2769 2770 return true; 2771 error: 2772 if (hWinHvPlatform) { 2773 FreeLibrary(hWinHvPlatform); 2774 } 2775 2776 if (hWinHvEmulation) { 2777 FreeLibrary(hWinHvEmulation); 2778 } 2779 2780 return false; 2781 } 2782 2783 type_init(whpx_type_init); 2784