1 /* 2 * QEMU Windows Hypervisor Platform accelerator (WHPX) 3 * 4 * Copyright Microsoft Corp. 2017 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2 or later. 7 * See the COPYING file in the top-level directory. 8 * 9 */ 10 11 #include "qemu/osdep.h" 12 #include "cpu.h" 13 #include "exec/address-spaces.h" 14 #include "exec/ioport.h" 15 #include "gdbstub/helpers.h" 16 #include "qemu/accel.h" 17 #include "sysemu/whpx.h" 18 #include "sysemu/cpus.h" 19 #include "sysemu/runstate.h" 20 #include "qemu/main-loop.h" 21 #include "hw/boards.h" 22 #include "hw/intc/ioapic.h" 23 #include "hw/i386/apic_internal.h" 24 #include "qemu/error-report.h" 25 #include "qapi/error.h" 26 #include "qapi/qapi-types-common.h" 27 #include "qapi/qapi-visit-common.h" 28 #include "migration/blocker.h" 29 #include <winerror.h> 30 31 #include "whpx-internal.h" 32 #include "whpx-accel-ops.h" 33 34 #include <winhvplatform.h> 35 #include <winhvemulation.h> 36 37 #define HYPERV_APIC_BUS_FREQUENCY (200000000ULL) 38 39 static const WHV_REGISTER_NAME whpx_register_names[] = { 40 41 /* X64 General purpose registers */ 42 WHvX64RegisterRax, 43 WHvX64RegisterRcx, 44 WHvX64RegisterRdx, 45 WHvX64RegisterRbx, 46 WHvX64RegisterRsp, 47 WHvX64RegisterRbp, 48 WHvX64RegisterRsi, 49 WHvX64RegisterRdi, 50 WHvX64RegisterR8, 51 WHvX64RegisterR9, 52 WHvX64RegisterR10, 53 WHvX64RegisterR11, 54 WHvX64RegisterR12, 55 WHvX64RegisterR13, 56 WHvX64RegisterR14, 57 WHvX64RegisterR15, 58 WHvX64RegisterRip, 59 WHvX64RegisterRflags, 60 61 /* X64 Segment registers */ 62 WHvX64RegisterEs, 63 WHvX64RegisterCs, 64 WHvX64RegisterSs, 65 WHvX64RegisterDs, 66 WHvX64RegisterFs, 67 WHvX64RegisterGs, 68 WHvX64RegisterLdtr, 69 WHvX64RegisterTr, 70 71 /* X64 Table registers */ 72 WHvX64RegisterIdtr, 73 WHvX64RegisterGdtr, 74 75 /* X64 Control Registers */ 76 WHvX64RegisterCr0, 77 WHvX64RegisterCr2, 78 WHvX64RegisterCr3, 79 WHvX64RegisterCr4, 80 WHvX64RegisterCr8, 81 82 /* X64 Debug Registers */ 83 /* 84 * WHvX64RegisterDr0, 85 * WHvX64RegisterDr1, 86 * WHvX64RegisterDr2, 87 * WHvX64RegisterDr3, 88 * WHvX64RegisterDr6, 89 * WHvX64RegisterDr7, 90 */ 91 92 /* X64 Floating Point and Vector Registers */ 93 WHvX64RegisterXmm0, 94 WHvX64RegisterXmm1, 95 WHvX64RegisterXmm2, 96 WHvX64RegisterXmm3, 97 WHvX64RegisterXmm4, 98 WHvX64RegisterXmm5, 99 WHvX64RegisterXmm6, 100 WHvX64RegisterXmm7, 101 WHvX64RegisterXmm8, 102 WHvX64RegisterXmm9, 103 WHvX64RegisterXmm10, 104 WHvX64RegisterXmm11, 105 WHvX64RegisterXmm12, 106 WHvX64RegisterXmm13, 107 WHvX64RegisterXmm14, 108 WHvX64RegisterXmm15, 109 WHvX64RegisterFpMmx0, 110 WHvX64RegisterFpMmx1, 111 WHvX64RegisterFpMmx2, 112 WHvX64RegisterFpMmx3, 113 WHvX64RegisterFpMmx4, 114 WHvX64RegisterFpMmx5, 115 WHvX64RegisterFpMmx6, 116 WHvX64RegisterFpMmx7, 117 WHvX64RegisterFpControlStatus, 118 WHvX64RegisterXmmControlStatus, 119 120 /* X64 MSRs */ 121 WHvX64RegisterEfer, 122 #ifdef TARGET_X86_64 123 WHvX64RegisterKernelGsBase, 124 #endif 125 WHvX64RegisterApicBase, 126 /* WHvX64RegisterPat, */ 127 WHvX64RegisterSysenterCs, 128 WHvX64RegisterSysenterEip, 129 WHvX64RegisterSysenterEsp, 130 WHvX64RegisterStar, 131 #ifdef TARGET_X86_64 132 WHvX64RegisterLstar, 133 WHvX64RegisterCstar, 134 WHvX64RegisterSfmask, 135 #endif 136 137 /* Interrupt / Event Registers */ 138 /* 139 * WHvRegisterPendingInterruption, 140 * WHvRegisterInterruptState, 141 * WHvRegisterPendingEvent0, 142 * WHvRegisterPendingEvent1 143 * WHvX64RegisterDeliverabilityNotifications, 144 */ 145 }; 146 147 struct whpx_register_set { 148 WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)]; 149 }; 150 151 /* 152 * The current implementation of instruction stepping sets the TF flag 153 * in RFLAGS, causing the CPU to raise an INT1 after each instruction. 154 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception. 155 * 156 * This approach has a few limitations: 157 * 1. Stepping over a PUSHF/SAHF instruction will save the TF flag 158 * along with the other flags, possibly restoring it later. It would 159 * result in another INT1 when the flags are restored, triggering 160 * a stop in gdb that could be cleared by doing another step. 161 * 162 * Stepping over a POPF/LAHF instruction will let it overwrite the 163 * TF flags, ending the stepping mode. 164 * 165 * 2. Stepping over an instruction raising an exception (e.g. INT, DIV, 166 * or anything that could result in a page fault) will save the flags 167 * to the stack, clear the TF flag, and let the guest execute the 168 * handler. Normally, the guest will restore the original flags, 169 * that will continue single-stepping. 170 * 171 * 3. Debuggers running on the guest may wish to set TF to do instruction 172 * stepping. INT1 events generated by it would be intercepted by us, 173 * as long as the gdb is connected to QEMU. 174 * 175 * In practice this means that: 176 * 1. Stepping through flags-modifying instructions may cause gdb to 177 * continue or stop in unexpected places. This will be fully recoverable 178 * and will not crash the target. 179 * 180 * 2. Stepping over an instruction that triggers an exception will step 181 * over the exception handler, not into it. 182 * 183 * 3. Debugging the guest via gdb, while running debugger on the guest 184 * at the same time may lead to unexpected effects. Removing all 185 * breakpoints set via QEMU will prevent any further interference 186 * with the guest-level debuggers. 187 * 188 * The limitations can be addressed as shown below: 189 * 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of 190 * stepping through them. The exact semantics of the instructions is 191 * defined in the "Combined Volume Set of Intel 64 and IA-32 192 * Architectures Software Developer's Manuals", however it involves a 193 * fair amount of corner cases due to compatibility with real mode, 194 * virtual 8086 mode, and differences between 64-bit and 32-bit modes. 195 * 196 * 2. We could step into the guest's exception handlers using the following 197 * sequence: 198 * a. Temporarily enable catching of all exception types via 199 * whpx_set_exception_exit_bitmap(). 200 * b. Once an exception is intercepted, read the IDT/GDT and locate 201 * the original handler. 202 * c. Patch the original handler, injecting an INT3 at the beginning. 203 * d. Update the exception exit bitmap to only catch the 204 * WHvX64ExceptionTypeBreakpointTrap exception. 205 * e. Let the affected CPU run in the exclusive mode. 206 * f. Restore the original handler and the exception exit bitmap. 207 * Note that handling all corner cases related to IDT/GDT is harder 208 * than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a 209 * rough idea. 210 * 211 * 3. In order to properly support guest-level debugging in parallel with 212 * the QEMU-level debugging, we would need to be able to pass some INT1 213 * events to the guest. This could be done via the following methods: 214 * a. Using the WHvRegisterPendingEvent register. As of Windows 21H1, 215 * it seems to only work for interrupts and not software 216 * exceptions. 217 * b. Locating and patching the original handler by parsing IDT/GDT. 218 * This involves relatively complex logic outlined in the previous 219 * paragraph. 220 * c. Emulating the exception invocation (i.e. manually updating RIP, 221 * RFLAGS, and pushing the old values to stack). This is even more 222 * complicated than the previous option, since it involves checking 223 * CPL, gate attributes, and doing various adjustments depending 224 * on the current CPU mode, whether the CPL is changing, etc. 225 */ 226 typedef enum WhpxStepMode { 227 WHPX_STEP_NONE = 0, 228 /* Halt other VCPUs */ 229 WHPX_STEP_EXCLUSIVE, 230 } WhpxStepMode; 231 232 struct AccelCPUState { 233 WHV_EMULATOR_HANDLE emulator; 234 bool window_registered; 235 bool interruptable; 236 bool ready_for_pic_interrupt; 237 uint64_t tpr; 238 uint64_t apic_base; 239 bool interruption_pending; 240 241 /* Must be the last field as it may have a tail */ 242 WHV_RUN_VP_EXIT_CONTEXT exit_ctx; 243 }; 244 245 static bool whpx_allowed; 246 static bool whp_dispatch_initialized; 247 static HMODULE hWinHvPlatform, hWinHvEmulation; 248 static uint32_t max_vcpu_index; 249 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap; 250 251 struct whpx_state whpx_global; 252 struct WHPDispatch whp_dispatch; 253 254 static bool whpx_has_xsave(void) 255 { 256 return whpx_xsave_cap.XsaveSupport; 257 } 258 259 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86, 260 int r86) 261 { 262 WHV_X64_SEGMENT_REGISTER hs; 263 unsigned flags = qs->flags; 264 265 hs.Base = qs->base; 266 hs.Limit = qs->limit; 267 hs.Selector = qs->selector; 268 269 if (v86) { 270 hs.Attributes = 0; 271 hs.SegmentType = 3; 272 hs.Present = 1; 273 hs.DescriptorPrivilegeLevel = 3; 274 hs.NonSystemSegment = 1; 275 276 } else { 277 hs.Attributes = (flags >> DESC_TYPE_SHIFT); 278 279 if (r86) { 280 /* hs.Base &= 0xfffff; */ 281 } 282 } 283 284 return hs; 285 } 286 287 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs) 288 { 289 SegmentCache qs; 290 291 qs.base = hs->Base; 292 qs.limit = hs->Limit; 293 qs.selector = hs->Selector; 294 295 qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT; 296 297 return qs; 298 } 299 300 /* X64 Extended Control Registers */ 301 static void whpx_set_xcrs(CPUState *cpu) 302 { 303 CPUX86State *env = cpu->env_ptr; 304 HRESULT hr; 305 struct whpx_state *whpx = &whpx_global; 306 WHV_REGISTER_VALUE xcr0; 307 WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; 308 309 if (!whpx_has_xsave()) { 310 return; 311 } 312 313 /* Only xcr0 is supported by the hypervisor currently */ 314 xcr0.Reg64 = env->xcr0; 315 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 316 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); 317 if (FAILED(hr)) { 318 error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr); 319 } 320 } 321 322 static int whpx_set_tsc(CPUState *cpu) 323 { 324 CPUX86State *env = cpu->env_ptr; 325 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 326 WHV_REGISTER_VALUE tsc_val; 327 HRESULT hr; 328 struct whpx_state *whpx = &whpx_global; 329 330 /* 331 * Suspend the partition prior to setting the TSC to reduce the variance 332 * in TSC across vCPUs. When the first vCPU runs post suspend, the 333 * partition is automatically resumed. 334 */ 335 if (whp_dispatch.WHvSuspendPartitionTime) { 336 337 /* 338 * Unable to suspend partition while setting TSC is not a fatal 339 * error. It just increases the likelihood of TSC variance between 340 * vCPUs and some guest OS are able to handle that just fine. 341 */ 342 hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition); 343 if (FAILED(hr)) { 344 warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr); 345 } 346 } 347 348 tsc_val.Reg64 = env->tsc; 349 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 350 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 351 if (FAILED(hr)) { 352 error_report("WHPX: Failed to set TSC, hr=%08lx", hr); 353 return -1; 354 } 355 356 return 0; 357 } 358 359 /* 360 * The CR8 register in the CPU is mapped to the TPR register of the APIC, 361 * however, they use a slightly different encoding. Specifically: 362 * 363 * APIC.TPR[bits 7:4] = CR8[bits 3:0] 364 * 365 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64 366 * and IA-32 Architectures Software Developer's Manual. 367 * 368 * The functions below translate the value of CR8 to TPR and vice versa. 369 */ 370 371 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr) 372 { 373 return tpr >> 4; 374 } 375 376 static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8) 377 { 378 return cr8 << 4; 379 } 380 381 static void whpx_set_registers(CPUState *cpu, int level) 382 { 383 struct whpx_state *whpx = &whpx_global; 384 AccelCPUState *vcpu = cpu->accel; 385 CPUX86State *env = cpu->env_ptr; 386 X86CPU *x86_cpu = X86_CPU(cpu); 387 struct whpx_register_set vcxt; 388 HRESULT hr; 389 int idx; 390 int idx_next; 391 int i; 392 int v86, r86; 393 394 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 395 396 /* 397 * Following MSRs have side effects on the guest or are too heavy for 398 * runtime. Limit them to full state update. 399 */ 400 if (level >= WHPX_SET_RESET_STATE) { 401 whpx_set_tsc(cpu); 402 } 403 404 memset(&vcxt, 0, sizeof(struct whpx_register_set)); 405 406 v86 = (env->eflags & VM_MASK); 407 r86 = !(env->cr[0] & CR0_PE_MASK); 408 409 vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 410 vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state); 411 412 idx = 0; 413 414 /* Indexes for first 16 registers match between HV and QEMU definitions */ 415 idx_next = 16; 416 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 417 vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx]; 418 } 419 idx = idx_next; 420 421 /* Same goes for RIP and RFLAGS */ 422 assert(whpx_register_names[idx] == WHvX64RegisterRip); 423 vcxt.values[idx++].Reg64 = env->eip; 424 425 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 426 vcxt.values[idx++].Reg64 = env->eflags; 427 428 /* Translate 6+4 segment registers. HV and QEMU order matches */ 429 assert(idx == WHvX64RegisterEs); 430 for (i = 0; i < 6; i += 1, idx += 1) { 431 vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86); 432 } 433 434 assert(idx == WHvX64RegisterLdtr); 435 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0); 436 437 assert(idx == WHvX64RegisterTr); 438 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0); 439 440 assert(idx == WHvX64RegisterIdtr); 441 vcxt.values[idx].Table.Base = env->idt.base; 442 vcxt.values[idx].Table.Limit = env->idt.limit; 443 idx += 1; 444 445 assert(idx == WHvX64RegisterGdtr); 446 vcxt.values[idx].Table.Base = env->gdt.base; 447 vcxt.values[idx].Table.Limit = env->gdt.limit; 448 idx += 1; 449 450 /* CR0, 2, 3, 4, 8 */ 451 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 452 vcxt.values[idx++].Reg64 = env->cr[0]; 453 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 454 vcxt.values[idx++].Reg64 = env->cr[2]; 455 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 456 vcxt.values[idx++].Reg64 = env->cr[3]; 457 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 458 vcxt.values[idx++].Reg64 = env->cr[4]; 459 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 460 vcxt.values[idx++].Reg64 = vcpu->tpr; 461 462 /* 8 Debug Registers - Skipped */ 463 464 /* 465 * Extended control registers needs to be handled separately depending 466 * on whether xsave is supported/enabled or not. 467 */ 468 whpx_set_xcrs(cpu); 469 470 /* 16 XMM registers */ 471 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 472 idx_next = idx + 16; 473 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 474 vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0); 475 vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1); 476 } 477 idx = idx_next; 478 479 /* 8 FP registers */ 480 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 481 for (i = 0; i < 8; i += 1, idx += 1) { 482 vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0); 483 /* vcxt.values[idx].Fp.AsUINT128.High64 = 484 env->fpregs[i].mmx.MMX_Q(1); 485 */ 486 } 487 488 /* FP control status register */ 489 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 490 vcxt.values[idx].FpControlStatus.FpControl = env->fpuc; 491 vcxt.values[idx].FpControlStatus.FpStatus = 492 (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; 493 vcxt.values[idx].FpControlStatus.FpTag = 0; 494 for (i = 0; i < 8; ++i) { 495 vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i; 496 } 497 vcxt.values[idx].FpControlStatus.Reserved = 0; 498 vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop; 499 vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip; 500 idx += 1; 501 502 /* XMM control status register */ 503 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 504 vcxt.values[idx].XmmControlStatus.LastFpRdp = 0; 505 vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr; 506 vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff; 507 idx += 1; 508 509 /* MSRs */ 510 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 511 vcxt.values[idx++].Reg64 = env->efer; 512 #ifdef TARGET_X86_64 513 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 514 vcxt.values[idx++].Reg64 = env->kernelgsbase; 515 #endif 516 517 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 518 vcxt.values[idx++].Reg64 = vcpu->apic_base; 519 520 /* WHvX64RegisterPat - Skipped */ 521 522 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 523 vcxt.values[idx++].Reg64 = env->sysenter_cs; 524 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 525 vcxt.values[idx++].Reg64 = env->sysenter_eip; 526 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 527 vcxt.values[idx++].Reg64 = env->sysenter_esp; 528 assert(whpx_register_names[idx] == WHvX64RegisterStar); 529 vcxt.values[idx++].Reg64 = env->star; 530 #ifdef TARGET_X86_64 531 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 532 vcxt.values[idx++].Reg64 = env->lstar; 533 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 534 vcxt.values[idx++].Reg64 = env->cstar; 535 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 536 vcxt.values[idx++].Reg64 = env->fmask; 537 #endif 538 539 /* Interrupt / Event Registers - Skipped */ 540 541 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 542 543 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 544 whpx->partition, cpu->cpu_index, 545 whpx_register_names, 546 RTL_NUMBER_OF(whpx_register_names), 547 &vcxt.values[0]); 548 549 if (FAILED(hr)) { 550 error_report("WHPX: Failed to set virtual processor context, hr=%08lx", 551 hr); 552 } 553 554 return; 555 } 556 557 static int whpx_get_tsc(CPUState *cpu) 558 { 559 CPUX86State *env = cpu->env_ptr; 560 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 561 WHV_REGISTER_VALUE tsc_val; 562 HRESULT hr; 563 struct whpx_state *whpx = &whpx_global; 564 565 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 566 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 567 if (FAILED(hr)) { 568 error_report("WHPX: Failed to get TSC, hr=%08lx", hr); 569 return -1; 570 } 571 572 env->tsc = tsc_val.Reg64; 573 return 0; 574 } 575 576 /* X64 Extended Control Registers */ 577 static void whpx_get_xcrs(CPUState *cpu) 578 { 579 CPUX86State *env = cpu->env_ptr; 580 HRESULT hr; 581 struct whpx_state *whpx = &whpx_global; 582 WHV_REGISTER_VALUE xcr0; 583 WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; 584 585 if (!whpx_has_xsave()) { 586 return; 587 } 588 589 /* Only xcr0 is supported by the hypervisor currently */ 590 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 591 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); 592 if (FAILED(hr)) { 593 error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr); 594 return; 595 } 596 597 env->xcr0 = xcr0.Reg64; 598 } 599 600 static void whpx_get_registers(CPUState *cpu) 601 { 602 struct whpx_state *whpx = &whpx_global; 603 AccelCPUState *vcpu = cpu->accel; 604 CPUX86State *env = cpu->env_ptr; 605 X86CPU *x86_cpu = X86_CPU(cpu); 606 struct whpx_register_set vcxt; 607 uint64_t tpr, apic_base; 608 HRESULT hr; 609 int idx; 610 int idx_next; 611 int i; 612 613 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 614 615 if (!env->tsc_valid) { 616 whpx_get_tsc(cpu); 617 env->tsc_valid = !runstate_is_running(); 618 } 619 620 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 621 whpx->partition, cpu->cpu_index, 622 whpx_register_names, 623 RTL_NUMBER_OF(whpx_register_names), 624 &vcxt.values[0]); 625 if (FAILED(hr)) { 626 error_report("WHPX: Failed to get virtual processor context, hr=%08lx", 627 hr); 628 } 629 630 if (whpx_apic_in_platform()) { 631 /* 632 * Fetch the TPR value from the emulated APIC. It may get overwritten 633 * below with the value from CR8 returned by 634 * WHvGetVirtualProcessorRegisters(). 635 */ 636 whpx_apic_get(x86_cpu->apic_state); 637 vcpu->tpr = whpx_apic_tpr_to_cr8( 638 cpu_get_apic_tpr(x86_cpu->apic_state)); 639 } 640 641 idx = 0; 642 643 /* Indexes for first 16 registers match between HV and QEMU definitions */ 644 idx_next = 16; 645 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 646 env->regs[idx] = vcxt.values[idx].Reg64; 647 } 648 idx = idx_next; 649 650 /* Same goes for RIP and RFLAGS */ 651 assert(whpx_register_names[idx] == WHvX64RegisterRip); 652 env->eip = vcxt.values[idx++].Reg64; 653 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 654 env->eflags = vcxt.values[idx++].Reg64; 655 656 /* Translate 6+4 segment registers. HV and QEMU order matches */ 657 assert(idx == WHvX64RegisterEs); 658 for (i = 0; i < 6; i += 1, idx += 1) { 659 env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment); 660 } 661 662 assert(idx == WHvX64RegisterLdtr); 663 env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment); 664 assert(idx == WHvX64RegisterTr); 665 env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment); 666 assert(idx == WHvX64RegisterIdtr); 667 env->idt.base = vcxt.values[idx].Table.Base; 668 env->idt.limit = vcxt.values[idx].Table.Limit; 669 idx += 1; 670 assert(idx == WHvX64RegisterGdtr); 671 env->gdt.base = vcxt.values[idx].Table.Base; 672 env->gdt.limit = vcxt.values[idx].Table.Limit; 673 idx += 1; 674 675 /* CR0, 2, 3, 4, 8 */ 676 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 677 env->cr[0] = vcxt.values[idx++].Reg64; 678 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 679 env->cr[2] = vcxt.values[idx++].Reg64; 680 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 681 env->cr[3] = vcxt.values[idx++].Reg64; 682 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 683 env->cr[4] = vcxt.values[idx++].Reg64; 684 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 685 tpr = vcxt.values[idx++].Reg64; 686 if (tpr != vcpu->tpr) { 687 vcpu->tpr = tpr; 688 cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr)); 689 } 690 691 /* 8 Debug Registers - Skipped */ 692 693 /* 694 * Extended control registers needs to be handled separately depending 695 * on whether xsave is supported/enabled or not. 696 */ 697 whpx_get_xcrs(cpu); 698 699 /* 16 XMM registers */ 700 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 701 idx_next = idx + 16; 702 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 703 env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64; 704 env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64; 705 } 706 idx = idx_next; 707 708 /* 8 FP registers */ 709 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 710 for (i = 0; i < 8; i += 1, idx += 1) { 711 env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64; 712 /* env->fpregs[i].mmx.MMX_Q(1) = 713 vcxt.values[idx].Fp.AsUINT128.High64; 714 */ 715 } 716 717 /* FP control status register */ 718 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 719 env->fpuc = vcxt.values[idx].FpControlStatus.FpControl; 720 env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7; 721 env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800; 722 for (i = 0; i < 8; ++i) { 723 env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1); 724 } 725 env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp; 726 env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip; 727 idx += 1; 728 729 /* XMM control status register */ 730 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 731 env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl; 732 idx += 1; 733 734 /* MSRs */ 735 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 736 env->efer = vcxt.values[idx++].Reg64; 737 #ifdef TARGET_X86_64 738 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 739 env->kernelgsbase = vcxt.values[idx++].Reg64; 740 #endif 741 742 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 743 apic_base = vcxt.values[idx++].Reg64; 744 if (apic_base != vcpu->apic_base) { 745 vcpu->apic_base = apic_base; 746 cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base); 747 } 748 749 /* WHvX64RegisterPat - Skipped */ 750 751 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 752 env->sysenter_cs = vcxt.values[idx++].Reg64; 753 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 754 env->sysenter_eip = vcxt.values[idx++].Reg64; 755 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 756 env->sysenter_esp = vcxt.values[idx++].Reg64; 757 assert(whpx_register_names[idx] == WHvX64RegisterStar); 758 env->star = vcxt.values[idx++].Reg64; 759 #ifdef TARGET_X86_64 760 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 761 env->lstar = vcxt.values[idx++].Reg64; 762 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 763 env->cstar = vcxt.values[idx++].Reg64; 764 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 765 env->fmask = vcxt.values[idx++].Reg64; 766 #endif 767 768 /* Interrupt / Event Registers - Skipped */ 769 770 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 771 772 if (whpx_apic_in_platform()) { 773 whpx_apic_get(x86_cpu->apic_state); 774 } 775 776 x86_update_hflags(env); 777 778 return; 779 } 780 781 static HRESULT CALLBACK whpx_emu_ioport_callback( 782 void *ctx, 783 WHV_EMULATOR_IO_ACCESS_INFO *IoAccess) 784 { 785 MemTxAttrs attrs = { 0 }; 786 address_space_rw(&address_space_io, IoAccess->Port, attrs, 787 &IoAccess->Data, IoAccess->AccessSize, 788 IoAccess->Direction); 789 return S_OK; 790 } 791 792 static HRESULT CALLBACK whpx_emu_mmio_callback( 793 void *ctx, 794 WHV_EMULATOR_MEMORY_ACCESS_INFO *ma) 795 { 796 cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize, 797 ma->Direction); 798 return S_OK; 799 } 800 801 static HRESULT CALLBACK whpx_emu_getreg_callback( 802 void *ctx, 803 const WHV_REGISTER_NAME *RegisterNames, 804 UINT32 RegisterCount, 805 WHV_REGISTER_VALUE *RegisterValues) 806 { 807 HRESULT hr; 808 struct whpx_state *whpx = &whpx_global; 809 CPUState *cpu = (CPUState *)ctx; 810 811 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 812 whpx->partition, cpu->cpu_index, 813 RegisterNames, RegisterCount, 814 RegisterValues); 815 if (FAILED(hr)) { 816 error_report("WHPX: Failed to get virtual processor registers," 817 " hr=%08lx", hr); 818 } 819 820 return hr; 821 } 822 823 static HRESULT CALLBACK whpx_emu_setreg_callback( 824 void *ctx, 825 const WHV_REGISTER_NAME *RegisterNames, 826 UINT32 RegisterCount, 827 const WHV_REGISTER_VALUE *RegisterValues) 828 { 829 HRESULT hr; 830 struct whpx_state *whpx = &whpx_global; 831 CPUState *cpu = (CPUState *)ctx; 832 833 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 834 whpx->partition, cpu->cpu_index, 835 RegisterNames, RegisterCount, 836 RegisterValues); 837 if (FAILED(hr)) { 838 error_report("WHPX: Failed to set virtual processor registers," 839 " hr=%08lx", hr); 840 } 841 842 /* 843 * The emulator just successfully wrote the register state. We clear the 844 * dirty state so we avoid the double write on resume of the VP. 845 */ 846 cpu->vcpu_dirty = false; 847 848 return hr; 849 } 850 851 static HRESULT CALLBACK whpx_emu_translate_callback( 852 void *ctx, 853 WHV_GUEST_VIRTUAL_ADDRESS Gva, 854 WHV_TRANSLATE_GVA_FLAGS TranslateFlags, 855 WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult, 856 WHV_GUEST_PHYSICAL_ADDRESS *Gpa) 857 { 858 HRESULT hr; 859 struct whpx_state *whpx = &whpx_global; 860 CPUState *cpu = (CPUState *)ctx; 861 WHV_TRANSLATE_GVA_RESULT res; 862 863 hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index, 864 Gva, TranslateFlags, &res, Gpa); 865 if (FAILED(hr)) { 866 error_report("WHPX: Failed to translate GVA, hr=%08lx", hr); 867 } else { 868 *TranslationResult = res.ResultCode; 869 } 870 871 return hr; 872 } 873 874 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = { 875 .Size = sizeof(WHV_EMULATOR_CALLBACKS), 876 .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback, 877 .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback, 878 .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback, 879 .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback, 880 .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback, 881 }; 882 883 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx) 884 { 885 HRESULT hr; 886 AccelCPUState *vcpu = cpu->accel; 887 WHV_EMULATOR_STATUS emu_status; 888 889 hr = whp_dispatch.WHvEmulatorTryMmioEmulation( 890 vcpu->emulator, cpu, 891 &vcpu->exit_ctx.VpContext, ctx, 892 &emu_status); 893 if (FAILED(hr)) { 894 error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr); 895 return -1; 896 } 897 898 if (!emu_status.EmulationSuccessful) { 899 error_report("WHPX: Failed to emulate MMIO access with" 900 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 901 return -1; 902 } 903 904 return 0; 905 } 906 907 static int whpx_handle_portio(CPUState *cpu, 908 WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx) 909 { 910 HRESULT hr; 911 AccelCPUState *vcpu = cpu->accel; 912 WHV_EMULATOR_STATUS emu_status; 913 914 hr = whp_dispatch.WHvEmulatorTryIoEmulation( 915 vcpu->emulator, cpu, 916 &vcpu->exit_ctx.VpContext, ctx, 917 &emu_status); 918 if (FAILED(hr)) { 919 error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr); 920 return -1; 921 } 922 923 if (!emu_status.EmulationSuccessful) { 924 error_report("WHPX: Failed to emulate PortIO access with" 925 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 926 return -1; 927 } 928 929 return 0; 930 } 931 932 /* 933 * Controls whether we should intercept various exceptions on the guest, 934 * namely breakpoint/single-step events. 935 * 936 * The 'exceptions' argument accepts a bitmask, e.g: 937 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...) 938 */ 939 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions) 940 { 941 struct whpx_state *whpx = &whpx_global; 942 WHV_PARTITION_PROPERTY prop = { 0, }; 943 HRESULT hr; 944 945 if (exceptions == whpx->exception_exit_bitmap) { 946 return S_OK; 947 } 948 949 prop.ExceptionExitBitmap = exceptions; 950 951 hr = whp_dispatch.WHvSetPartitionProperty( 952 whpx->partition, 953 WHvPartitionPropertyCodeExceptionExitBitmap, 954 &prop, 955 sizeof(WHV_PARTITION_PROPERTY)); 956 957 if (SUCCEEDED(hr)) { 958 whpx->exception_exit_bitmap = exceptions; 959 } 960 961 return hr; 962 } 963 964 965 /* 966 * This function is called before/after stepping over a single instruction. 967 * It will update the CPU registers to arm/disarm the instruction stepping 968 * accordingly. 969 */ 970 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu, 971 bool set, 972 uint64_t *exit_context_rflags) 973 { 974 WHV_REGISTER_NAME reg_name; 975 WHV_REGISTER_VALUE reg_value; 976 HRESULT hr; 977 struct whpx_state *whpx = &whpx_global; 978 979 /* 980 * If we are trying to step over a single instruction, we need to set the 981 * TF bit in rflags. Otherwise, clear it. 982 */ 983 reg_name = WHvX64RegisterRflags; 984 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 985 whpx->partition, 986 cpu->cpu_index, 987 ®_name, 988 1, 989 ®_value); 990 991 if (FAILED(hr)) { 992 error_report("WHPX: Failed to get rflags, hr=%08lx", hr); 993 return hr; 994 } 995 996 if (exit_context_rflags) { 997 assert(*exit_context_rflags == reg_value.Reg64); 998 } 999 1000 if (set) { 1001 /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */ 1002 reg_value.Reg64 |= TF_MASK; 1003 } else { 1004 reg_value.Reg64 &= ~TF_MASK; 1005 } 1006 1007 if (exit_context_rflags) { 1008 *exit_context_rflags = reg_value.Reg64; 1009 } 1010 1011 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1012 whpx->partition, 1013 cpu->cpu_index, 1014 ®_name, 1015 1, 1016 ®_value); 1017 1018 if (FAILED(hr)) { 1019 error_report("WHPX: Failed to set rflags," 1020 " hr=%08lx", 1021 hr); 1022 return hr; 1023 } 1024 1025 reg_name = WHvRegisterInterruptState; 1026 reg_value.Reg64 = 0; 1027 1028 /* Suspend delivery of hardware interrupts during single-stepping. */ 1029 reg_value.InterruptState.InterruptShadow = set != 0; 1030 1031 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1032 whpx->partition, 1033 cpu->cpu_index, 1034 ®_name, 1035 1, 1036 ®_value); 1037 1038 if (FAILED(hr)) { 1039 error_report("WHPX: Failed to set InterruptState," 1040 " hr=%08lx", 1041 hr); 1042 return hr; 1043 } 1044 1045 if (!set) { 1046 /* 1047 * We have just finished stepping over a single instruction, 1048 * and intercepted the INT1 generated by it. 1049 * We need to now hide the INT1 from the guest, 1050 * as it would not be expecting it. 1051 */ 1052 1053 reg_name = WHvX64RegisterPendingDebugException; 1054 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1055 whpx->partition, 1056 cpu->cpu_index, 1057 ®_name, 1058 1, 1059 ®_value); 1060 1061 if (FAILED(hr)) { 1062 error_report("WHPX: Failed to get pending debug exceptions," 1063 "hr=%08lx", hr); 1064 return hr; 1065 } 1066 1067 if (reg_value.PendingDebugException.SingleStep) { 1068 reg_value.PendingDebugException.SingleStep = 0; 1069 1070 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1071 whpx->partition, 1072 cpu->cpu_index, 1073 ®_name, 1074 1, 1075 ®_value); 1076 1077 if (FAILED(hr)) { 1078 error_report("WHPX: Failed to clear pending debug exceptions," 1079 "hr=%08lx", hr); 1080 return hr; 1081 } 1082 } 1083 1084 } 1085 1086 return S_OK; 1087 } 1088 1089 /* Tries to find a breakpoint at the specified address. */ 1090 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address) 1091 { 1092 struct whpx_state *whpx = &whpx_global; 1093 int i; 1094 1095 if (whpx->breakpoints.breakpoints) { 1096 for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) { 1097 if (address == whpx->breakpoints.breakpoints->data[i].address) { 1098 return &whpx->breakpoints.breakpoints->data[i]; 1099 } 1100 } 1101 } 1102 1103 return NULL; 1104 } 1105 1106 /* 1107 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for 1108 * debugging user-mode applications. Since the WHPX API does not offer 1109 * an easy way to pass the intercepted exception back to the guest, we 1110 * resort to using INT1 instead, and let the guest always handle INT3. 1111 */ 1112 static const uint8_t whpx_breakpoint_instruction = 0xF1; 1113 1114 /* 1115 * The WHPX QEMU backend implements breakpoints by writing the INT1 1116 * instruction into memory (ignoring the DRx registers). This raises a few 1117 * issues that need to be carefully handled: 1118 * 1119 * 1. Although unlikely, other parts of QEMU may set multiple breakpoints 1120 * at the same location, and later remove them in arbitrary order. 1121 * This should not cause memory corruption, and should only remove the 1122 * physical breakpoint instruction when the last QEMU breakpoint is gone. 1123 * 1124 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid 1125 * physical location. Hence, physically adding/removing a breakpoint can 1126 * theoretically fail at any time. We need to keep track of it. 1127 * 1128 * The function below rebuilds a list of low-level breakpoints (one per 1129 * address, tracking the original instruction and any errors) from the list of 1130 * high-level breakpoints (set via cpu_breakpoint_insert()). 1131 * 1132 * In order to optimize performance, this function stores the list of 1133 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the 1134 * low-level ones, so that it won't be re-invoked until these breakpoints 1135 * change. 1136 * 1137 * Note that this function decides which breakpoints should be inserted into, 1138 * memory, but doesn't actually do it. The memory accessing is done in 1139 * whpx_apply_breakpoints(). 1140 */ 1141 static void whpx_translate_cpu_breakpoints( 1142 struct whpx_breakpoints *breakpoints, 1143 CPUState *cpu, 1144 int cpu_breakpoint_count) 1145 { 1146 CPUBreakpoint *bp; 1147 int cpu_bp_index = 0; 1148 1149 breakpoints->original_addresses = 1150 g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count); 1151 1152 breakpoints->original_address_count = cpu_breakpoint_count; 1153 1154 int max_breakpoints = cpu_breakpoint_count + 1155 (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0); 1156 1157 struct whpx_breakpoint_collection *new_breakpoints = 1158 g_malloc0(sizeof(struct whpx_breakpoint_collection) 1159 + max_breakpoints * sizeof(struct whpx_breakpoint)); 1160 1161 new_breakpoints->allocated = max_breakpoints; 1162 new_breakpoints->used = 0; 1163 1164 /* 1165 * 1. Preserve all old breakpoints that could not be automatically 1166 * cleared when the CPU got stopped. 1167 */ 1168 if (breakpoints->breakpoints) { 1169 int i; 1170 for (i = 0; i < breakpoints->breakpoints->used; i++) { 1171 if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) { 1172 new_breakpoints->data[new_breakpoints->used++] = 1173 breakpoints->breakpoints->data[i]; 1174 } 1175 } 1176 } 1177 1178 /* 2. Map all CPU breakpoints to WHPX breakpoints */ 1179 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1180 int i; 1181 bool found = false; 1182 1183 /* This will be used to detect changed CPU breakpoints later. */ 1184 breakpoints->original_addresses[cpu_bp_index++] = bp->pc; 1185 1186 for (i = 0; i < new_breakpoints->used; i++) { 1187 /* 1188 * WARNING: This loop has O(N^2) complexity, where N is the 1189 * number of breakpoints. It should not be a bottleneck in 1190 * real-world scenarios, since it only needs to run once after 1191 * the breakpoints have been modified. 1192 * If this ever becomes a concern, it can be optimized by storing 1193 * high-level breakpoint objects in a tree or hash map. 1194 */ 1195 1196 if (new_breakpoints->data[i].address == bp->pc) { 1197 /* There was already a breakpoint at this address. */ 1198 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) { 1199 new_breakpoints->data[i].state = WHPX_BP_SET; 1200 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) { 1201 new_breakpoints->data[i].state = WHPX_BP_SET_PENDING; 1202 } 1203 1204 found = true; 1205 break; 1206 } 1207 } 1208 1209 if (!found && new_breakpoints->used < new_breakpoints->allocated) { 1210 /* No WHPX breakpoint at this address. Create one. */ 1211 new_breakpoints->data[new_breakpoints->used].address = bp->pc; 1212 new_breakpoints->data[new_breakpoints->used].state = 1213 WHPX_BP_SET_PENDING; 1214 new_breakpoints->used++; 1215 } 1216 } 1217 1218 /* 1219 * Free the previous breakpoint list. This can be optimized by keeping 1220 * it as shadow buffer for the next computation instead of freeing 1221 * it immediately. 1222 */ 1223 g_free(breakpoints->breakpoints); 1224 1225 breakpoints->breakpoints = new_breakpoints; 1226 } 1227 1228 /* 1229 * Physically inserts/removes the breakpoints by reading and writing the 1230 * physical memory, keeping a track of the failed attempts. 1231 * 1232 * Passing resuming=true will try to set all previously unset breakpoints. 1233 * Passing resuming=false will remove all inserted ones. 1234 */ 1235 static void whpx_apply_breakpoints( 1236 struct whpx_breakpoint_collection *breakpoints, 1237 CPUState *cpu, 1238 bool resuming) 1239 { 1240 int i, rc; 1241 if (!breakpoints) { 1242 return; 1243 } 1244 1245 for (i = 0; i < breakpoints->used; i++) { 1246 /* Decide what to do right now based on the last known state. */ 1247 WhpxBreakpointState state = breakpoints->data[i].state; 1248 switch (state) { 1249 case WHPX_BP_CLEARED: 1250 if (resuming) { 1251 state = WHPX_BP_SET_PENDING; 1252 } 1253 break; 1254 case WHPX_BP_SET_PENDING: 1255 if (!resuming) { 1256 state = WHPX_BP_CLEARED; 1257 } 1258 break; 1259 case WHPX_BP_SET: 1260 if (!resuming) { 1261 state = WHPX_BP_CLEAR_PENDING; 1262 } 1263 break; 1264 case WHPX_BP_CLEAR_PENDING: 1265 if (resuming) { 1266 state = WHPX_BP_SET; 1267 } 1268 break; 1269 } 1270 1271 if (state == WHPX_BP_SET_PENDING) { 1272 /* Remember the original instruction. */ 1273 rc = cpu_memory_rw_debug(cpu, 1274 breakpoints->data[i].address, 1275 &breakpoints->data[i].original_instruction, 1276 1, 1277 false); 1278 1279 if (!rc) { 1280 /* Write the breakpoint instruction. */ 1281 rc = cpu_memory_rw_debug(cpu, 1282 breakpoints->data[i].address, 1283 (void *)&whpx_breakpoint_instruction, 1284 1, 1285 true); 1286 } 1287 1288 if (!rc) { 1289 state = WHPX_BP_SET; 1290 } 1291 1292 } 1293 1294 if (state == WHPX_BP_CLEAR_PENDING) { 1295 /* Restore the original instruction. */ 1296 rc = cpu_memory_rw_debug(cpu, 1297 breakpoints->data[i].address, 1298 &breakpoints->data[i].original_instruction, 1299 1, 1300 true); 1301 1302 if (!rc) { 1303 state = WHPX_BP_CLEARED; 1304 } 1305 } 1306 1307 breakpoints->data[i].state = state; 1308 } 1309 } 1310 1311 /* 1312 * This function is called when the a VCPU is about to start and no other 1313 * VCPUs have been started so far. Since the VCPU start order could be 1314 * arbitrary, it doesn't have to be VCPU#0. 1315 * 1316 * It is used to commit the breakpoints into memory, and configure WHPX 1317 * to intercept debug exceptions. 1318 * 1319 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or 1320 * more VCPUs are already running, so this is the best place to do it. 1321 */ 1322 static int whpx_first_vcpu_starting(CPUState *cpu) 1323 { 1324 struct whpx_state *whpx = &whpx_global; 1325 HRESULT hr; 1326 1327 g_assert(qemu_mutex_iothread_locked()); 1328 1329 if (!QTAILQ_EMPTY(&cpu->breakpoints) || 1330 (whpx->breakpoints.breakpoints && 1331 whpx->breakpoints.breakpoints->used)) { 1332 CPUBreakpoint *bp; 1333 int i = 0; 1334 bool update_pending = false; 1335 1336 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1337 if (i >= whpx->breakpoints.original_address_count || 1338 bp->pc != whpx->breakpoints.original_addresses[i]) { 1339 update_pending = true; 1340 } 1341 1342 i++; 1343 } 1344 1345 if (i != whpx->breakpoints.original_address_count) { 1346 update_pending = true; 1347 } 1348 1349 if (update_pending) { 1350 /* 1351 * The CPU breakpoints have changed since the last call to 1352 * whpx_translate_cpu_breakpoints(). WHPX breakpoints must 1353 * now be recomputed. 1354 */ 1355 whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i); 1356 } 1357 1358 /* Actually insert the breakpoints into the memory. */ 1359 whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true); 1360 } 1361 1362 uint64_t exception_mask; 1363 if (whpx->step_pending || 1364 (whpx->breakpoints.breakpoints && 1365 whpx->breakpoints.breakpoints->used)) { 1366 /* 1367 * We are either attempting to single-step one or more CPUs, or 1368 * have one or more breakpoints enabled. Both require intercepting 1369 * the WHvX64ExceptionTypeBreakpointTrap exception. 1370 */ 1371 1372 exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault; 1373 } else { 1374 /* Let the guest handle all exceptions. */ 1375 exception_mask = 0; 1376 } 1377 1378 hr = whpx_set_exception_exit_bitmap(exception_mask); 1379 if (!SUCCEEDED(hr)) { 1380 error_report("WHPX: Failed to update exception exit mask," 1381 "hr=%08lx.", hr); 1382 return 1; 1383 } 1384 1385 return 0; 1386 } 1387 1388 /* 1389 * This function is called when the last VCPU has finished running. 1390 * It is used to remove any previously set breakpoints from memory. 1391 */ 1392 static int whpx_last_vcpu_stopping(CPUState *cpu) 1393 { 1394 whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false); 1395 return 0; 1396 } 1397 1398 /* Returns the address of the next instruction that is about to be executed. */ 1399 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid) 1400 { 1401 if (cpu->vcpu_dirty) { 1402 /* The CPU registers have been modified by other parts of QEMU. */ 1403 CPUArchState *env = (CPUArchState *)(cpu->env_ptr); 1404 return env->eip; 1405 } else if (exit_context_valid) { 1406 /* 1407 * The CPU registers have not been modified by neither other parts 1408 * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters(). 1409 * This is the most common case. 1410 */ 1411 AccelCPUState *vcpu = cpu->accel; 1412 return vcpu->exit_ctx.VpContext.Rip; 1413 } else { 1414 /* 1415 * The CPU registers have been modified by a call to 1416 * WHvSetVirtualProcessorRegisters() and must be re-queried from 1417 * the target. 1418 */ 1419 WHV_REGISTER_VALUE reg_value; 1420 WHV_REGISTER_NAME reg_name = WHvX64RegisterRip; 1421 HRESULT hr; 1422 struct whpx_state *whpx = &whpx_global; 1423 1424 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1425 whpx->partition, 1426 cpu->cpu_index, 1427 ®_name, 1428 1, 1429 ®_value); 1430 1431 if (FAILED(hr)) { 1432 error_report("WHPX: Failed to get PC, hr=%08lx", hr); 1433 return 0; 1434 } 1435 1436 return reg_value.Reg64; 1437 } 1438 } 1439 1440 static int whpx_handle_halt(CPUState *cpu) 1441 { 1442 CPUX86State *env = cpu->env_ptr; 1443 int ret = 0; 1444 1445 qemu_mutex_lock_iothread(); 1446 if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1447 (env->eflags & IF_MASK)) && 1448 !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1449 cpu->exception_index = EXCP_HLT; 1450 cpu->halted = true; 1451 ret = 1; 1452 } 1453 qemu_mutex_unlock_iothread(); 1454 1455 return ret; 1456 } 1457 1458 static void whpx_vcpu_pre_run(CPUState *cpu) 1459 { 1460 HRESULT hr; 1461 struct whpx_state *whpx = &whpx_global; 1462 AccelCPUState *vcpu = cpu->accel; 1463 CPUX86State *env = cpu->env_ptr; 1464 X86CPU *x86_cpu = X86_CPU(cpu); 1465 int irq; 1466 uint8_t tpr; 1467 WHV_X64_PENDING_INTERRUPTION_REGISTER new_int; 1468 UINT32 reg_count = 0; 1469 WHV_REGISTER_VALUE reg_values[3]; 1470 WHV_REGISTER_NAME reg_names[3]; 1471 1472 memset(&new_int, 0, sizeof(new_int)); 1473 memset(reg_values, 0, sizeof(reg_values)); 1474 1475 qemu_mutex_lock_iothread(); 1476 1477 /* Inject NMI */ 1478 if (!vcpu->interruption_pending && 1479 cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { 1480 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { 1481 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 1482 vcpu->interruptable = false; 1483 new_int.InterruptionType = WHvX64PendingNmi; 1484 new_int.InterruptionPending = 1; 1485 new_int.InterruptionVector = 2; 1486 } 1487 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 1488 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 1489 } 1490 } 1491 1492 /* 1493 * Force the VCPU out of its inner loop to process any INIT requests or 1494 * commit pending TPR access. 1495 */ 1496 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { 1497 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1498 !(env->hflags & HF_SMM_MASK)) { 1499 cpu->exit_request = 1; 1500 } 1501 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1502 cpu->exit_request = 1; 1503 } 1504 } 1505 1506 /* Get pending hard interruption or replay one that was overwritten */ 1507 if (!whpx_apic_in_platform()) { 1508 if (!vcpu->interruption_pending && 1509 vcpu->interruptable && (env->eflags & IF_MASK)) { 1510 assert(!new_int.InterruptionPending); 1511 if (cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1512 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1513 irq = cpu_get_pic_interrupt(env); 1514 if (irq >= 0) { 1515 new_int.InterruptionType = WHvX64PendingInterrupt; 1516 new_int.InterruptionPending = 1; 1517 new_int.InterruptionVector = irq; 1518 } 1519 } 1520 } 1521 1522 /* Setup interrupt state if new one was prepared */ 1523 if (new_int.InterruptionPending) { 1524 reg_values[reg_count].PendingInterruption = new_int; 1525 reg_names[reg_count] = WHvRegisterPendingInterruption; 1526 reg_count += 1; 1527 } 1528 } else if (vcpu->ready_for_pic_interrupt && 1529 (cpu->interrupt_request & CPU_INTERRUPT_HARD)) { 1530 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1531 irq = cpu_get_pic_interrupt(env); 1532 if (irq >= 0) { 1533 reg_names[reg_count] = WHvRegisterPendingEvent; 1534 reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT) 1535 { 1536 .EventPending = 1, 1537 .EventType = WHvX64PendingEventExtInt, 1538 .Vector = irq, 1539 }; 1540 reg_count += 1; 1541 } 1542 } 1543 1544 /* Sync the TPR to the CR8 if was modified during the intercept */ 1545 tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 1546 if (tpr != vcpu->tpr) { 1547 vcpu->tpr = tpr; 1548 reg_values[reg_count].Reg64 = tpr; 1549 cpu->exit_request = 1; 1550 reg_names[reg_count] = WHvX64RegisterCr8; 1551 reg_count += 1; 1552 } 1553 1554 /* Update the state of the interrupt delivery notification */ 1555 if (!vcpu->window_registered && 1556 cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1557 reg_values[reg_count].DeliverabilityNotifications = 1558 (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) { 1559 .InterruptNotification = 1 1560 }; 1561 vcpu->window_registered = 1; 1562 reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications; 1563 reg_count += 1; 1564 } 1565 1566 qemu_mutex_unlock_iothread(); 1567 vcpu->ready_for_pic_interrupt = false; 1568 1569 if (reg_count) { 1570 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1571 whpx->partition, cpu->cpu_index, 1572 reg_names, reg_count, reg_values); 1573 if (FAILED(hr)) { 1574 error_report("WHPX: Failed to set interrupt state registers," 1575 " hr=%08lx", hr); 1576 } 1577 } 1578 1579 return; 1580 } 1581 1582 static void whpx_vcpu_post_run(CPUState *cpu) 1583 { 1584 AccelCPUState *vcpu = cpu->accel; 1585 CPUX86State *env = cpu->env_ptr; 1586 X86CPU *x86_cpu = X86_CPU(cpu); 1587 1588 env->eflags = vcpu->exit_ctx.VpContext.Rflags; 1589 1590 uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8; 1591 if (vcpu->tpr != tpr) { 1592 vcpu->tpr = tpr; 1593 qemu_mutex_lock_iothread(); 1594 cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr)); 1595 qemu_mutex_unlock_iothread(); 1596 } 1597 1598 vcpu->interruption_pending = 1599 vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending; 1600 1601 vcpu->interruptable = 1602 !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow; 1603 1604 return; 1605 } 1606 1607 static void whpx_vcpu_process_async_events(CPUState *cpu) 1608 { 1609 CPUX86State *env = cpu->env_ptr; 1610 X86CPU *x86_cpu = X86_CPU(cpu); 1611 AccelCPUState *vcpu = cpu->accel; 1612 1613 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1614 !(env->hflags & HF_SMM_MASK)) { 1615 whpx_cpu_synchronize_state(cpu); 1616 do_cpu_init(x86_cpu); 1617 vcpu->interruptable = true; 1618 } 1619 1620 if (cpu->interrupt_request & CPU_INTERRUPT_POLL) { 1621 cpu->interrupt_request &= ~CPU_INTERRUPT_POLL; 1622 apic_poll_irq(x86_cpu->apic_state); 1623 } 1624 1625 if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1626 (env->eflags & IF_MASK)) || 1627 (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1628 cpu->halted = false; 1629 } 1630 1631 if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) { 1632 whpx_cpu_synchronize_state(cpu); 1633 do_cpu_sipi(x86_cpu); 1634 } 1635 1636 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1637 cpu->interrupt_request &= ~CPU_INTERRUPT_TPR; 1638 whpx_cpu_synchronize_state(cpu); 1639 apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip, 1640 env->tpr_access_type); 1641 } 1642 1643 return; 1644 } 1645 1646 static int whpx_vcpu_run(CPUState *cpu) 1647 { 1648 HRESULT hr; 1649 struct whpx_state *whpx = &whpx_global; 1650 AccelCPUState *vcpu = cpu->accel; 1651 struct whpx_breakpoint *stepped_over_bp = NULL; 1652 WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE; 1653 int ret; 1654 1655 g_assert(qemu_mutex_iothread_locked()); 1656 1657 if (whpx->running_cpus++ == 0) { 1658 /* Insert breakpoints into memory, update exception exit bitmap. */ 1659 ret = whpx_first_vcpu_starting(cpu); 1660 if (ret != 0) { 1661 return ret; 1662 } 1663 } 1664 1665 if (whpx->breakpoints.breakpoints && 1666 whpx->breakpoints.breakpoints->used > 0) 1667 { 1668 uint64_t pc = whpx_vcpu_get_pc(cpu, true); 1669 stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc); 1670 if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) { 1671 stepped_over_bp = NULL; 1672 } 1673 1674 if (stepped_over_bp) { 1675 /* 1676 * We are trying to run the instruction overwritten by an active 1677 * breakpoint. We will temporarily disable the breakpoint, suspend 1678 * other CPUs, and step over the instruction. 1679 */ 1680 exclusive_step_mode = WHPX_STEP_EXCLUSIVE; 1681 } 1682 } 1683 1684 if (exclusive_step_mode == WHPX_STEP_NONE) { 1685 whpx_vcpu_process_async_events(cpu); 1686 if (cpu->halted && !whpx_apic_in_platform()) { 1687 cpu->exception_index = EXCP_HLT; 1688 qatomic_set(&cpu->exit_request, false); 1689 return 0; 1690 } 1691 } 1692 1693 qemu_mutex_unlock_iothread(); 1694 1695 if (exclusive_step_mode != WHPX_STEP_NONE) { 1696 start_exclusive(); 1697 g_assert(cpu == current_cpu); 1698 g_assert(!cpu->running); 1699 cpu->running = true; 1700 1701 hr = whpx_set_exception_exit_bitmap( 1702 1UL << WHvX64ExceptionTypeDebugTrapOrFault); 1703 if (!SUCCEEDED(hr)) { 1704 error_report("WHPX: Failed to update exception exit mask, " 1705 "hr=%08lx.", hr); 1706 return 1; 1707 } 1708 1709 if (stepped_over_bp) { 1710 /* Temporarily disable the triggered breakpoint. */ 1711 cpu_memory_rw_debug(cpu, 1712 stepped_over_bp->address, 1713 &stepped_over_bp->original_instruction, 1714 1, 1715 true); 1716 } 1717 } else { 1718 cpu_exec_start(cpu); 1719 } 1720 1721 do { 1722 if (cpu->vcpu_dirty) { 1723 whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE); 1724 cpu->vcpu_dirty = false; 1725 } 1726 1727 if (exclusive_step_mode == WHPX_STEP_NONE) { 1728 whpx_vcpu_pre_run(cpu); 1729 1730 if (qatomic_read(&cpu->exit_request)) { 1731 whpx_vcpu_kick(cpu); 1732 } 1733 } 1734 1735 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1736 whpx_vcpu_configure_single_stepping(cpu, true, NULL); 1737 } 1738 1739 hr = whp_dispatch.WHvRunVirtualProcessor( 1740 whpx->partition, cpu->cpu_index, 1741 &vcpu->exit_ctx, sizeof(vcpu->exit_ctx)); 1742 1743 if (FAILED(hr)) { 1744 error_report("WHPX: Failed to exec a virtual processor," 1745 " hr=%08lx", hr); 1746 ret = -1; 1747 break; 1748 } 1749 1750 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1751 whpx_vcpu_configure_single_stepping(cpu, 1752 false, 1753 &vcpu->exit_ctx.VpContext.Rflags); 1754 } 1755 1756 whpx_vcpu_post_run(cpu); 1757 1758 switch (vcpu->exit_ctx.ExitReason) { 1759 case WHvRunVpExitReasonMemoryAccess: 1760 ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess); 1761 break; 1762 1763 case WHvRunVpExitReasonX64IoPortAccess: 1764 ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess); 1765 break; 1766 1767 case WHvRunVpExitReasonX64InterruptWindow: 1768 vcpu->ready_for_pic_interrupt = 1; 1769 vcpu->window_registered = 0; 1770 ret = 0; 1771 break; 1772 1773 case WHvRunVpExitReasonX64ApicEoi: 1774 assert(whpx_apic_in_platform()); 1775 ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector); 1776 break; 1777 1778 case WHvRunVpExitReasonX64Halt: 1779 /* 1780 * WARNING: as of build 19043.1526 (21H1), this exit reason is no 1781 * longer used. 1782 */ 1783 ret = whpx_handle_halt(cpu); 1784 break; 1785 1786 case WHvRunVpExitReasonX64ApicInitSipiTrap: { 1787 WHV_INTERRUPT_CONTROL ipi = {0}; 1788 uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr; 1789 uint32_t delivery_mode = 1790 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT; 1791 int dest_shorthand = 1792 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT; 1793 bool broadcast = false; 1794 bool include_self = false; 1795 uint32_t i; 1796 1797 /* We only registered for INIT and SIPI exits. */ 1798 if ((delivery_mode != APIC_DM_INIT) && 1799 (delivery_mode != APIC_DM_SIPI)) { 1800 error_report( 1801 "WHPX: Unexpected APIC exit that is not a INIT or SIPI"); 1802 break; 1803 } 1804 1805 if (delivery_mode == APIC_DM_INIT) { 1806 ipi.Type = WHvX64InterruptTypeInit; 1807 } else { 1808 ipi.Type = WHvX64InterruptTypeSipi; 1809 } 1810 1811 ipi.DestinationMode = 1812 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ? 1813 WHvX64InterruptDestinationModeLogical : 1814 WHvX64InterruptDestinationModePhysical; 1815 1816 ipi.TriggerMode = 1817 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ? 1818 WHvX64InterruptTriggerModeLevel : 1819 WHvX64InterruptTriggerModeEdge; 1820 1821 ipi.Vector = icr & APIC_VECTOR_MASK; 1822 switch (dest_shorthand) { 1823 /* no shorthand. Bits 56-63 contain the destination. */ 1824 case 0: 1825 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK; 1826 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1827 &ipi, sizeof(ipi)); 1828 if (FAILED(hr)) { 1829 error_report("WHPX: Failed to request interrupt hr=%08lx", 1830 hr); 1831 } 1832 1833 break; 1834 1835 /* self */ 1836 case 1: 1837 include_self = true; 1838 break; 1839 1840 /* broadcast, including self */ 1841 case 2: 1842 broadcast = true; 1843 include_self = true; 1844 break; 1845 1846 /* broadcast, excluding self */ 1847 case 3: 1848 broadcast = true; 1849 break; 1850 } 1851 1852 if (!broadcast && !include_self) { 1853 break; 1854 } 1855 1856 for (i = 0; i <= max_vcpu_index; i++) { 1857 if (i == cpu->cpu_index && !include_self) { 1858 continue; 1859 } 1860 1861 /* 1862 * Assuming that APIC Ids are identity mapped since 1863 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers 1864 * are not handled yet and the hypervisor doesn't allow the 1865 * guest to modify the APIC ID. 1866 */ 1867 ipi.Destination = i; 1868 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1869 &ipi, sizeof(ipi)); 1870 if (FAILED(hr)) { 1871 error_report( 1872 "WHPX: Failed to request SIPI for %d, hr=%08lx", 1873 i, hr); 1874 } 1875 } 1876 1877 break; 1878 } 1879 1880 case WHvRunVpExitReasonCanceled: 1881 if (exclusive_step_mode != WHPX_STEP_NONE) { 1882 /* 1883 * We are trying to step over a single instruction, and 1884 * likely got a request to stop from another thread. 1885 * Delay it until we are done stepping 1886 * over. 1887 */ 1888 ret = 0; 1889 } else { 1890 cpu->exception_index = EXCP_INTERRUPT; 1891 ret = 1; 1892 } 1893 break; 1894 case WHvRunVpExitReasonX64MsrAccess: { 1895 WHV_REGISTER_VALUE reg_values[3] = {0}; 1896 WHV_REGISTER_NAME reg_names[3]; 1897 UINT32 reg_count; 1898 1899 reg_names[0] = WHvX64RegisterRip; 1900 reg_names[1] = WHvX64RegisterRax; 1901 reg_names[2] = WHvX64RegisterRdx; 1902 1903 reg_values[0].Reg64 = 1904 vcpu->exit_ctx.VpContext.Rip + 1905 vcpu->exit_ctx.VpContext.InstructionLength; 1906 1907 /* 1908 * For all unsupported MSR access we: 1909 * ignore writes 1910 * return 0 on read. 1911 */ 1912 reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ? 1913 1 : 3; 1914 1915 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1916 whpx->partition, 1917 cpu->cpu_index, 1918 reg_names, reg_count, 1919 reg_values); 1920 1921 if (FAILED(hr)) { 1922 error_report("WHPX: Failed to set MsrAccess state " 1923 " registers, hr=%08lx", hr); 1924 } 1925 ret = 0; 1926 break; 1927 } 1928 case WHvRunVpExitReasonX64Cpuid: { 1929 WHV_REGISTER_VALUE reg_values[5]; 1930 WHV_REGISTER_NAME reg_names[5]; 1931 UINT32 reg_count = 5; 1932 UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0; 1933 X86CPU *x86_cpu = X86_CPU(cpu); 1934 CPUX86State *env = &x86_cpu->env; 1935 1936 memset(reg_values, 0, sizeof(reg_values)); 1937 1938 rip = vcpu->exit_ctx.VpContext.Rip + 1939 vcpu->exit_ctx.VpContext.InstructionLength; 1940 cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax; 1941 1942 /* 1943 * Ideally, these should be supplied to the hypervisor during VCPU 1944 * initialization and it should be able to satisfy this request. 1945 * But, currently, WHPX doesn't support setting CPUID values in the 1946 * hypervisor once the partition has been setup, which is too late 1947 * since VCPUs are realized later. For now, use the values from 1948 * QEMU to satisfy these requests, until WHPX adds support for 1949 * being able to set these values in the hypervisor at runtime. 1950 */ 1951 cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx, 1952 (UINT32 *)&rcx, (UINT32 *)&rdx); 1953 switch (cpuid_fn) { 1954 case 0x40000000: 1955 /* Expose the vmware cpu frequency cpuid leaf */ 1956 rax = 0x40000010; 1957 rbx = rcx = rdx = 0; 1958 break; 1959 1960 case 0x40000010: 1961 rax = env->tsc_khz; 1962 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */ 1963 rcx = rdx = 0; 1964 break; 1965 1966 case 0x80000001: 1967 /* Remove any support of OSVW */ 1968 rcx &= ~CPUID_EXT3_OSVW; 1969 break; 1970 } 1971 1972 reg_names[0] = WHvX64RegisterRip; 1973 reg_names[1] = WHvX64RegisterRax; 1974 reg_names[2] = WHvX64RegisterRcx; 1975 reg_names[3] = WHvX64RegisterRdx; 1976 reg_names[4] = WHvX64RegisterRbx; 1977 1978 reg_values[0].Reg64 = rip; 1979 reg_values[1].Reg64 = rax; 1980 reg_values[2].Reg64 = rcx; 1981 reg_values[3].Reg64 = rdx; 1982 reg_values[4].Reg64 = rbx; 1983 1984 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1985 whpx->partition, cpu->cpu_index, 1986 reg_names, 1987 reg_count, 1988 reg_values); 1989 1990 if (FAILED(hr)) { 1991 error_report("WHPX: Failed to set CpuidAccess state registers," 1992 " hr=%08lx", hr); 1993 } 1994 ret = 0; 1995 break; 1996 } 1997 case WHvRunVpExitReasonException: 1998 whpx_get_registers(cpu); 1999 2000 if ((vcpu->exit_ctx.VpException.ExceptionType == 2001 WHvX64ExceptionTypeDebugTrapOrFault) && 2002 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) && 2003 (vcpu->exit_ctx.VpException.InstructionBytes[0] == 2004 whpx_breakpoint_instruction)) { 2005 /* Stopped at a software breakpoint. */ 2006 cpu->exception_index = EXCP_DEBUG; 2007 } else if ((vcpu->exit_ctx.VpException.ExceptionType == 2008 WHvX64ExceptionTypeDebugTrapOrFault) && 2009 !cpu->singlestep_enabled) { 2010 /* 2011 * Just finished stepping over a breakpoint, but the 2012 * gdb does not expect us to do single-stepping. 2013 * Don't do anything special. 2014 */ 2015 cpu->exception_index = EXCP_INTERRUPT; 2016 } else { 2017 /* Another exception or debug event. Report it to GDB. */ 2018 cpu->exception_index = EXCP_DEBUG; 2019 } 2020 2021 ret = 1; 2022 break; 2023 case WHvRunVpExitReasonNone: 2024 case WHvRunVpExitReasonUnrecoverableException: 2025 case WHvRunVpExitReasonInvalidVpRegisterValue: 2026 case WHvRunVpExitReasonUnsupportedFeature: 2027 default: 2028 error_report("WHPX: Unexpected VP exit code %d", 2029 vcpu->exit_ctx.ExitReason); 2030 whpx_get_registers(cpu); 2031 qemu_mutex_lock_iothread(); 2032 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 2033 qemu_mutex_unlock_iothread(); 2034 break; 2035 } 2036 2037 } while (!ret); 2038 2039 if (stepped_over_bp) { 2040 /* Restore the breakpoint we stepped over */ 2041 cpu_memory_rw_debug(cpu, 2042 stepped_over_bp->address, 2043 (void *)&whpx_breakpoint_instruction, 2044 1, 2045 true); 2046 } 2047 2048 if (exclusive_step_mode != WHPX_STEP_NONE) { 2049 g_assert(cpu_in_exclusive_context(cpu)); 2050 cpu->running = false; 2051 end_exclusive(); 2052 2053 exclusive_step_mode = WHPX_STEP_NONE; 2054 } else { 2055 cpu_exec_end(cpu); 2056 } 2057 2058 qemu_mutex_lock_iothread(); 2059 current_cpu = cpu; 2060 2061 if (--whpx->running_cpus == 0) { 2062 whpx_last_vcpu_stopping(cpu); 2063 } 2064 2065 qatomic_set(&cpu->exit_request, false); 2066 2067 return ret < 0; 2068 } 2069 2070 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 2071 { 2072 if (!cpu->vcpu_dirty) { 2073 whpx_get_registers(cpu); 2074 cpu->vcpu_dirty = true; 2075 } 2076 } 2077 2078 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu, 2079 run_on_cpu_data arg) 2080 { 2081 whpx_set_registers(cpu, WHPX_SET_RESET_STATE); 2082 cpu->vcpu_dirty = false; 2083 } 2084 2085 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu, 2086 run_on_cpu_data arg) 2087 { 2088 whpx_set_registers(cpu, WHPX_SET_FULL_STATE); 2089 cpu->vcpu_dirty = false; 2090 } 2091 2092 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu, 2093 run_on_cpu_data arg) 2094 { 2095 cpu->vcpu_dirty = true; 2096 } 2097 2098 /* 2099 * CPU support. 2100 */ 2101 2102 void whpx_cpu_synchronize_state(CPUState *cpu) 2103 { 2104 if (!cpu->vcpu_dirty) { 2105 run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL); 2106 } 2107 } 2108 2109 void whpx_cpu_synchronize_post_reset(CPUState *cpu) 2110 { 2111 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2112 } 2113 2114 void whpx_cpu_synchronize_post_init(CPUState *cpu) 2115 { 2116 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2117 } 2118 2119 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu) 2120 { 2121 run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2122 } 2123 2124 void whpx_cpu_synchronize_pre_resume(bool step_pending) 2125 { 2126 whpx_global.step_pending = step_pending; 2127 } 2128 2129 /* 2130 * Vcpu support. 2131 */ 2132 2133 static Error *whpx_migration_blocker; 2134 2135 static void whpx_cpu_update_state(void *opaque, bool running, RunState state) 2136 { 2137 CPUX86State *env = opaque; 2138 2139 if (running) { 2140 env->tsc_valid = false; 2141 } 2142 } 2143 2144 int whpx_init_vcpu(CPUState *cpu) 2145 { 2146 HRESULT hr; 2147 struct whpx_state *whpx = &whpx_global; 2148 AccelCPUState *vcpu = NULL; 2149 Error *local_error = NULL; 2150 CPUX86State *env = cpu->env_ptr; 2151 X86CPU *x86_cpu = X86_CPU(cpu); 2152 UINT64 freq = 0; 2153 int ret; 2154 2155 /* Add migration blockers for all unsupported features of the 2156 * Windows Hypervisor Platform 2157 */ 2158 if (whpx_migration_blocker == NULL) { 2159 error_setg(&whpx_migration_blocker, 2160 "State blocked due to non-migratable CPUID feature support," 2161 "dirty memory tracking support, and XSAVE/XRSTOR support"); 2162 2163 if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) { 2164 error_report_err(local_error); 2165 error_free(whpx_migration_blocker); 2166 ret = -EINVAL; 2167 goto error; 2168 } 2169 } 2170 2171 vcpu = g_new0(AccelCPUState, 1); 2172 2173 hr = whp_dispatch.WHvEmulatorCreateEmulator( 2174 &whpx_emu_callbacks, 2175 &vcpu->emulator); 2176 if (FAILED(hr)) { 2177 error_report("WHPX: Failed to setup instruction completion support," 2178 " hr=%08lx", hr); 2179 ret = -EINVAL; 2180 goto error; 2181 } 2182 2183 hr = whp_dispatch.WHvCreateVirtualProcessor( 2184 whpx->partition, cpu->cpu_index, 0); 2185 if (FAILED(hr)) { 2186 error_report("WHPX: Failed to create a virtual processor," 2187 " hr=%08lx", hr); 2188 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2189 ret = -EINVAL; 2190 goto error; 2191 } 2192 2193 /* 2194 * vcpu's TSC frequency is either specified by user, or use the value 2195 * provided by Hyper-V if the former is not present. In the latter case, we 2196 * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC 2197 * frequency can be migrated later via this field. 2198 */ 2199 if (!env->tsc_khz) { 2200 hr = whp_dispatch.WHvGetCapability( 2201 WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq), 2202 NULL); 2203 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2204 if (FAILED(hr)) { 2205 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr); 2206 } else { 2207 env->tsc_khz = freq / 1000; /* Hz to KHz */ 2208 } 2209 } 2210 } 2211 2212 env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY; 2213 hr = whp_dispatch.WHvGetCapability( 2214 WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL); 2215 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2216 if (FAILED(hr)) { 2217 printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr); 2218 } else { 2219 env->apic_bus_freq = freq; 2220 } 2221 } 2222 2223 /* 2224 * If the vmware cpuid frequency leaf option is set, and we have a valid 2225 * tsc value, trap the corresponding cpuid's. 2226 */ 2227 if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) { 2228 UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010}; 2229 2230 hr = whp_dispatch.WHvSetPartitionProperty( 2231 whpx->partition, 2232 WHvPartitionPropertyCodeCpuidExitList, 2233 cpuidExitList, 2234 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2235 2236 if (FAILED(hr)) { 2237 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2238 hr); 2239 ret = -EINVAL; 2240 goto error; 2241 } 2242 } 2243 2244 vcpu->interruptable = true; 2245 cpu->vcpu_dirty = true; 2246 cpu->accel = vcpu; 2247 max_vcpu_index = max(max_vcpu_index, cpu->cpu_index); 2248 qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr); 2249 2250 return 0; 2251 2252 error: 2253 g_free(vcpu); 2254 2255 return ret; 2256 } 2257 2258 int whpx_vcpu_exec(CPUState *cpu) 2259 { 2260 int ret; 2261 int fatal; 2262 2263 for (;;) { 2264 if (cpu->exception_index >= EXCP_INTERRUPT) { 2265 ret = cpu->exception_index; 2266 cpu->exception_index = -1; 2267 break; 2268 } 2269 2270 fatal = whpx_vcpu_run(cpu); 2271 2272 if (fatal) { 2273 error_report("WHPX: Failed to exec a virtual processor"); 2274 abort(); 2275 } 2276 } 2277 2278 return ret; 2279 } 2280 2281 void whpx_destroy_vcpu(CPUState *cpu) 2282 { 2283 struct whpx_state *whpx = &whpx_global; 2284 AccelCPUState *vcpu = cpu->accel; 2285 2286 whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index); 2287 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2288 g_free(cpu->accel); 2289 return; 2290 } 2291 2292 void whpx_vcpu_kick(CPUState *cpu) 2293 { 2294 struct whpx_state *whpx = &whpx_global; 2295 whp_dispatch.WHvCancelRunVirtualProcessor( 2296 whpx->partition, cpu->cpu_index, 0); 2297 } 2298 2299 /* 2300 * Memory support. 2301 */ 2302 2303 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size, 2304 void *host_va, int add, int rom, 2305 const char *name) 2306 { 2307 struct whpx_state *whpx = &whpx_global; 2308 HRESULT hr; 2309 2310 /* 2311 if (add) { 2312 printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n", 2313 (void*)start_pa, (void*)size, host_va, 2314 (rom ? "ROM" : "RAM"), name); 2315 } else { 2316 printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n", 2317 (void*)start_pa, (void*)size, host_va, name); 2318 } 2319 */ 2320 2321 if (add) { 2322 hr = whp_dispatch.WHvMapGpaRange(whpx->partition, 2323 host_va, 2324 start_pa, 2325 size, 2326 (WHvMapGpaRangeFlagRead | 2327 WHvMapGpaRangeFlagExecute | 2328 (rom ? 0 : WHvMapGpaRangeFlagWrite))); 2329 } else { 2330 hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition, 2331 start_pa, 2332 size); 2333 } 2334 2335 if (FAILED(hr)) { 2336 error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes," 2337 " Host:%p, hr=%08lx", 2338 (add ? "MAP" : "UNMAP"), name, 2339 (void *)(uintptr_t)start_pa, (void *)size, host_va, hr); 2340 } 2341 } 2342 2343 static void whpx_process_section(MemoryRegionSection *section, int add) 2344 { 2345 MemoryRegion *mr = section->mr; 2346 hwaddr start_pa = section->offset_within_address_space; 2347 ram_addr_t size = int128_get64(section->size); 2348 unsigned int delta; 2349 uint64_t host_va; 2350 2351 if (!memory_region_is_ram(mr)) { 2352 return; 2353 } 2354 2355 delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask()); 2356 delta &= ~qemu_real_host_page_mask(); 2357 if (delta > size) { 2358 return; 2359 } 2360 start_pa += delta; 2361 size -= delta; 2362 size &= qemu_real_host_page_mask(); 2363 if (!size || (start_pa & ~qemu_real_host_page_mask())) { 2364 return; 2365 } 2366 2367 host_va = (uintptr_t)memory_region_get_ram_ptr(mr) 2368 + section->offset_within_region + delta; 2369 2370 whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add, 2371 memory_region_is_rom(mr), mr->name); 2372 } 2373 2374 static void whpx_region_add(MemoryListener *listener, 2375 MemoryRegionSection *section) 2376 { 2377 memory_region_ref(section->mr); 2378 whpx_process_section(section, 1); 2379 } 2380 2381 static void whpx_region_del(MemoryListener *listener, 2382 MemoryRegionSection *section) 2383 { 2384 whpx_process_section(section, 0); 2385 memory_region_unref(section->mr); 2386 } 2387 2388 static void whpx_transaction_begin(MemoryListener *listener) 2389 { 2390 } 2391 2392 static void whpx_transaction_commit(MemoryListener *listener) 2393 { 2394 } 2395 2396 static void whpx_log_sync(MemoryListener *listener, 2397 MemoryRegionSection *section) 2398 { 2399 MemoryRegion *mr = section->mr; 2400 2401 if (!memory_region_is_ram(mr)) { 2402 return; 2403 } 2404 2405 memory_region_set_dirty(mr, 0, int128_get64(section->size)); 2406 } 2407 2408 static MemoryListener whpx_memory_listener = { 2409 .name = "whpx", 2410 .begin = whpx_transaction_begin, 2411 .commit = whpx_transaction_commit, 2412 .region_add = whpx_region_add, 2413 .region_del = whpx_region_del, 2414 .log_sync = whpx_log_sync, 2415 .priority = MEMORY_LISTENER_PRIORITY_ACCEL, 2416 }; 2417 2418 static void whpx_memory_init(void) 2419 { 2420 memory_listener_register(&whpx_memory_listener, &address_space_memory); 2421 } 2422 2423 /* 2424 * Load the functions from the given library, using the given handle. If a 2425 * handle is provided, it is used, otherwise the library is opened. The 2426 * handle will be updated on return with the opened one. 2427 */ 2428 static bool load_whp_dispatch_fns(HMODULE *handle, 2429 WHPFunctionList function_list) 2430 { 2431 HMODULE hLib = *handle; 2432 2433 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll" 2434 #define WINHV_EMULATION_DLL "WinHvEmulation.dll" 2435 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \ 2436 whp_dispatch.function_name = \ 2437 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2438 2439 #define WHP_LOAD_FIELD(return_type, function_name, signature) \ 2440 whp_dispatch.function_name = \ 2441 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2442 if (!whp_dispatch.function_name) { \ 2443 error_report("Could not load function %s", #function_name); \ 2444 goto error; \ 2445 } \ 2446 2447 #define WHP_LOAD_LIB(lib_name, handle_lib) \ 2448 if (!handle_lib) { \ 2449 handle_lib = LoadLibrary(lib_name); \ 2450 if (!handle_lib) { \ 2451 error_report("Could not load library %s.", lib_name); \ 2452 goto error; \ 2453 } \ 2454 } \ 2455 2456 switch (function_list) { 2457 case WINHV_PLATFORM_FNS_DEFAULT: 2458 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2459 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD) 2460 break; 2461 2462 case WINHV_EMULATION_FNS_DEFAULT: 2463 WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib) 2464 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD) 2465 break; 2466 2467 case WINHV_PLATFORM_FNS_SUPPLEMENTAL: 2468 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2469 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL) 2470 break; 2471 } 2472 2473 *handle = hLib; 2474 return true; 2475 2476 error: 2477 if (hLib) { 2478 FreeLibrary(hLib); 2479 } 2480 2481 return false; 2482 } 2483 2484 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v, 2485 const char *name, void *opaque, 2486 Error **errp) 2487 { 2488 struct whpx_state *whpx = &whpx_global; 2489 OnOffSplit mode; 2490 2491 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 2492 return; 2493 } 2494 2495 switch (mode) { 2496 case ON_OFF_SPLIT_ON: 2497 whpx->kernel_irqchip_allowed = true; 2498 whpx->kernel_irqchip_required = true; 2499 break; 2500 2501 case ON_OFF_SPLIT_OFF: 2502 whpx->kernel_irqchip_allowed = false; 2503 whpx->kernel_irqchip_required = false; 2504 break; 2505 2506 case ON_OFF_SPLIT_SPLIT: 2507 error_setg(errp, "WHPX: split irqchip currently not supported"); 2508 error_append_hint(errp, 2509 "Try without kernel-irqchip or with kernel-irqchip=on|off"); 2510 break; 2511 2512 default: 2513 /* 2514 * The value was checked in visit_type_OnOffSplit() above. If 2515 * we get here, then something is wrong in QEMU. 2516 */ 2517 abort(); 2518 } 2519 } 2520 2521 /* 2522 * Partition support 2523 */ 2524 2525 static int whpx_accel_init(MachineState *ms) 2526 { 2527 struct whpx_state *whpx; 2528 int ret; 2529 HRESULT hr; 2530 WHV_CAPABILITY whpx_cap; 2531 UINT32 whpx_cap_size; 2532 WHV_PARTITION_PROPERTY prop; 2533 UINT32 cpuidExitList[] = {1, 0x80000001}; 2534 WHV_CAPABILITY_FEATURES features = {0}; 2535 2536 whpx = &whpx_global; 2537 2538 if (!init_whp_dispatch()) { 2539 ret = -ENOSYS; 2540 goto error; 2541 } 2542 2543 whpx->mem_quota = ms->ram_size; 2544 2545 hr = whp_dispatch.WHvGetCapability( 2546 WHvCapabilityCodeHypervisorPresent, &whpx_cap, 2547 sizeof(whpx_cap), &whpx_cap_size); 2548 if (FAILED(hr) || !whpx_cap.HypervisorPresent) { 2549 error_report("WHPX: No accelerator found, hr=%08lx", hr); 2550 ret = -ENOSPC; 2551 goto error; 2552 } 2553 2554 hr = whp_dispatch.WHvGetCapability( 2555 WHvCapabilityCodeFeatures, &features, sizeof(features), NULL); 2556 if (FAILED(hr)) { 2557 error_report("WHPX: Failed to query capabilities, hr=%08lx", hr); 2558 ret = -EINVAL; 2559 goto error; 2560 } 2561 2562 hr = whp_dispatch.WHvCreatePartition(&whpx->partition); 2563 if (FAILED(hr)) { 2564 error_report("WHPX: Failed to create partition, hr=%08lx", hr); 2565 ret = -EINVAL; 2566 goto error; 2567 } 2568 2569 /* 2570 * Query the XSAVE capability of the partition. Any error here is not 2571 * considered fatal. 2572 */ 2573 hr = whp_dispatch.WHvGetPartitionProperty( 2574 whpx->partition, 2575 WHvPartitionPropertyCodeProcessorXsaveFeatures, 2576 &whpx_xsave_cap, 2577 sizeof(whpx_xsave_cap), 2578 &whpx_cap_size); 2579 2580 /* 2581 * Windows version which don't support this property will return with the 2582 * specific error code. 2583 */ 2584 if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) { 2585 error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr); 2586 } 2587 2588 if (!whpx_has_xsave()) { 2589 printf("WHPX: Partition is not XSAVE capable\n"); 2590 } 2591 2592 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2593 prop.ProcessorCount = ms->smp.cpus; 2594 hr = whp_dispatch.WHvSetPartitionProperty( 2595 whpx->partition, 2596 WHvPartitionPropertyCodeProcessorCount, 2597 &prop, 2598 sizeof(WHV_PARTITION_PROPERTY)); 2599 2600 if (FAILED(hr)) { 2601 error_report("WHPX: Failed to set partition processor count to %u," 2602 " hr=%08lx", prop.ProcessorCount, hr); 2603 ret = -EINVAL; 2604 goto error; 2605 } 2606 2607 /* 2608 * Error out if WHP doesn't support apic emulation and user is requiring 2609 * it. 2610 */ 2611 if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation || 2612 !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) { 2613 error_report("WHPX: kernel irqchip requested, but unavailable. " 2614 "Try without kernel-irqchip or with kernel-irqchip=off"); 2615 ret = -EINVAL; 2616 goto error; 2617 } 2618 2619 if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation && 2620 whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) { 2621 WHV_X64_LOCAL_APIC_EMULATION_MODE mode = 2622 WHvX64LocalApicEmulationModeXApic; 2623 printf("WHPX: setting APIC emulation mode in the hypervisor\n"); 2624 hr = whp_dispatch.WHvSetPartitionProperty( 2625 whpx->partition, 2626 WHvPartitionPropertyCodeLocalApicEmulationMode, 2627 &mode, 2628 sizeof(mode)); 2629 if (FAILED(hr)) { 2630 error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr); 2631 if (whpx->kernel_irqchip_required) { 2632 error_report("WHPX: kernel irqchip requested, but unavailable"); 2633 ret = -EINVAL; 2634 goto error; 2635 } 2636 } else { 2637 whpx->apic_in_platform = true; 2638 } 2639 } 2640 2641 /* Register for MSR and CPUID exits */ 2642 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2643 prop.ExtendedVmExits.X64MsrExit = 1; 2644 prop.ExtendedVmExits.X64CpuidExit = 1; 2645 prop.ExtendedVmExits.ExceptionExit = 1; 2646 if (whpx_apic_in_platform()) { 2647 prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1; 2648 } 2649 2650 hr = whp_dispatch.WHvSetPartitionProperty( 2651 whpx->partition, 2652 WHvPartitionPropertyCodeExtendedVmExits, 2653 &prop, 2654 sizeof(WHV_PARTITION_PROPERTY)); 2655 if (FAILED(hr)) { 2656 error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr); 2657 ret = -EINVAL; 2658 goto error; 2659 } 2660 2661 hr = whp_dispatch.WHvSetPartitionProperty( 2662 whpx->partition, 2663 WHvPartitionPropertyCodeCpuidExitList, 2664 cpuidExitList, 2665 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2666 2667 if (FAILED(hr)) { 2668 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2669 hr); 2670 ret = -EINVAL; 2671 goto error; 2672 } 2673 2674 /* 2675 * We do not want to intercept any exceptions from the guest, 2676 * until we actually start debugging with gdb. 2677 */ 2678 whpx->exception_exit_bitmap = -1; 2679 hr = whpx_set_exception_exit_bitmap(0); 2680 2681 if (FAILED(hr)) { 2682 error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr); 2683 ret = -EINVAL; 2684 goto error; 2685 } 2686 2687 hr = whp_dispatch.WHvSetupPartition(whpx->partition); 2688 if (FAILED(hr)) { 2689 error_report("WHPX: Failed to setup partition, hr=%08lx", hr); 2690 ret = -EINVAL; 2691 goto error; 2692 } 2693 2694 whpx_memory_init(); 2695 2696 printf("Windows Hypervisor Platform accelerator is operational\n"); 2697 return 0; 2698 2699 error: 2700 2701 if (NULL != whpx->partition) { 2702 whp_dispatch.WHvDeletePartition(whpx->partition); 2703 whpx->partition = NULL; 2704 } 2705 2706 return ret; 2707 } 2708 2709 int whpx_enabled(void) 2710 { 2711 return whpx_allowed; 2712 } 2713 2714 bool whpx_apic_in_platform(void) { 2715 return whpx_global.apic_in_platform; 2716 } 2717 2718 static void whpx_accel_class_init(ObjectClass *oc, void *data) 2719 { 2720 AccelClass *ac = ACCEL_CLASS(oc); 2721 ac->name = "WHPX"; 2722 ac->init_machine = whpx_accel_init; 2723 ac->allowed = &whpx_allowed; 2724 2725 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 2726 NULL, whpx_set_kernel_irqchip, 2727 NULL, NULL); 2728 object_class_property_set_description(oc, "kernel-irqchip", 2729 "Configure WHPX in-kernel irqchip"); 2730 } 2731 2732 static void whpx_accel_instance_init(Object *obj) 2733 { 2734 struct whpx_state *whpx = &whpx_global; 2735 2736 memset(whpx, 0, sizeof(struct whpx_state)); 2737 /* Turn on kernel-irqchip, by default */ 2738 whpx->kernel_irqchip_allowed = true; 2739 } 2740 2741 static const TypeInfo whpx_accel_type = { 2742 .name = ACCEL_CLASS_NAME("whpx"), 2743 .parent = TYPE_ACCEL, 2744 .instance_init = whpx_accel_instance_init, 2745 .class_init = whpx_accel_class_init, 2746 }; 2747 2748 static void whpx_type_init(void) 2749 { 2750 type_register_static(&whpx_accel_type); 2751 } 2752 2753 bool init_whp_dispatch(void) 2754 { 2755 if (whp_dispatch_initialized) { 2756 return true; 2757 } 2758 2759 if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) { 2760 goto error; 2761 } 2762 2763 if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) { 2764 goto error; 2765 } 2766 2767 assert(load_whp_dispatch_fns(&hWinHvPlatform, 2768 WINHV_PLATFORM_FNS_SUPPLEMENTAL)); 2769 whp_dispatch_initialized = true; 2770 2771 return true; 2772 error: 2773 if (hWinHvPlatform) { 2774 FreeLibrary(hWinHvPlatform); 2775 } 2776 2777 if (hWinHvEmulation) { 2778 FreeLibrary(hWinHvEmulation); 2779 } 2780 2781 return false; 2782 } 2783 2784 type_init(whpx_type_init); 2785