/*
 * QEMU Windows Hypervisor Platform accelerator (WHPX)
 *
 * Copyright Microsoft Corp. 2017
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/address-spaces.h"
#include "exec/ioport.h"
#include "gdbstub/helpers.h"
#include "qemu/accel.h"
#include "sysemu/whpx.h"
#include "sysemu/cpus.h"
#include "sysemu/runstate.h"
#include "qemu/main-loop.h"
#include "hw/boards.h"
#include "hw/intc/ioapic.h"
#include "hw/i386/apic_internal.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "migration/blocker.h"
#include <winerror.h>

#include "whpx-internal.h"
#include "whpx-accel-ops.h"

#include <winhvplatform.h>
#include <winhvemulation.h>

#define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)

/*
 * Registers that whpx_set_registers() and whpx_get_registers() transfer to
 * and from the hypervisor in one call. Both functions walk this table with
 * a running index and assert on the entry they expect at each step, so the
 * order of the entries below must stay in sync with those functions.
 *
 * Entries that are commented out are not transferred by this table.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};

/*
 * Value buffer matching whpx_register_names[]: values[i] holds the value of
 * the register named by whpx_register_names[i].
 */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};

/*
 * The current implementation of instruction stepping sets the TF flag
 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
 *
 * This approach has a few limitations:
 *     1.
Stepping over a PUSHF/SAHF instruction will save the TF flag 158 * along with the other flags, possibly restoring it later. It would 159 * result in another INT1 when the flags are restored, triggering 160 * a stop in gdb that could be cleared by doing another step. 161 * 162 * Stepping over a POPF/LAHF instruction will let it overwrite the 163 * TF flags, ending the stepping mode. 164 * 165 * 2. Stepping over an instruction raising an exception (e.g. INT, DIV, 166 * or anything that could result in a page fault) will save the flags 167 * to the stack, clear the TF flag, and let the guest execute the 168 * handler. Normally, the guest will restore the original flags, 169 * that will continue single-stepping. 170 * 171 * 3. Debuggers running on the guest may wish to set TF to do instruction 172 * stepping. INT1 events generated by it would be intercepted by us, 173 * as long as the gdb is connected to QEMU. 174 * 175 * In practice this means that: 176 * 1. Stepping through flags-modifying instructions may cause gdb to 177 * continue or stop in unexpected places. This will be fully recoverable 178 * and will not crash the target. 179 * 180 * 2. Stepping over an instruction that triggers an exception will step 181 * over the exception handler, not into it. 182 * 183 * 3. Debugging the guest via gdb, while running debugger on the guest 184 * at the same time may lead to unexpected effects. Removing all 185 * breakpoints set via QEMU will prevent any further interference 186 * with the guest-level debuggers. 187 * 188 * The limitations can be addressed as shown below: 189 * 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of 190 * stepping through them. 
The exact semantics of the instructions is 191 * defined in the "Combined Volume Set of Intel 64 and IA-32 192 * Architectures Software Developer's Manuals", however it involves a 193 * fair amount of corner cases due to compatibility with real mode, 194 * virtual 8086 mode, and differences between 64-bit and 32-bit modes. 195 * 196 * 2. We could step into the guest's exception handlers using the following 197 * sequence: 198 * a. Temporarily enable catching of all exception types via 199 * whpx_set_exception_exit_bitmap(). 200 * b. Once an exception is intercepted, read the IDT/GDT and locate 201 * the original handler. 202 * c. Patch the original handler, injecting an INT3 at the beginning. 203 * d. Update the exception exit bitmap to only catch the 204 * WHvX64ExceptionTypeBreakpointTrap exception. 205 * e. Let the affected CPU run in the exclusive mode. 206 * f. Restore the original handler and the exception exit bitmap. 207 * Note that handling all corner cases related to IDT/GDT is harder 208 * than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a 209 * rough idea. 210 * 211 * 3. In order to properly support guest-level debugging in parallel with 212 * the QEMU-level debugging, we would need to be able to pass some INT1 213 * events to the guest. This could be done via the following methods: 214 * a. Using the WHvRegisterPendingEvent register. As of Windows 21H1, 215 * it seems to only work for interrupts and not software 216 * exceptions. 217 * b. Locating and patching the original handler by parsing IDT/GDT. 218 * This involves relatively complex logic outlined in the previous 219 * paragraph. 220 * c. Emulating the exception invocation (i.e. manually updating RIP, 221 * RFLAGS, and pushing the old values to stack). This is even more 222 * complicated than the previous option, since it involves checking 223 * CPL, gate attributes, and doing various adjustments depending 224 * on the current CPU mode, whether the CPL is changing, etc. 
225 */ 226 typedef enum WhpxStepMode { 227 WHPX_STEP_NONE = 0, 228 /* Halt other VCPUs */ 229 WHPX_STEP_EXCLUSIVE, 230 } WhpxStepMode; 231 232 struct AccelCPUState { 233 WHV_EMULATOR_HANDLE emulator; 234 bool window_registered; 235 bool interruptable; 236 bool ready_for_pic_interrupt; 237 uint64_t tpr; 238 uint64_t apic_base; 239 bool interruption_pending; 240 bool dirty; 241 242 /* Must be the last field as it may have a tail */ 243 WHV_RUN_VP_EXIT_CONTEXT exit_ctx; 244 }; 245 246 static bool whpx_allowed; 247 static bool whp_dispatch_initialized; 248 static HMODULE hWinHvPlatform, hWinHvEmulation; 249 static uint32_t max_vcpu_index; 250 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap; 251 252 struct whpx_state whpx_global; 253 struct WHPDispatch whp_dispatch; 254 255 static bool whpx_has_xsave(void) 256 { 257 return whpx_xsave_cap.XsaveSupport; 258 } 259 260 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86, 261 int r86) 262 { 263 WHV_X64_SEGMENT_REGISTER hs; 264 unsigned flags = qs->flags; 265 266 hs.Base = qs->base; 267 hs.Limit = qs->limit; 268 hs.Selector = qs->selector; 269 270 if (v86) { 271 hs.Attributes = 0; 272 hs.SegmentType = 3; 273 hs.Present = 1; 274 hs.DescriptorPrivilegeLevel = 3; 275 hs.NonSystemSegment = 1; 276 277 } else { 278 hs.Attributes = (flags >> DESC_TYPE_SHIFT); 279 280 if (r86) { 281 /* hs.Base &= 0xfffff; */ 282 } 283 } 284 285 return hs; 286 } 287 288 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs) 289 { 290 SegmentCache qs; 291 292 qs.base = hs->Base; 293 qs.limit = hs->Limit; 294 qs.selector = hs->Selector; 295 296 qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT; 297 298 return qs; 299 } 300 301 /* X64 Extended Control Registers */ 302 static void whpx_set_xcrs(CPUState *cpu) 303 { 304 HRESULT hr; 305 struct whpx_state *whpx = &whpx_global; 306 WHV_REGISTER_VALUE xcr0; 307 WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; 308 309 if (!whpx_has_xsave()) { 310 return; 311 } 
312 313 /* Only xcr0 is supported by the hypervisor currently */ 314 xcr0.Reg64 = cpu_env(cpu)->xcr0; 315 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 316 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); 317 if (FAILED(hr)) { 318 error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr); 319 } 320 } 321 322 static int whpx_set_tsc(CPUState *cpu) 323 { 324 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 325 WHV_REGISTER_VALUE tsc_val; 326 HRESULT hr; 327 struct whpx_state *whpx = &whpx_global; 328 329 /* 330 * Suspend the partition prior to setting the TSC to reduce the variance 331 * in TSC across vCPUs. When the first vCPU runs post suspend, the 332 * partition is automatically resumed. 333 */ 334 if (whp_dispatch.WHvSuspendPartitionTime) { 335 336 /* 337 * Unable to suspend partition while setting TSC is not a fatal 338 * error. It just increases the likelihood of TSC variance between 339 * vCPUs and some guest OS are able to handle that just fine. 340 */ 341 hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition); 342 if (FAILED(hr)) { 343 warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr); 344 } 345 } 346 347 tsc_val.Reg64 = cpu_env(cpu)->tsc; 348 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 349 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 350 if (FAILED(hr)) { 351 error_report("WHPX: Failed to set TSC, hr=%08lx", hr); 352 return -1; 353 } 354 355 return 0; 356 } 357 358 /* 359 * The CR8 register in the CPU is mapped to the TPR register of the APIC, 360 * however, they use a slightly different encoding. Specifically: 361 * 362 * APIC.TPR[bits 7:4] = CR8[bits 3:0] 363 * 364 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64 365 * and IA-32 Architectures Software Developer's Manual. 366 * 367 * The functions below translate the value of CR8 to TPR and vice versa. 
368 */ 369 370 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr) 371 { 372 return tpr >> 4; 373 } 374 375 static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8) 376 { 377 return cr8 << 4; 378 } 379 380 static void whpx_set_registers(CPUState *cpu, int level) 381 { 382 struct whpx_state *whpx = &whpx_global; 383 AccelCPUState *vcpu = cpu->accel; 384 X86CPU *x86_cpu = X86_CPU(cpu); 385 CPUX86State *env = &x86_cpu->env; 386 struct whpx_register_set vcxt; 387 HRESULT hr; 388 int idx; 389 int idx_next; 390 int i; 391 int v86, r86; 392 393 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 394 395 /* 396 * Following MSRs have side effects on the guest or are too heavy for 397 * runtime. Limit them to full state update. 398 */ 399 if (level >= WHPX_SET_RESET_STATE) { 400 whpx_set_tsc(cpu); 401 } 402 403 memset(&vcxt, 0, sizeof(struct whpx_register_set)); 404 405 v86 = (env->eflags & VM_MASK); 406 r86 = !(env->cr[0] & CR0_PE_MASK); 407 408 vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 409 vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state); 410 411 idx = 0; 412 413 /* Indexes for first 16 registers match between HV and QEMU definitions */ 414 idx_next = 16; 415 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 416 vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx]; 417 } 418 idx = idx_next; 419 420 /* Same goes for RIP and RFLAGS */ 421 assert(whpx_register_names[idx] == WHvX64RegisterRip); 422 vcxt.values[idx++].Reg64 = env->eip; 423 424 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 425 vcxt.values[idx++].Reg64 = env->eflags; 426 427 /* Translate 6+4 segment registers. 
HV and QEMU order matches */ 428 assert(idx == WHvX64RegisterEs); 429 for (i = 0; i < 6; i += 1, idx += 1) { 430 vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86); 431 } 432 433 assert(idx == WHvX64RegisterLdtr); 434 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0); 435 436 assert(idx == WHvX64RegisterTr); 437 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0); 438 439 assert(idx == WHvX64RegisterIdtr); 440 vcxt.values[idx].Table.Base = env->idt.base; 441 vcxt.values[idx].Table.Limit = env->idt.limit; 442 idx += 1; 443 444 assert(idx == WHvX64RegisterGdtr); 445 vcxt.values[idx].Table.Base = env->gdt.base; 446 vcxt.values[idx].Table.Limit = env->gdt.limit; 447 idx += 1; 448 449 /* CR0, 2, 3, 4, 8 */ 450 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 451 vcxt.values[idx++].Reg64 = env->cr[0]; 452 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 453 vcxt.values[idx++].Reg64 = env->cr[2]; 454 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 455 vcxt.values[idx++].Reg64 = env->cr[3]; 456 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 457 vcxt.values[idx++].Reg64 = env->cr[4]; 458 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 459 vcxt.values[idx++].Reg64 = vcpu->tpr; 460 461 /* 8 Debug Registers - Skipped */ 462 463 /* 464 * Extended control registers needs to be handled separately depending 465 * on whether xsave is supported/enabled or not. 
466 */ 467 whpx_set_xcrs(cpu); 468 469 /* 16 XMM registers */ 470 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 471 idx_next = idx + 16; 472 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 473 vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0); 474 vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1); 475 } 476 idx = idx_next; 477 478 /* 8 FP registers */ 479 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 480 for (i = 0; i < 8; i += 1, idx += 1) { 481 vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0); 482 /* vcxt.values[idx].Fp.AsUINT128.High64 = 483 env->fpregs[i].mmx.MMX_Q(1); 484 */ 485 } 486 487 /* FP control status register */ 488 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 489 vcxt.values[idx].FpControlStatus.FpControl = env->fpuc; 490 vcxt.values[idx].FpControlStatus.FpStatus = 491 (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; 492 vcxt.values[idx].FpControlStatus.FpTag = 0; 493 for (i = 0; i < 8; ++i) { 494 vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i; 495 } 496 vcxt.values[idx].FpControlStatus.Reserved = 0; 497 vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop; 498 vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip; 499 idx += 1; 500 501 /* XMM control status register */ 502 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 503 vcxt.values[idx].XmmControlStatus.LastFpRdp = 0; 504 vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr; 505 vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff; 506 idx += 1; 507 508 /* MSRs */ 509 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 510 vcxt.values[idx++].Reg64 = env->efer; 511 #ifdef TARGET_X86_64 512 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 513 vcxt.values[idx++].Reg64 = env->kernelgsbase; 514 #endif 515 516 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 517 vcxt.values[idx++].Reg64 = 
vcpu->apic_base; 518 519 /* WHvX64RegisterPat - Skipped */ 520 521 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 522 vcxt.values[idx++].Reg64 = env->sysenter_cs; 523 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 524 vcxt.values[idx++].Reg64 = env->sysenter_eip; 525 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 526 vcxt.values[idx++].Reg64 = env->sysenter_esp; 527 assert(whpx_register_names[idx] == WHvX64RegisterStar); 528 vcxt.values[idx++].Reg64 = env->star; 529 #ifdef TARGET_X86_64 530 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 531 vcxt.values[idx++].Reg64 = env->lstar; 532 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 533 vcxt.values[idx++].Reg64 = env->cstar; 534 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 535 vcxt.values[idx++].Reg64 = env->fmask; 536 #endif 537 538 /* Interrupt / Event Registers - Skipped */ 539 540 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 541 542 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 543 whpx->partition, cpu->cpu_index, 544 whpx_register_names, 545 RTL_NUMBER_OF(whpx_register_names), 546 &vcxt.values[0]); 547 548 if (FAILED(hr)) { 549 error_report("WHPX: Failed to set virtual processor context, hr=%08lx", 550 hr); 551 } 552 553 return; 554 } 555 556 static int whpx_get_tsc(CPUState *cpu) 557 { 558 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 559 WHV_REGISTER_VALUE tsc_val; 560 HRESULT hr; 561 struct whpx_state *whpx = &whpx_global; 562 563 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 564 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 565 if (FAILED(hr)) { 566 error_report("WHPX: Failed to get TSC, hr=%08lx", hr); 567 return -1; 568 } 569 570 cpu_env(cpu)->tsc = tsc_val.Reg64; 571 return 0; 572 } 573 574 /* X64 Extended Control Registers */ 575 static void whpx_get_xcrs(CPUState *cpu) 576 { 577 HRESULT hr; 578 struct whpx_state *whpx = &whpx_global; 579 WHV_REGISTER_VALUE xcr0; 580 WHV_REGISTER_NAME xcr0_name = 
WHvX64RegisterXCr0; 581 582 if (!whpx_has_xsave()) { 583 return; 584 } 585 586 /* Only xcr0 is supported by the hypervisor currently */ 587 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 588 whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); 589 if (FAILED(hr)) { 590 error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr); 591 return; 592 } 593 594 cpu_env(cpu)->xcr0 = xcr0.Reg64; 595 } 596 597 static void whpx_get_registers(CPUState *cpu) 598 { 599 struct whpx_state *whpx = &whpx_global; 600 AccelCPUState *vcpu = cpu->accel; 601 X86CPU *x86_cpu = X86_CPU(cpu); 602 CPUX86State *env = &x86_cpu->env; 603 struct whpx_register_set vcxt; 604 uint64_t tpr, apic_base; 605 HRESULT hr; 606 int idx; 607 int idx_next; 608 int i; 609 610 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 611 612 if (!env->tsc_valid) { 613 whpx_get_tsc(cpu); 614 env->tsc_valid = !runstate_is_running(); 615 } 616 617 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 618 whpx->partition, cpu->cpu_index, 619 whpx_register_names, 620 RTL_NUMBER_OF(whpx_register_names), 621 &vcxt.values[0]); 622 if (FAILED(hr)) { 623 error_report("WHPX: Failed to get virtual processor context, hr=%08lx", 624 hr); 625 } 626 627 if (whpx_apic_in_platform()) { 628 /* 629 * Fetch the TPR value from the emulated APIC. It may get overwritten 630 * below with the value from CR8 returned by 631 * WHvGetVirtualProcessorRegisters(). 
632 */ 633 whpx_apic_get(x86_cpu->apic_state); 634 vcpu->tpr = whpx_apic_tpr_to_cr8( 635 cpu_get_apic_tpr(x86_cpu->apic_state)); 636 } 637 638 idx = 0; 639 640 /* Indexes for first 16 registers match between HV and QEMU definitions */ 641 idx_next = 16; 642 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 643 env->regs[idx] = vcxt.values[idx].Reg64; 644 } 645 idx = idx_next; 646 647 /* Same goes for RIP and RFLAGS */ 648 assert(whpx_register_names[idx] == WHvX64RegisterRip); 649 env->eip = vcxt.values[idx++].Reg64; 650 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 651 env->eflags = vcxt.values[idx++].Reg64; 652 653 /* Translate 6+4 segment registers. HV and QEMU order matches */ 654 assert(idx == WHvX64RegisterEs); 655 for (i = 0; i < 6; i += 1, idx += 1) { 656 env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment); 657 } 658 659 assert(idx == WHvX64RegisterLdtr); 660 env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment); 661 assert(idx == WHvX64RegisterTr); 662 env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment); 663 assert(idx == WHvX64RegisterIdtr); 664 env->idt.base = vcxt.values[idx].Table.Base; 665 env->idt.limit = vcxt.values[idx].Table.Limit; 666 idx += 1; 667 assert(idx == WHvX64RegisterGdtr); 668 env->gdt.base = vcxt.values[idx].Table.Base; 669 env->gdt.limit = vcxt.values[idx].Table.Limit; 670 idx += 1; 671 672 /* CR0, 2, 3, 4, 8 */ 673 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 674 env->cr[0] = vcxt.values[idx++].Reg64; 675 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 676 env->cr[2] = vcxt.values[idx++].Reg64; 677 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 678 env->cr[3] = vcxt.values[idx++].Reg64; 679 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 680 env->cr[4] = vcxt.values[idx++].Reg64; 681 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 682 tpr = vcxt.values[idx++].Reg64; 683 if (tpr != vcpu->tpr) { 684 vcpu->tpr = tpr; 685 cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr)); 686 
} 687 688 /* 8 Debug Registers - Skipped */ 689 690 /* 691 * Extended control registers needs to be handled separately depending 692 * on whether xsave is supported/enabled or not. 693 */ 694 whpx_get_xcrs(cpu); 695 696 /* 16 XMM registers */ 697 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 698 idx_next = idx + 16; 699 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 700 env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64; 701 env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64; 702 } 703 idx = idx_next; 704 705 /* 8 FP registers */ 706 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 707 for (i = 0; i < 8; i += 1, idx += 1) { 708 env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64; 709 /* env->fpregs[i].mmx.MMX_Q(1) = 710 vcxt.values[idx].Fp.AsUINT128.High64; 711 */ 712 } 713 714 /* FP control status register */ 715 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 716 env->fpuc = vcxt.values[idx].FpControlStatus.FpControl; 717 env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7; 718 env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800; 719 for (i = 0; i < 8; ++i) { 720 env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1); 721 } 722 env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp; 723 env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip; 724 idx += 1; 725 726 /* XMM control status register */ 727 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 728 env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl; 729 idx += 1; 730 731 /* MSRs */ 732 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 733 env->efer = vcxt.values[idx++].Reg64; 734 #ifdef TARGET_X86_64 735 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 736 env->kernelgsbase = vcxt.values[idx++].Reg64; 737 #endif 738 739 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 740 apic_base = vcxt.values[idx++].Reg64; 
741 if (apic_base != vcpu->apic_base) { 742 vcpu->apic_base = apic_base; 743 cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base); 744 } 745 746 /* WHvX64RegisterPat - Skipped */ 747 748 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 749 env->sysenter_cs = vcxt.values[idx++].Reg64; 750 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 751 env->sysenter_eip = vcxt.values[idx++].Reg64; 752 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 753 env->sysenter_esp = vcxt.values[idx++].Reg64; 754 assert(whpx_register_names[idx] == WHvX64RegisterStar); 755 env->star = vcxt.values[idx++].Reg64; 756 #ifdef TARGET_X86_64 757 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 758 env->lstar = vcxt.values[idx++].Reg64; 759 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 760 env->cstar = vcxt.values[idx++].Reg64; 761 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 762 env->fmask = vcxt.values[idx++].Reg64; 763 #endif 764 765 /* Interrupt / Event Registers - Skipped */ 766 767 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 768 769 if (whpx_apic_in_platform()) { 770 whpx_apic_get(x86_cpu->apic_state); 771 } 772 773 x86_update_hflags(env); 774 775 return; 776 } 777 778 static HRESULT CALLBACK whpx_emu_ioport_callback( 779 void *ctx, 780 WHV_EMULATOR_IO_ACCESS_INFO *IoAccess) 781 { 782 MemTxAttrs attrs = { 0 }; 783 address_space_rw(&address_space_io, IoAccess->Port, attrs, 784 &IoAccess->Data, IoAccess->AccessSize, 785 IoAccess->Direction); 786 return S_OK; 787 } 788 789 static HRESULT CALLBACK whpx_emu_mmio_callback( 790 void *ctx, 791 WHV_EMULATOR_MEMORY_ACCESS_INFO *ma) 792 { 793 cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize, 794 ma->Direction); 795 return S_OK; 796 } 797 798 static HRESULT CALLBACK whpx_emu_getreg_callback( 799 void *ctx, 800 const WHV_REGISTER_NAME *RegisterNames, 801 UINT32 RegisterCount, 802 WHV_REGISTER_VALUE *RegisterValues) 803 { 804 HRESULT hr; 805 struct whpx_state 
*whpx = &whpx_global; 806 CPUState *cpu = (CPUState *)ctx; 807 808 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 809 whpx->partition, cpu->cpu_index, 810 RegisterNames, RegisterCount, 811 RegisterValues); 812 if (FAILED(hr)) { 813 error_report("WHPX: Failed to get virtual processor registers," 814 " hr=%08lx", hr); 815 } 816 817 return hr; 818 } 819 820 static HRESULT CALLBACK whpx_emu_setreg_callback( 821 void *ctx, 822 const WHV_REGISTER_NAME *RegisterNames, 823 UINT32 RegisterCount, 824 const WHV_REGISTER_VALUE *RegisterValues) 825 { 826 HRESULT hr; 827 struct whpx_state *whpx = &whpx_global; 828 CPUState *cpu = (CPUState *)ctx; 829 830 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 831 whpx->partition, cpu->cpu_index, 832 RegisterNames, RegisterCount, 833 RegisterValues); 834 if (FAILED(hr)) { 835 error_report("WHPX: Failed to set virtual processor registers," 836 " hr=%08lx", hr); 837 } 838 839 /* 840 * The emulator just successfully wrote the register state. We clear the 841 * dirty state so we avoid the double write on resume of the VP. 
842 */ 843 cpu->accel->dirty = false; 844 845 return hr; 846 } 847 848 static HRESULT CALLBACK whpx_emu_translate_callback( 849 void *ctx, 850 WHV_GUEST_VIRTUAL_ADDRESS Gva, 851 WHV_TRANSLATE_GVA_FLAGS TranslateFlags, 852 WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult, 853 WHV_GUEST_PHYSICAL_ADDRESS *Gpa) 854 { 855 HRESULT hr; 856 struct whpx_state *whpx = &whpx_global; 857 CPUState *cpu = (CPUState *)ctx; 858 WHV_TRANSLATE_GVA_RESULT res; 859 860 hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index, 861 Gva, TranslateFlags, &res, Gpa); 862 if (FAILED(hr)) { 863 error_report("WHPX: Failed to translate GVA, hr=%08lx", hr); 864 } else { 865 *TranslationResult = res.ResultCode; 866 } 867 868 return hr; 869 } 870 871 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = { 872 .Size = sizeof(WHV_EMULATOR_CALLBACKS), 873 .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback, 874 .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback, 875 .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback, 876 .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback, 877 .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback, 878 }; 879 880 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx) 881 { 882 HRESULT hr; 883 AccelCPUState *vcpu = cpu->accel; 884 WHV_EMULATOR_STATUS emu_status; 885 886 hr = whp_dispatch.WHvEmulatorTryMmioEmulation( 887 vcpu->emulator, cpu, 888 &vcpu->exit_ctx.VpContext, ctx, 889 &emu_status); 890 if (FAILED(hr)) { 891 error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr); 892 return -1; 893 } 894 895 if (!emu_status.EmulationSuccessful) { 896 error_report("WHPX: Failed to emulate MMIO access with" 897 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 898 return -1; 899 } 900 901 return 0; 902 } 903 904 static int whpx_handle_portio(CPUState *cpu, 905 WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx) 906 { 907 HRESULT hr; 908 AccelCPUState *vcpu = cpu->accel; 909 WHV_EMULATOR_STATUS emu_status; 910 
911 hr = whp_dispatch.WHvEmulatorTryIoEmulation( 912 vcpu->emulator, cpu, 913 &vcpu->exit_ctx.VpContext, ctx, 914 &emu_status); 915 if (FAILED(hr)) { 916 error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr); 917 return -1; 918 } 919 920 if (!emu_status.EmulationSuccessful) { 921 error_report("WHPX: Failed to emulate PortIO access with" 922 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 923 return -1; 924 } 925 926 return 0; 927 } 928 929 /* 930 * Controls whether we should intercept various exceptions on the guest, 931 * namely breakpoint/single-step events. 932 * 933 * The 'exceptions' argument accepts a bitmask, e.g: 934 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...) 935 */ 936 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions) 937 { 938 struct whpx_state *whpx = &whpx_global; 939 WHV_PARTITION_PROPERTY prop = { 0, }; 940 HRESULT hr; 941 942 if (exceptions == whpx->exception_exit_bitmap) { 943 return S_OK; 944 } 945 946 prop.ExceptionExitBitmap = exceptions; 947 948 hr = whp_dispatch.WHvSetPartitionProperty( 949 whpx->partition, 950 WHvPartitionPropertyCodeExceptionExitBitmap, 951 &prop, 952 sizeof(WHV_PARTITION_PROPERTY)); 953 954 if (SUCCEEDED(hr)) { 955 whpx->exception_exit_bitmap = exceptions; 956 } 957 958 return hr; 959 } 960 961 962 /* 963 * This function is called before/after stepping over a single instruction. 964 * It will update the CPU registers to arm/disarm the instruction stepping 965 * accordingly. 966 */ 967 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu, 968 bool set, 969 uint64_t *exit_context_rflags) 970 { 971 WHV_REGISTER_NAME reg_name; 972 WHV_REGISTER_VALUE reg_value; 973 HRESULT hr; 974 struct whpx_state *whpx = &whpx_global; 975 976 /* 977 * If we are trying to step over a single instruction, we need to set the 978 * TF bit in rflags. Otherwise, clear it. 
979 */ 980 reg_name = WHvX64RegisterRflags; 981 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 982 whpx->partition, 983 cpu->cpu_index, 984 ®_name, 985 1, 986 ®_value); 987 988 if (FAILED(hr)) { 989 error_report("WHPX: Failed to get rflags, hr=%08lx", hr); 990 return hr; 991 } 992 993 if (exit_context_rflags) { 994 assert(*exit_context_rflags == reg_value.Reg64); 995 } 996 997 if (set) { 998 /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */ 999 reg_value.Reg64 |= TF_MASK; 1000 } else { 1001 reg_value.Reg64 &= ~TF_MASK; 1002 } 1003 1004 if (exit_context_rflags) { 1005 *exit_context_rflags = reg_value.Reg64; 1006 } 1007 1008 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1009 whpx->partition, 1010 cpu->cpu_index, 1011 ®_name, 1012 1, 1013 ®_value); 1014 1015 if (FAILED(hr)) { 1016 error_report("WHPX: Failed to set rflags," 1017 " hr=%08lx", 1018 hr); 1019 return hr; 1020 } 1021 1022 reg_name = WHvRegisterInterruptState; 1023 reg_value.Reg64 = 0; 1024 1025 /* Suspend delivery of hardware interrupts during single-stepping. */ 1026 reg_value.InterruptState.InterruptShadow = set != 0; 1027 1028 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1029 whpx->partition, 1030 cpu->cpu_index, 1031 ®_name, 1032 1, 1033 ®_value); 1034 1035 if (FAILED(hr)) { 1036 error_report("WHPX: Failed to set InterruptState," 1037 " hr=%08lx", 1038 hr); 1039 return hr; 1040 } 1041 1042 if (!set) { 1043 /* 1044 * We have just finished stepping over a single instruction, 1045 * and intercepted the INT1 generated by it. 1046 * We need to now hide the INT1 from the guest, 1047 * as it would not be expecting it. 
1048 */ 1049 1050 reg_name = WHvX64RegisterPendingDebugException; 1051 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1052 whpx->partition, 1053 cpu->cpu_index, 1054 ®_name, 1055 1, 1056 ®_value); 1057 1058 if (FAILED(hr)) { 1059 error_report("WHPX: Failed to get pending debug exceptions," 1060 "hr=%08lx", hr); 1061 return hr; 1062 } 1063 1064 if (reg_value.PendingDebugException.SingleStep) { 1065 reg_value.PendingDebugException.SingleStep = 0; 1066 1067 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1068 whpx->partition, 1069 cpu->cpu_index, 1070 ®_name, 1071 1, 1072 ®_value); 1073 1074 if (FAILED(hr)) { 1075 error_report("WHPX: Failed to clear pending debug exceptions," 1076 "hr=%08lx", hr); 1077 return hr; 1078 } 1079 } 1080 1081 } 1082 1083 return S_OK; 1084 } 1085 1086 /* Tries to find a breakpoint at the specified address. */ 1087 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address) 1088 { 1089 struct whpx_state *whpx = &whpx_global; 1090 int i; 1091 1092 if (whpx->breakpoints.breakpoints) { 1093 for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) { 1094 if (address == whpx->breakpoints.breakpoints->data[i].address) { 1095 return &whpx->breakpoints.breakpoints->data[i]; 1096 } 1097 } 1098 } 1099 1100 return NULL; 1101 } 1102 1103 /* 1104 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for 1105 * debugging user-mode applications. Since the WHPX API does not offer 1106 * an easy way to pass the intercepted exception back to the guest, we 1107 * resort to using INT1 instead, and let the guest always handle INT3. 1108 */ 1109 static const uint8_t whpx_breakpoint_instruction = 0xF1; 1110 1111 /* 1112 * The WHPX QEMU backend implements breakpoints by writing the INT1 1113 * instruction into memory (ignoring the DRx registers). This raises a few 1114 * issues that need to be carefully handled: 1115 * 1116 * 1. 
Although unlikely, other parts of QEMU may set multiple breakpoints 1117 * at the same location, and later remove them in arbitrary order. 1118 * This should not cause memory corruption, and should only remove the 1119 * physical breakpoint instruction when the last QEMU breakpoint is gone. 1120 * 1121 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid 1122 * physical location. Hence, physically adding/removing a breakpoint can 1123 * theoretically fail at any time. We need to keep track of it. 1124 * 1125 * The function below rebuilds a list of low-level breakpoints (one per 1126 * address, tracking the original instruction and any errors) from the list of 1127 * high-level breakpoints (set via cpu_breakpoint_insert()). 1128 * 1129 * In order to optimize performance, this function stores the list of 1130 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the 1131 * low-level ones, so that it won't be re-invoked until these breakpoints 1132 * change. 1133 * 1134 * Note that this function decides which breakpoints should be inserted into, 1135 * memory, but doesn't actually do it. The memory accessing is done in 1136 * whpx_apply_breakpoints(). 1137 */ 1138 static void whpx_translate_cpu_breakpoints( 1139 struct whpx_breakpoints *breakpoints, 1140 CPUState *cpu, 1141 int cpu_breakpoint_count) 1142 { 1143 CPUBreakpoint *bp; 1144 int cpu_bp_index = 0; 1145 1146 breakpoints->original_addresses = 1147 g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count); 1148 1149 breakpoints->original_address_count = cpu_breakpoint_count; 1150 1151 int max_breakpoints = cpu_breakpoint_count + 1152 (breakpoints->breakpoints ? 
breakpoints->breakpoints->used : 0); 1153 1154 struct whpx_breakpoint_collection *new_breakpoints = 1155 g_malloc0(sizeof(struct whpx_breakpoint_collection) 1156 + max_breakpoints * sizeof(struct whpx_breakpoint)); 1157 1158 new_breakpoints->allocated = max_breakpoints; 1159 new_breakpoints->used = 0; 1160 1161 /* 1162 * 1. Preserve all old breakpoints that could not be automatically 1163 * cleared when the CPU got stopped. 1164 */ 1165 if (breakpoints->breakpoints) { 1166 int i; 1167 for (i = 0; i < breakpoints->breakpoints->used; i++) { 1168 if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) { 1169 new_breakpoints->data[new_breakpoints->used++] = 1170 breakpoints->breakpoints->data[i]; 1171 } 1172 } 1173 } 1174 1175 /* 2. Map all CPU breakpoints to WHPX breakpoints */ 1176 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1177 int i; 1178 bool found = false; 1179 1180 /* This will be used to detect changed CPU breakpoints later. */ 1181 breakpoints->original_addresses[cpu_bp_index++] = bp->pc; 1182 1183 for (i = 0; i < new_breakpoints->used; i++) { 1184 /* 1185 * WARNING: This loop has O(N^2) complexity, where N is the 1186 * number of breakpoints. It should not be a bottleneck in 1187 * real-world scenarios, since it only needs to run once after 1188 * the breakpoints have been modified. 1189 * If this ever becomes a concern, it can be optimized by storing 1190 * high-level breakpoint objects in a tree or hash map. 1191 */ 1192 1193 if (new_breakpoints->data[i].address == bp->pc) { 1194 /* There was already a breakpoint at this address. 
*/ 1195 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) { 1196 new_breakpoints->data[i].state = WHPX_BP_SET; 1197 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) { 1198 new_breakpoints->data[i].state = WHPX_BP_SET_PENDING; 1199 } 1200 1201 found = true; 1202 break; 1203 } 1204 } 1205 1206 if (!found && new_breakpoints->used < new_breakpoints->allocated) { 1207 /* No WHPX breakpoint at this address. Create one. */ 1208 new_breakpoints->data[new_breakpoints->used].address = bp->pc; 1209 new_breakpoints->data[new_breakpoints->used].state = 1210 WHPX_BP_SET_PENDING; 1211 new_breakpoints->used++; 1212 } 1213 } 1214 1215 /* 1216 * Free the previous breakpoint list. This can be optimized by keeping 1217 * it as shadow buffer for the next computation instead of freeing 1218 * it immediately. 1219 */ 1220 g_free(breakpoints->breakpoints); 1221 1222 breakpoints->breakpoints = new_breakpoints; 1223 } 1224 1225 /* 1226 * Physically inserts/removes the breakpoints by reading and writing the 1227 * physical memory, keeping a track of the failed attempts. 1228 * 1229 * Passing resuming=true will try to set all previously unset breakpoints. 1230 * Passing resuming=false will remove all inserted ones. 1231 */ 1232 static void whpx_apply_breakpoints( 1233 struct whpx_breakpoint_collection *breakpoints, 1234 CPUState *cpu, 1235 bool resuming) 1236 { 1237 int i, rc; 1238 if (!breakpoints) { 1239 return; 1240 } 1241 1242 for (i = 0; i < breakpoints->used; i++) { 1243 /* Decide what to do right now based on the last known state. 
*/ 1244 WhpxBreakpointState state = breakpoints->data[i].state; 1245 switch (state) { 1246 case WHPX_BP_CLEARED: 1247 if (resuming) { 1248 state = WHPX_BP_SET_PENDING; 1249 } 1250 break; 1251 case WHPX_BP_SET_PENDING: 1252 if (!resuming) { 1253 state = WHPX_BP_CLEARED; 1254 } 1255 break; 1256 case WHPX_BP_SET: 1257 if (!resuming) { 1258 state = WHPX_BP_CLEAR_PENDING; 1259 } 1260 break; 1261 case WHPX_BP_CLEAR_PENDING: 1262 if (resuming) { 1263 state = WHPX_BP_SET; 1264 } 1265 break; 1266 } 1267 1268 if (state == WHPX_BP_SET_PENDING) { 1269 /* Remember the original instruction. */ 1270 rc = cpu_memory_rw_debug(cpu, 1271 breakpoints->data[i].address, 1272 &breakpoints->data[i].original_instruction, 1273 1, 1274 false); 1275 1276 if (!rc) { 1277 /* Write the breakpoint instruction. */ 1278 rc = cpu_memory_rw_debug(cpu, 1279 breakpoints->data[i].address, 1280 (void *)&whpx_breakpoint_instruction, 1281 1, 1282 true); 1283 } 1284 1285 if (!rc) { 1286 state = WHPX_BP_SET; 1287 } 1288 1289 } 1290 1291 if (state == WHPX_BP_CLEAR_PENDING) { 1292 /* Restore the original instruction. */ 1293 rc = cpu_memory_rw_debug(cpu, 1294 breakpoints->data[i].address, 1295 &breakpoints->data[i].original_instruction, 1296 1, 1297 true); 1298 1299 if (!rc) { 1300 state = WHPX_BP_CLEARED; 1301 } 1302 } 1303 1304 breakpoints->data[i].state = state; 1305 } 1306 } 1307 1308 /* 1309 * This function is called when the a VCPU is about to start and no other 1310 * VCPUs have been started so far. Since the VCPU start order could be 1311 * arbitrary, it doesn't have to be VCPU#0. 1312 * 1313 * It is used to commit the breakpoints into memory, and configure WHPX 1314 * to intercept debug exceptions. 1315 * 1316 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or 1317 * more VCPUs are already running, so this is the best place to do it. 
1318 */ 1319 static int whpx_first_vcpu_starting(CPUState *cpu) 1320 { 1321 struct whpx_state *whpx = &whpx_global; 1322 HRESULT hr; 1323 1324 g_assert(bql_locked()); 1325 1326 if (!QTAILQ_EMPTY(&cpu->breakpoints) || 1327 (whpx->breakpoints.breakpoints && 1328 whpx->breakpoints.breakpoints->used)) { 1329 CPUBreakpoint *bp; 1330 int i = 0; 1331 bool update_pending = false; 1332 1333 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1334 if (i >= whpx->breakpoints.original_address_count || 1335 bp->pc != whpx->breakpoints.original_addresses[i]) { 1336 update_pending = true; 1337 } 1338 1339 i++; 1340 } 1341 1342 if (i != whpx->breakpoints.original_address_count) { 1343 update_pending = true; 1344 } 1345 1346 if (update_pending) { 1347 /* 1348 * The CPU breakpoints have changed since the last call to 1349 * whpx_translate_cpu_breakpoints(). WHPX breakpoints must 1350 * now be recomputed. 1351 */ 1352 whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i); 1353 } 1354 1355 /* Actually insert the breakpoints into the memory. */ 1356 whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true); 1357 } 1358 1359 uint64_t exception_mask; 1360 if (whpx->step_pending || 1361 (whpx->breakpoints.breakpoints && 1362 whpx->breakpoints.breakpoints->used)) { 1363 /* 1364 * We are either attempting to single-step one or more CPUs, or 1365 * have one or more breakpoints enabled. Both require intercepting 1366 * the WHvX64ExceptionTypeBreakpointTrap exception. 1367 */ 1368 1369 exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault; 1370 } else { 1371 /* Let the guest handle all exceptions. */ 1372 exception_mask = 0; 1373 } 1374 1375 hr = whpx_set_exception_exit_bitmap(exception_mask); 1376 if (!SUCCEEDED(hr)) { 1377 error_report("WHPX: Failed to update exception exit mask," 1378 "hr=%08lx.", hr); 1379 return 1; 1380 } 1381 1382 return 0; 1383 } 1384 1385 /* 1386 * This function is called when the last VCPU has finished running. 
1387 * It is used to remove any previously set breakpoints from memory. 1388 */ 1389 static int whpx_last_vcpu_stopping(CPUState *cpu) 1390 { 1391 whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false); 1392 return 0; 1393 } 1394 1395 /* Returns the address of the next instruction that is about to be executed. */ 1396 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid) 1397 { 1398 if (cpu->accel->dirty) { 1399 /* The CPU registers have been modified by other parts of QEMU. */ 1400 return cpu_env(cpu)->eip; 1401 } else if (exit_context_valid) { 1402 /* 1403 * The CPU registers have not been modified by neither other parts 1404 * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters(). 1405 * This is the most common case. 1406 */ 1407 AccelCPUState *vcpu = cpu->accel; 1408 return vcpu->exit_ctx.VpContext.Rip; 1409 } else { 1410 /* 1411 * The CPU registers have been modified by a call to 1412 * WHvSetVirtualProcessorRegisters() and must be re-queried from 1413 * the target. 
1414 */ 1415 WHV_REGISTER_VALUE reg_value; 1416 WHV_REGISTER_NAME reg_name = WHvX64RegisterRip; 1417 HRESULT hr; 1418 struct whpx_state *whpx = &whpx_global; 1419 1420 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1421 whpx->partition, 1422 cpu->cpu_index, 1423 ®_name, 1424 1, 1425 ®_value); 1426 1427 if (FAILED(hr)) { 1428 error_report("WHPX: Failed to get PC, hr=%08lx", hr); 1429 return 0; 1430 } 1431 1432 return reg_value.Reg64; 1433 } 1434 } 1435 1436 static int whpx_handle_halt(CPUState *cpu) 1437 { 1438 int ret = 0; 1439 1440 bql_lock(); 1441 if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1442 (cpu_env(cpu)->eflags & IF_MASK)) && 1443 !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1444 cpu->exception_index = EXCP_HLT; 1445 cpu->halted = true; 1446 ret = 1; 1447 } 1448 bql_unlock(); 1449 1450 return ret; 1451 } 1452 1453 static void whpx_vcpu_pre_run(CPUState *cpu) 1454 { 1455 HRESULT hr; 1456 struct whpx_state *whpx = &whpx_global; 1457 AccelCPUState *vcpu = cpu->accel; 1458 X86CPU *x86_cpu = X86_CPU(cpu); 1459 CPUX86State *env = &x86_cpu->env; 1460 int irq; 1461 uint8_t tpr; 1462 WHV_X64_PENDING_INTERRUPTION_REGISTER new_int; 1463 UINT32 reg_count = 0; 1464 WHV_REGISTER_VALUE reg_values[3]; 1465 WHV_REGISTER_NAME reg_names[3]; 1466 1467 memset(&new_int, 0, sizeof(new_int)); 1468 memset(reg_values, 0, sizeof(reg_values)); 1469 1470 bql_lock(); 1471 1472 /* Inject NMI */ 1473 if (!vcpu->interruption_pending && 1474 cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { 1475 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { 1476 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 1477 vcpu->interruptable = false; 1478 new_int.InterruptionType = WHvX64PendingNmi; 1479 new_int.InterruptionPending = 1; 1480 new_int.InterruptionVector = 2; 1481 } 1482 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 1483 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 1484 } 1485 } 1486 1487 /* 1488 * Force the VCPU out of its inner loop to process any INIT 
requests or 1489 * commit pending TPR access. 1490 */ 1491 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { 1492 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1493 !(env->hflags & HF_SMM_MASK)) { 1494 cpu->exit_request = 1; 1495 } 1496 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1497 cpu->exit_request = 1; 1498 } 1499 } 1500 1501 /* Get pending hard interruption or replay one that was overwritten */ 1502 if (!whpx_apic_in_platform()) { 1503 if (!vcpu->interruption_pending && 1504 vcpu->interruptable && (env->eflags & IF_MASK)) { 1505 assert(!new_int.InterruptionPending); 1506 if (cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1507 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1508 irq = cpu_get_pic_interrupt(env); 1509 if (irq >= 0) { 1510 new_int.InterruptionType = WHvX64PendingInterrupt; 1511 new_int.InterruptionPending = 1; 1512 new_int.InterruptionVector = irq; 1513 } 1514 } 1515 } 1516 1517 /* Setup interrupt state if new one was prepared */ 1518 if (new_int.InterruptionPending) { 1519 reg_values[reg_count].PendingInterruption = new_int; 1520 reg_names[reg_count] = WHvRegisterPendingInterruption; 1521 reg_count += 1; 1522 } 1523 } else if (vcpu->ready_for_pic_interrupt && 1524 (cpu->interrupt_request & CPU_INTERRUPT_HARD)) { 1525 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1526 irq = cpu_get_pic_interrupt(env); 1527 if (irq >= 0) { 1528 reg_names[reg_count] = WHvRegisterPendingEvent; 1529 reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT) 1530 { 1531 .EventPending = 1, 1532 .EventType = WHvX64PendingEventExtInt, 1533 .Vector = irq, 1534 }; 1535 reg_count += 1; 1536 } 1537 } 1538 1539 /* Sync the TPR to the CR8 if was modified during the intercept */ 1540 tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 1541 if (tpr != vcpu->tpr) { 1542 vcpu->tpr = tpr; 1543 reg_values[reg_count].Reg64 = tpr; 1544 cpu->exit_request = 1; 1545 reg_names[reg_count] = WHvX64RegisterCr8; 1546 reg_count += 1; 
1547 } 1548 1549 /* Update the state of the interrupt delivery notification */ 1550 if (!vcpu->window_registered && 1551 cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1552 reg_values[reg_count].DeliverabilityNotifications = 1553 (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) { 1554 .InterruptNotification = 1 1555 }; 1556 vcpu->window_registered = 1; 1557 reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications; 1558 reg_count += 1; 1559 } 1560 1561 bql_unlock(); 1562 vcpu->ready_for_pic_interrupt = false; 1563 1564 if (reg_count) { 1565 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1566 whpx->partition, cpu->cpu_index, 1567 reg_names, reg_count, reg_values); 1568 if (FAILED(hr)) { 1569 error_report("WHPX: Failed to set interrupt state registers," 1570 " hr=%08lx", hr); 1571 } 1572 } 1573 1574 return; 1575 } 1576 1577 static void whpx_vcpu_post_run(CPUState *cpu) 1578 { 1579 AccelCPUState *vcpu = cpu->accel; 1580 X86CPU *x86_cpu = X86_CPU(cpu); 1581 CPUX86State *env = &x86_cpu->env; 1582 1583 env->eflags = vcpu->exit_ctx.VpContext.Rflags; 1584 1585 uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8; 1586 if (vcpu->tpr != tpr) { 1587 vcpu->tpr = tpr; 1588 bql_lock(); 1589 cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr)); 1590 bql_unlock(); 1591 } 1592 1593 vcpu->interruption_pending = 1594 vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending; 1595 1596 vcpu->interruptable = 1597 !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow; 1598 1599 return; 1600 } 1601 1602 static void whpx_vcpu_process_async_events(CPUState *cpu) 1603 { 1604 X86CPU *x86_cpu = X86_CPU(cpu); 1605 CPUX86State *env = &x86_cpu->env; 1606 AccelCPUState *vcpu = cpu->accel; 1607 1608 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1609 !(env->hflags & HF_SMM_MASK)) { 1610 whpx_cpu_synchronize_state(cpu); 1611 do_cpu_init(x86_cpu); 1612 vcpu->interruptable = true; 1613 } 1614 1615 if (cpu->interrupt_request & CPU_INTERRUPT_POLL) { 1616 
cpu->interrupt_request &= ~CPU_INTERRUPT_POLL; 1617 apic_poll_irq(x86_cpu->apic_state); 1618 } 1619 1620 if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1621 (env->eflags & IF_MASK)) || 1622 (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1623 cpu->halted = false; 1624 } 1625 1626 if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) { 1627 whpx_cpu_synchronize_state(cpu); 1628 do_cpu_sipi(x86_cpu); 1629 } 1630 1631 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1632 cpu->interrupt_request &= ~CPU_INTERRUPT_TPR; 1633 whpx_cpu_synchronize_state(cpu); 1634 apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip, 1635 env->tpr_access_type); 1636 } 1637 1638 return; 1639 } 1640 1641 static int whpx_vcpu_run(CPUState *cpu) 1642 { 1643 HRESULT hr; 1644 struct whpx_state *whpx = &whpx_global; 1645 AccelCPUState *vcpu = cpu->accel; 1646 struct whpx_breakpoint *stepped_over_bp = NULL; 1647 WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE; 1648 int ret; 1649 1650 g_assert(bql_locked()); 1651 1652 if (whpx->running_cpus++ == 0) { 1653 /* Insert breakpoints into memory, update exception exit bitmap. */ 1654 ret = whpx_first_vcpu_starting(cpu); 1655 if (ret != 0) { 1656 return ret; 1657 } 1658 } 1659 1660 if (whpx->breakpoints.breakpoints && 1661 whpx->breakpoints.breakpoints->used > 0) 1662 { 1663 uint64_t pc = whpx_vcpu_get_pc(cpu, true); 1664 stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc); 1665 if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) { 1666 stepped_over_bp = NULL; 1667 } 1668 1669 if (stepped_over_bp) { 1670 /* 1671 * We are trying to run the instruction overwritten by an active 1672 * breakpoint. We will temporarily disable the breakpoint, suspend 1673 * other CPUs, and step over the instruction. 
1674 */ 1675 exclusive_step_mode = WHPX_STEP_EXCLUSIVE; 1676 } 1677 } 1678 1679 if (exclusive_step_mode == WHPX_STEP_NONE) { 1680 whpx_vcpu_process_async_events(cpu); 1681 if (cpu->halted && !whpx_apic_in_platform()) { 1682 cpu->exception_index = EXCP_HLT; 1683 qatomic_set(&cpu->exit_request, false); 1684 return 0; 1685 } 1686 } 1687 1688 bql_unlock(); 1689 1690 if (exclusive_step_mode != WHPX_STEP_NONE) { 1691 start_exclusive(); 1692 g_assert(cpu == current_cpu); 1693 g_assert(!cpu->running); 1694 cpu->running = true; 1695 1696 hr = whpx_set_exception_exit_bitmap( 1697 1UL << WHvX64ExceptionTypeDebugTrapOrFault); 1698 if (!SUCCEEDED(hr)) { 1699 error_report("WHPX: Failed to update exception exit mask, " 1700 "hr=%08lx.", hr); 1701 return 1; 1702 } 1703 1704 if (stepped_over_bp) { 1705 /* Temporarily disable the triggered breakpoint. */ 1706 cpu_memory_rw_debug(cpu, 1707 stepped_over_bp->address, 1708 &stepped_over_bp->original_instruction, 1709 1, 1710 true); 1711 } 1712 } else { 1713 cpu_exec_start(cpu); 1714 } 1715 1716 do { 1717 if (cpu->accel->dirty) { 1718 whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE); 1719 cpu->accel->dirty = false; 1720 } 1721 1722 if (exclusive_step_mode == WHPX_STEP_NONE) { 1723 whpx_vcpu_pre_run(cpu); 1724 1725 if (qatomic_read(&cpu->exit_request)) { 1726 whpx_vcpu_kick(cpu); 1727 } 1728 } 1729 1730 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1731 whpx_vcpu_configure_single_stepping(cpu, true, NULL); 1732 } 1733 1734 hr = whp_dispatch.WHvRunVirtualProcessor( 1735 whpx->partition, cpu->cpu_index, 1736 &vcpu->exit_ctx, sizeof(vcpu->exit_ctx)); 1737 1738 if (FAILED(hr)) { 1739 error_report("WHPX: Failed to exec a virtual processor," 1740 " hr=%08lx", hr); 1741 ret = -1; 1742 break; 1743 } 1744 1745 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1746 whpx_vcpu_configure_single_stepping(cpu, 1747 false, 1748 &vcpu->exit_ctx.VpContext.Rflags); 1749 } 1750 1751 whpx_vcpu_post_run(cpu); 1752 
1753 switch (vcpu->exit_ctx.ExitReason) { 1754 case WHvRunVpExitReasonMemoryAccess: 1755 ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess); 1756 break; 1757 1758 case WHvRunVpExitReasonX64IoPortAccess: 1759 ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess); 1760 break; 1761 1762 case WHvRunVpExitReasonX64InterruptWindow: 1763 vcpu->ready_for_pic_interrupt = 1; 1764 vcpu->window_registered = 0; 1765 ret = 0; 1766 break; 1767 1768 case WHvRunVpExitReasonX64ApicEoi: 1769 assert(whpx_apic_in_platform()); 1770 ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector); 1771 break; 1772 1773 case WHvRunVpExitReasonX64Halt: 1774 /* 1775 * WARNING: as of build 19043.1526 (21H1), this exit reason is no 1776 * longer used. 1777 */ 1778 ret = whpx_handle_halt(cpu); 1779 break; 1780 1781 case WHvRunVpExitReasonX64ApicInitSipiTrap: { 1782 WHV_INTERRUPT_CONTROL ipi = {0}; 1783 uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr; 1784 uint32_t delivery_mode = 1785 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT; 1786 int dest_shorthand = 1787 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT; 1788 bool broadcast = false; 1789 bool include_self = false; 1790 uint32_t i; 1791 1792 /* We only registered for INIT and SIPI exits. */ 1793 if ((delivery_mode != APIC_DM_INIT) && 1794 (delivery_mode != APIC_DM_SIPI)) { 1795 error_report( 1796 "WHPX: Unexpected APIC exit that is not a INIT or SIPI"); 1797 break; 1798 } 1799 1800 if (delivery_mode == APIC_DM_INIT) { 1801 ipi.Type = WHvX64InterruptTypeInit; 1802 } else { 1803 ipi.Type = WHvX64InterruptTypeSipi; 1804 } 1805 1806 ipi.DestinationMode = 1807 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ? 1808 WHvX64InterruptDestinationModeLogical : 1809 WHvX64InterruptDestinationModePhysical; 1810 1811 ipi.TriggerMode = 1812 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ? 
1813 WHvX64InterruptTriggerModeLevel : 1814 WHvX64InterruptTriggerModeEdge; 1815 1816 ipi.Vector = icr & APIC_VECTOR_MASK; 1817 switch (dest_shorthand) { 1818 /* no shorthand. Bits 56-63 contain the destination. */ 1819 case 0: 1820 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK; 1821 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1822 &ipi, sizeof(ipi)); 1823 if (FAILED(hr)) { 1824 error_report("WHPX: Failed to request interrupt hr=%08lx", 1825 hr); 1826 } 1827 1828 break; 1829 1830 /* self */ 1831 case 1: 1832 include_self = true; 1833 break; 1834 1835 /* broadcast, including self */ 1836 case 2: 1837 broadcast = true; 1838 include_self = true; 1839 break; 1840 1841 /* broadcast, excluding self */ 1842 case 3: 1843 broadcast = true; 1844 break; 1845 } 1846 1847 if (!broadcast && !include_self) { 1848 break; 1849 } 1850 1851 for (i = 0; i <= max_vcpu_index; i++) { 1852 if (i == cpu->cpu_index && !include_self) { 1853 continue; 1854 } 1855 1856 /* 1857 * Assuming that APIC Ids are identity mapped since 1858 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers 1859 * are not handled yet and the hypervisor doesn't allow the 1860 * guest to modify the APIC ID. 1861 */ 1862 ipi.Destination = i; 1863 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1864 &ipi, sizeof(ipi)); 1865 if (FAILED(hr)) { 1866 error_report( 1867 "WHPX: Failed to request SIPI for %d, hr=%08lx", 1868 i, hr); 1869 } 1870 } 1871 1872 break; 1873 } 1874 1875 case WHvRunVpExitReasonCanceled: 1876 if (exclusive_step_mode != WHPX_STEP_NONE) { 1877 /* 1878 * We are trying to step over a single instruction, and 1879 * likely got a request to stop from another thread. 1880 * Delay it until we are done stepping 1881 * over. 
1882 */ 1883 ret = 0; 1884 } else { 1885 cpu->exception_index = EXCP_INTERRUPT; 1886 ret = 1; 1887 } 1888 break; 1889 case WHvRunVpExitReasonX64MsrAccess: { 1890 WHV_REGISTER_VALUE reg_values[3] = {0}; 1891 WHV_REGISTER_NAME reg_names[3]; 1892 UINT32 reg_count; 1893 1894 reg_names[0] = WHvX64RegisterRip; 1895 reg_names[1] = WHvX64RegisterRax; 1896 reg_names[2] = WHvX64RegisterRdx; 1897 1898 reg_values[0].Reg64 = 1899 vcpu->exit_ctx.VpContext.Rip + 1900 vcpu->exit_ctx.VpContext.InstructionLength; 1901 1902 /* 1903 * For all unsupported MSR access we: 1904 * ignore writes 1905 * return 0 on read. 1906 */ 1907 reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ? 1908 1 : 3; 1909 1910 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1911 whpx->partition, 1912 cpu->cpu_index, 1913 reg_names, reg_count, 1914 reg_values); 1915 1916 if (FAILED(hr)) { 1917 error_report("WHPX: Failed to set MsrAccess state " 1918 " registers, hr=%08lx", hr); 1919 } 1920 ret = 0; 1921 break; 1922 } 1923 case WHvRunVpExitReasonX64Cpuid: { 1924 WHV_REGISTER_VALUE reg_values[5]; 1925 WHV_REGISTER_NAME reg_names[5]; 1926 UINT32 reg_count = 5; 1927 UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0; 1928 X86CPU *x86_cpu = X86_CPU(cpu); 1929 CPUX86State *env = &x86_cpu->env; 1930 1931 memset(reg_values, 0, sizeof(reg_values)); 1932 1933 rip = vcpu->exit_ctx.VpContext.Rip + 1934 vcpu->exit_ctx.VpContext.InstructionLength; 1935 cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax; 1936 1937 /* 1938 * Ideally, these should be supplied to the hypervisor during VCPU 1939 * initialization and it should be able to satisfy this request. 1940 * But, currently, WHPX doesn't support setting CPUID values in the 1941 * hypervisor once the partition has been setup, which is too late 1942 * since VCPUs are realized later. For now, use the values from 1943 * QEMU to satisfy these requests, until WHPX adds support for 1944 * being able to set these values in the hypervisor at runtime. 
1945 */ 1946 cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx, 1947 (UINT32 *)&rcx, (UINT32 *)&rdx); 1948 switch (cpuid_fn) { 1949 case 0x40000000: 1950 /* Expose the vmware cpu frequency cpuid leaf */ 1951 rax = 0x40000010; 1952 rbx = rcx = rdx = 0; 1953 break; 1954 1955 case 0x40000010: 1956 rax = env->tsc_khz; 1957 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */ 1958 rcx = rdx = 0; 1959 break; 1960 1961 case 0x80000001: 1962 /* Remove any support of OSVW */ 1963 rcx &= ~CPUID_EXT3_OSVW; 1964 break; 1965 } 1966 1967 reg_names[0] = WHvX64RegisterRip; 1968 reg_names[1] = WHvX64RegisterRax; 1969 reg_names[2] = WHvX64RegisterRcx; 1970 reg_names[3] = WHvX64RegisterRdx; 1971 reg_names[4] = WHvX64RegisterRbx; 1972 1973 reg_values[0].Reg64 = rip; 1974 reg_values[1].Reg64 = rax; 1975 reg_values[2].Reg64 = rcx; 1976 reg_values[3].Reg64 = rdx; 1977 reg_values[4].Reg64 = rbx; 1978 1979 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1980 whpx->partition, cpu->cpu_index, 1981 reg_names, 1982 reg_count, 1983 reg_values); 1984 1985 if (FAILED(hr)) { 1986 error_report("WHPX: Failed to set CpuidAccess state registers," 1987 " hr=%08lx", hr); 1988 } 1989 ret = 0; 1990 break; 1991 } 1992 case WHvRunVpExitReasonException: 1993 whpx_get_registers(cpu); 1994 1995 if ((vcpu->exit_ctx.VpException.ExceptionType == 1996 WHvX64ExceptionTypeDebugTrapOrFault) && 1997 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) && 1998 (vcpu->exit_ctx.VpException.InstructionBytes[0] == 1999 whpx_breakpoint_instruction)) { 2000 /* Stopped at a software breakpoint. */ 2001 cpu->exception_index = EXCP_DEBUG; 2002 } else if ((vcpu->exit_ctx.VpException.ExceptionType == 2003 WHvX64ExceptionTypeDebugTrapOrFault) && 2004 !cpu->singlestep_enabled) { 2005 /* 2006 * Just finished stepping over a breakpoint, but the 2007 * gdb does not expect us to do single-stepping. 2008 * Don't do anything special. 
2009 */ 2010 cpu->exception_index = EXCP_INTERRUPT; 2011 } else { 2012 /* Another exception or debug event. Report it to GDB. */ 2013 cpu->exception_index = EXCP_DEBUG; 2014 } 2015 2016 ret = 1; 2017 break; 2018 case WHvRunVpExitReasonNone: 2019 case WHvRunVpExitReasonUnrecoverableException: 2020 case WHvRunVpExitReasonInvalidVpRegisterValue: 2021 case WHvRunVpExitReasonUnsupportedFeature: 2022 default: 2023 error_report("WHPX: Unexpected VP exit code %d", 2024 vcpu->exit_ctx.ExitReason); 2025 whpx_get_registers(cpu); 2026 bql_lock(); 2027 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 2028 bql_unlock(); 2029 break; 2030 } 2031 2032 } while (!ret); 2033 2034 if (stepped_over_bp) { 2035 /* Restore the breakpoint we stepped over */ 2036 cpu_memory_rw_debug(cpu, 2037 stepped_over_bp->address, 2038 (void *)&whpx_breakpoint_instruction, 2039 1, 2040 true); 2041 } 2042 2043 if (exclusive_step_mode != WHPX_STEP_NONE) { 2044 g_assert(cpu_in_exclusive_context(cpu)); 2045 cpu->running = false; 2046 end_exclusive(); 2047 2048 exclusive_step_mode = WHPX_STEP_NONE; 2049 } else { 2050 cpu_exec_end(cpu); 2051 } 2052 2053 bql_lock(); 2054 current_cpu = cpu; 2055 2056 if (--whpx->running_cpus == 0) { 2057 whpx_last_vcpu_stopping(cpu); 2058 } 2059 2060 qatomic_set(&cpu->exit_request, false); 2061 2062 return ret < 0; 2063 } 2064 2065 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 2066 { 2067 if (!cpu->accel->dirty) { 2068 whpx_get_registers(cpu); 2069 cpu->accel->dirty = true; 2070 } 2071 } 2072 2073 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu, 2074 run_on_cpu_data arg) 2075 { 2076 whpx_set_registers(cpu, WHPX_SET_RESET_STATE); 2077 cpu->accel->dirty = false; 2078 } 2079 2080 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu, 2081 run_on_cpu_data arg) 2082 { 2083 whpx_set_registers(cpu, WHPX_SET_FULL_STATE); 2084 cpu->accel->dirty = false; 2085 } 2086 2087 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu, 
2088 run_on_cpu_data arg) 2089 { 2090 cpu->accel->dirty = true; 2091 } 2092 2093 /* 2094 * CPU support. 2095 */ 2096 2097 void whpx_cpu_synchronize_state(CPUState *cpu) 2098 { 2099 if (!cpu->accel->dirty) { 2100 run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL); 2101 } 2102 } 2103 2104 void whpx_cpu_synchronize_post_reset(CPUState *cpu) 2105 { 2106 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2107 } 2108 2109 void whpx_cpu_synchronize_post_init(CPUState *cpu) 2110 { 2111 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2112 } 2113 2114 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu) 2115 { 2116 run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2117 } 2118 2119 void whpx_cpu_synchronize_pre_resume(bool step_pending) 2120 { 2121 whpx_global.step_pending = step_pending; 2122 } 2123 2124 /* 2125 * Vcpu support. 2126 */ 2127 2128 static Error *whpx_migration_blocker; 2129 2130 static void whpx_cpu_update_state(void *opaque, bool running, RunState state) 2131 { 2132 CPUX86State *env = opaque; 2133 2134 if (running) { 2135 env->tsc_valid = false; 2136 } 2137 } 2138 2139 int whpx_init_vcpu(CPUState *cpu) 2140 { 2141 HRESULT hr; 2142 struct whpx_state *whpx = &whpx_global; 2143 AccelCPUState *vcpu = NULL; 2144 Error *local_error = NULL; 2145 X86CPU *x86_cpu = X86_CPU(cpu); 2146 CPUX86State *env = &x86_cpu->env; 2147 UINT64 freq = 0; 2148 int ret; 2149 2150 /* Add migration blockers for all unsupported features of the 2151 * Windows Hypervisor Platform 2152 */ 2153 if (whpx_migration_blocker == NULL) { 2154 error_setg(&whpx_migration_blocker, 2155 "State blocked due to non-migratable CPUID feature support," 2156 "dirty memory tracking support, and XSAVE/XRSTOR support"); 2157 2158 if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) { 2159 error_report_err(local_error); 2160 ret = -EINVAL; 2161 goto error; 2162 } 2163 } 2164 2165 vcpu = g_new0(AccelCPUState, 1); 2166 2167 hr = 
whp_dispatch.WHvEmulatorCreateEmulator( 2168 &whpx_emu_callbacks, 2169 &vcpu->emulator); 2170 if (FAILED(hr)) { 2171 error_report("WHPX: Failed to setup instruction completion support," 2172 " hr=%08lx", hr); 2173 ret = -EINVAL; 2174 goto error; 2175 } 2176 2177 hr = whp_dispatch.WHvCreateVirtualProcessor( 2178 whpx->partition, cpu->cpu_index, 0); 2179 if (FAILED(hr)) { 2180 error_report("WHPX: Failed to create a virtual processor," 2181 " hr=%08lx", hr); 2182 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2183 ret = -EINVAL; 2184 goto error; 2185 } 2186 2187 /* 2188 * vcpu's TSC frequency is either specified by user, or use the value 2189 * provided by Hyper-V if the former is not present. In the latter case, we 2190 * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC 2191 * frequency can be migrated later via this field. 2192 */ 2193 if (!env->tsc_khz) { 2194 hr = whp_dispatch.WHvGetCapability( 2195 WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq), 2196 NULL); 2197 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2198 if (FAILED(hr)) { 2199 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr); 2200 } else { 2201 env->tsc_khz = freq / 1000; /* Hz to KHz */ 2202 } 2203 } 2204 } 2205 2206 env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY; 2207 hr = whp_dispatch.WHvGetCapability( 2208 WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL); 2209 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2210 if (FAILED(hr)) { 2211 printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr); 2212 } else { 2213 env->apic_bus_freq = freq; 2214 } 2215 } 2216 2217 /* 2218 * If the vmware cpuid frequency leaf option is set, and we have a valid 2219 * tsc value, trap the corresponding cpuid's. 
2220 */ 2221 if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) { 2222 UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010}; 2223 2224 hr = whp_dispatch.WHvSetPartitionProperty( 2225 whpx->partition, 2226 WHvPartitionPropertyCodeCpuidExitList, 2227 cpuidExitList, 2228 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2229 2230 if (FAILED(hr)) { 2231 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2232 hr); 2233 ret = -EINVAL; 2234 goto error; 2235 } 2236 } 2237 2238 vcpu->interruptable = true; 2239 vcpu->dirty = true; 2240 cpu->accel = vcpu; 2241 max_vcpu_index = max(max_vcpu_index, cpu->cpu_index); 2242 qemu_add_vm_change_state_handler(whpx_cpu_update_state, env); 2243 2244 return 0; 2245 2246 error: 2247 g_free(vcpu); 2248 2249 return ret; 2250 } 2251 2252 int whpx_vcpu_exec(CPUState *cpu) 2253 { 2254 int ret; 2255 int fatal; 2256 2257 for (;;) { 2258 if (cpu->exception_index >= EXCP_INTERRUPT) { 2259 ret = cpu->exception_index; 2260 cpu->exception_index = -1; 2261 break; 2262 } 2263 2264 fatal = whpx_vcpu_run(cpu); 2265 2266 if (fatal) { 2267 error_report("WHPX: Failed to exec a virtual processor"); 2268 abort(); 2269 } 2270 } 2271 2272 return ret; 2273 } 2274 2275 void whpx_destroy_vcpu(CPUState *cpu) 2276 { 2277 struct whpx_state *whpx = &whpx_global; 2278 AccelCPUState *vcpu = cpu->accel; 2279 2280 whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index); 2281 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2282 g_free(cpu->accel); 2283 return; 2284 } 2285 2286 void whpx_vcpu_kick(CPUState *cpu) 2287 { 2288 struct whpx_state *whpx = &whpx_global; 2289 whp_dispatch.WHvCancelRunVirtualProcessor( 2290 whpx->partition, cpu->cpu_index, 0); 2291 } 2292 2293 /* 2294 * Memory support. 
2295 */ 2296 2297 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size, 2298 void *host_va, int add, int rom, 2299 const char *name) 2300 { 2301 struct whpx_state *whpx = &whpx_global; 2302 HRESULT hr; 2303 2304 /* 2305 if (add) { 2306 printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n", 2307 (void*)start_pa, (void*)size, host_va, 2308 (rom ? "ROM" : "RAM"), name); 2309 } else { 2310 printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n", 2311 (void*)start_pa, (void*)size, host_va, name); 2312 } 2313 */ 2314 2315 if (add) { 2316 hr = whp_dispatch.WHvMapGpaRange(whpx->partition, 2317 host_va, 2318 start_pa, 2319 size, 2320 (WHvMapGpaRangeFlagRead | 2321 WHvMapGpaRangeFlagExecute | 2322 (rom ? 0 : WHvMapGpaRangeFlagWrite))); 2323 } else { 2324 hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition, 2325 start_pa, 2326 size); 2327 } 2328 2329 if (FAILED(hr)) { 2330 error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes," 2331 " Host:%p, hr=%08lx", 2332 (add ? "MAP" : "UNMAP"), name, 2333 (void *)(uintptr_t)start_pa, (void *)size, host_va, hr); 2334 } 2335 } 2336 2337 static void whpx_process_section(MemoryRegionSection *section, int add) 2338 { 2339 MemoryRegion *mr = section->mr; 2340 hwaddr start_pa = section->offset_within_address_space; 2341 ram_addr_t size = int128_get64(section->size); 2342 unsigned int delta; 2343 uint64_t host_va; 2344 2345 if (!memory_region_is_ram(mr)) { 2346 return; 2347 } 2348 2349 delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask()); 2350 delta &= ~qemu_real_host_page_mask(); 2351 if (delta > size) { 2352 return; 2353 } 2354 start_pa += delta; 2355 size -= delta; 2356 size &= qemu_real_host_page_mask(); 2357 if (!size || (start_pa & ~qemu_real_host_page_mask())) { 2358 return; 2359 } 2360 2361 host_va = (uintptr_t)memory_region_get_ram_ptr(mr) 2362 + section->offset_within_region + delta; 2363 2364 whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add, 2365 
memory_region_is_rom(mr), mr->name); 2366 } 2367 2368 static void whpx_region_add(MemoryListener *listener, 2369 MemoryRegionSection *section) 2370 { 2371 memory_region_ref(section->mr); 2372 whpx_process_section(section, 1); 2373 } 2374 2375 static void whpx_region_del(MemoryListener *listener, 2376 MemoryRegionSection *section) 2377 { 2378 whpx_process_section(section, 0); 2379 memory_region_unref(section->mr); 2380 } 2381 2382 static void whpx_transaction_begin(MemoryListener *listener) 2383 { 2384 } 2385 2386 static void whpx_transaction_commit(MemoryListener *listener) 2387 { 2388 } 2389 2390 static void whpx_log_sync(MemoryListener *listener, 2391 MemoryRegionSection *section) 2392 { 2393 MemoryRegion *mr = section->mr; 2394 2395 if (!memory_region_is_ram(mr)) { 2396 return; 2397 } 2398 2399 memory_region_set_dirty(mr, 0, int128_get64(section->size)); 2400 } 2401 2402 static MemoryListener whpx_memory_listener = { 2403 .name = "whpx", 2404 .begin = whpx_transaction_begin, 2405 .commit = whpx_transaction_commit, 2406 .region_add = whpx_region_add, 2407 .region_del = whpx_region_del, 2408 .log_sync = whpx_log_sync, 2409 .priority = MEMORY_LISTENER_PRIORITY_ACCEL, 2410 }; 2411 2412 static void whpx_memory_init(void) 2413 { 2414 memory_listener_register(&whpx_memory_listener, &address_space_memory); 2415 } 2416 2417 /* 2418 * Load the functions from the given library, using the given handle. If a 2419 * handle is provided, it is used, otherwise the library is opened. The 2420 * handle will be updated on return with the opened one. 
2421 */ 2422 static bool load_whp_dispatch_fns(HMODULE *handle, 2423 WHPFunctionList function_list) 2424 { 2425 HMODULE hLib = *handle; 2426 2427 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll" 2428 #define WINHV_EMULATION_DLL "WinHvEmulation.dll" 2429 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \ 2430 whp_dispatch.function_name = \ 2431 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2432 2433 #define WHP_LOAD_FIELD(return_type, function_name, signature) \ 2434 whp_dispatch.function_name = \ 2435 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2436 if (!whp_dispatch.function_name) { \ 2437 error_report("Could not load function %s", #function_name); \ 2438 goto error; \ 2439 } \ 2440 2441 #define WHP_LOAD_LIB(lib_name, handle_lib) \ 2442 if (!handle_lib) { \ 2443 handle_lib = LoadLibrary(lib_name); \ 2444 if (!handle_lib) { \ 2445 error_report("Could not load library %s.", lib_name); \ 2446 goto error; \ 2447 } \ 2448 } \ 2449 2450 switch (function_list) { 2451 case WINHV_PLATFORM_FNS_DEFAULT: 2452 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2453 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD) 2454 break; 2455 2456 case WINHV_EMULATION_FNS_DEFAULT: 2457 WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib) 2458 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD) 2459 break; 2460 2461 case WINHV_PLATFORM_FNS_SUPPLEMENTAL: 2462 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2463 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL) 2464 break; 2465 } 2466 2467 *handle = hLib; 2468 return true; 2469 2470 error: 2471 if (hLib) { 2472 FreeLibrary(hLib); 2473 } 2474 2475 return false; 2476 } 2477 2478 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v, 2479 const char *name, void *opaque, 2480 Error **errp) 2481 { 2482 struct whpx_state *whpx = &whpx_global; 2483 OnOffSplit mode; 2484 2485 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 2486 return; 2487 } 2488 2489 switch (mode) { 2490 case ON_OFF_SPLIT_ON: 2491 
whpx->kernel_irqchip_allowed = true; 2492 whpx->kernel_irqchip_required = true; 2493 break; 2494 2495 case ON_OFF_SPLIT_OFF: 2496 whpx->kernel_irqchip_allowed = false; 2497 whpx->kernel_irqchip_required = false; 2498 break; 2499 2500 case ON_OFF_SPLIT_SPLIT: 2501 error_setg(errp, "WHPX: split irqchip currently not supported"); 2502 error_append_hint(errp, 2503 "Try without kernel-irqchip or with kernel-irqchip=on|off"); 2504 break; 2505 2506 default: 2507 /* 2508 * The value was checked in visit_type_OnOffSplit() above. If 2509 * we get here, then something is wrong in QEMU. 2510 */ 2511 abort(); 2512 } 2513 } 2514 2515 /* 2516 * Partition support 2517 */ 2518 2519 static int whpx_accel_init(MachineState *ms) 2520 { 2521 struct whpx_state *whpx; 2522 int ret; 2523 HRESULT hr; 2524 WHV_CAPABILITY whpx_cap; 2525 UINT32 whpx_cap_size; 2526 WHV_PARTITION_PROPERTY prop; 2527 UINT32 cpuidExitList[] = {1, 0x80000001}; 2528 WHV_CAPABILITY_FEATURES features = {0}; 2529 2530 whpx = &whpx_global; 2531 2532 if (!init_whp_dispatch()) { 2533 ret = -ENOSYS; 2534 goto error; 2535 } 2536 2537 whpx->mem_quota = ms->ram_size; 2538 2539 hr = whp_dispatch.WHvGetCapability( 2540 WHvCapabilityCodeHypervisorPresent, &whpx_cap, 2541 sizeof(whpx_cap), &whpx_cap_size); 2542 if (FAILED(hr) || !whpx_cap.HypervisorPresent) { 2543 error_report("WHPX: No accelerator found, hr=%08lx", hr); 2544 ret = -ENOSPC; 2545 goto error; 2546 } 2547 2548 hr = whp_dispatch.WHvGetCapability( 2549 WHvCapabilityCodeFeatures, &features, sizeof(features), NULL); 2550 if (FAILED(hr)) { 2551 error_report("WHPX: Failed to query capabilities, hr=%08lx", hr); 2552 ret = -EINVAL; 2553 goto error; 2554 } 2555 2556 hr = whp_dispatch.WHvCreatePartition(&whpx->partition); 2557 if (FAILED(hr)) { 2558 error_report("WHPX: Failed to create partition, hr=%08lx", hr); 2559 ret = -EINVAL; 2560 goto error; 2561 } 2562 2563 /* 2564 * Query the XSAVE capability of the partition. Any error here is not 2565 * considered fatal. 
2566 */ 2567 hr = whp_dispatch.WHvGetPartitionProperty( 2568 whpx->partition, 2569 WHvPartitionPropertyCodeProcessorXsaveFeatures, 2570 &whpx_xsave_cap, 2571 sizeof(whpx_xsave_cap), 2572 &whpx_cap_size); 2573 2574 /* 2575 * Windows version which don't support this property will return with the 2576 * specific error code. 2577 */ 2578 if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) { 2579 error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr); 2580 } 2581 2582 if (!whpx_has_xsave()) { 2583 printf("WHPX: Partition is not XSAVE capable\n"); 2584 } 2585 2586 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2587 prop.ProcessorCount = ms->smp.cpus; 2588 hr = whp_dispatch.WHvSetPartitionProperty( 2589 whpx->partition, 2590 WHvPartitionPropertyCodeProcessorCount, 2591 &prop, 2592 sizeof(WHV_PARTITION_PROPERTY)); 2593 2594 if (FAILED(hr)) { 2595 error_report("WHPX: Failed to set partition processor count to %u," 2596 " hr=%08lx", prop.ProcessorCount, hr); 2597 ret = -EINVAL; 2598 goto error; 2599 } 2600 2601 /* 2602 * Error out if WHP doesn't support apic emulation and user is requiring 2603 * it. 2604 */ 2605 if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation || 2606 !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) { 2607 error_report("WHPX: kernel irqchip requested, but unavailable. 
" 2608 "Try without kernel-irqchip or with kernel-irqchip=off"); 2609 ret = -EINVAL; 2610 goto error; 2611 } 2612 2613 if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation && 2614 whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) { 2615 WHV_X64_LOCAL_APIC_EMULATION_MODE mode = 2616 WHvX64LocalApicEmulationModeXApic; 2617 printf("WHPX: setting APIC emulation mode in the hypervisor\n"); 2618 hr = whp_dispatch.WHvSetPartitionProperty( 2619 whpx->partition, 2620 WHvPartitionPropertyCodeLocalApicEmulationMode, 2621 &mode, 2622 sizeof(mode)); 2623 if (FAILED(hr)) { 2624 error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr); 2625 if (whpx->kernel_irqchip_required) { 2626 error_report("WHPX: kernel irqchip requested, but unavailable"); 2627 ret = -EINVAL; 2628 goto error; 2629 } 2630 } else { 2631 whpx->apic_in_platform = true; 2632 } 2633 } 2634 2635 /* Register for MSR and CPUID exits */ 2636 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2637 prop.ExtendedVmExits.X64MsrExit = 1; 2638 prop.ExtendedVmExits.X64CpuidExit = 1; 2639 prop.ExtendedVmExits.ExceptionExit = 1; 2640 if (whpx_apic_in_platform()) { 2641 prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1; 2642 } 2643 2644 hr = whp_dispatch.WHvSetPartitionProperty( 2645 whpx->partition, 2646 WHvPartitionPropertyCodeExtendedVmExits, 2647 &prop, 2648 sizeof(WHV_PARTITION_PROPERTY)); 2649 if (FAILED(hr)) { 2650 error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr); 2651 ret = -EINVAL; 2652 goto error; 2653 } 2654 2655 hr = whp_dispatch.WHvSetPartitionProperty( 2656 whpx->partition, 2657 WHvPartitionPropertyCodeCpuidExitList, 2658 cpuidExitList, 2659 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2660 2661 if (FAILED(hr)) { 2662 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2663 hr); 2664 ret = -EINVAL; 2665 goto error; 2666 } 2667 2668 /* 2669 * We do not want to intercept any exceptions from the guest, 2670 * until we actually start debugging 
with gdb. 2671 */ 2672 whpx->exception_exit_bitmap = -1; 2673 hr = whpx_set_exception_exit_bitmap(0); 2674 2675 if (FAILED(hr)) { 2676 error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr); 2677 ret = -EINVAL; 2678 goto error; 2679 } 2680 2681 hr = whp_dispatch.WHvSetupPartition(whpx->partition); 2682 if (FAILED(hr)) { 2683 error_report("WHPX: Failed to setup partition, hr=%08lx", hr); 2684 ret = -EINVAL; 2685 goto error; 2686 } 2687 2688 whpx_memory_init(); 2689 2690 printf("Windows Hypervisor Platform accelerator is operational\n"); 2691 return 0; 2692 2693 error: 2694 2695 if (NULL != whpx->partition) { 2696 whp_dispatch.WHvDeletePartition(whpx->partition); 2697 whpx->partition = NULL; 2698 } 2699 2700 return ret; 2701 } 2702 2703 int whpx_enabled(void) 2704 { 2705 return whpx_allowed; 2706 } 2707 2708 bool whpx_apic_in_platform(void) { 2709 return whpx_global.apic_in_platform; 2710 } 2711 2712 static void whpx_accel_class_init(ObjectClass *oc, void *data) 2713 { 2714 AccelClass *ac = ACCEL_CLASS(oc); 2715 ac->name = "WHPX"; 2716 ac->init_machine = whpx_accel_init; 2717 ac->allowed = &whpx_allowed; 2718 2719 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 2720 NULL, whpx_set_kernel_irqchip, 2721 NULL, NULL); 2722 object_class_property_set_description(oc, "kernel-irqchip", 2723 "Configure WHPX in-kernel irqchip"); 2724 } 2725 2726 static void whpx_accel_instance_init(Object *obj) 2727 { 2728 struct whpx_state *whpx = &whpx_global; 2729 2730 memset(whpx, 0, sizeof(struct whpx_state)); 2731 /* Turn on kernel-irqchip, by default */ 2732 whpx->kernel_irqchip_allowed = true; 2733 } 2734 2735 static const TypeInfo whpx_accel_type = { 2736 .name = ACCEL_CLASS_NAME("whpx"), 2737 .parent = TYPE_ACCEL, 2738 .instance_init = whpx_accel_instance_init, 2739 .class_init = whpx_accel_class_init, 2740 }; 2741 2742 static void whpx_type_init(void) 2743 { 2744 type_register_static(&whpx_accel_type); 2745 } 2746 2747 bool init_whp_dispatch(void) 
2748 { 2749 if (whp_dispatch_initialized) { 2750 return true; 2751 } 2752 2753 if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) { 2754 goto error; 2755 } 2756 2757 if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) { 2758 goto error; 2759 } 2760 2761 assert(load_whp_dispatch_fns(&hWinHvPlatform, 2762 WINHV_PLATFORM_FNS_SUPPLEMENTAL)); 2763 whp_dispatch_initialized = true; 2764 2765 return true; 2766 error: 2767 if (hWinHvPlatform) { 2768 FreeLibrary(hWinHvPlatform); 2769 } 2770 2771 if (hWinHvEmulation) { 2772 FreeLibrary(hWinHvEmulation); 2773 } 2774 2775 return false; 2776 } 2777 2778 type_init(whpx_type_init); 2779