/*
 * QEMU Windows Hypervisor Platform accelerator (WHPX)
 *
 * Copyright Microsoft Corp. 2017
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/address-spaces.h"
#include "exec/ioport.h"
#include "gdbstub/helpers.h"
#include "qemu/accel.h"
#include "sysemu/whpx.h"
#include "sysemu/cpus.h"
#include "sysemu/runstate.h"
#include "qemu/main-loop.h"
#include "hw/boards.h"
#include "hw/intc/ioapic.h"
#include "hw/i386/apic_internal.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "migration/blocker.h"
#include <winerror.h>

#include "whpx-internal.h"
#include "whpx-accel-ops.h"

#include <winhvplatform.h>
#include <winhvemulation.h>

#define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)

/*
 * Ordered list of the registers exchanged with the hypervisor in a single
 * WHvGet/SetVirtualProcessorRegisters() call.  whpx_set_registers() and
 * whpx_get_registers() walk this table sequentially and assert the expected
 * name at each index, so the order here must exactly match the order in
 * which those functions marshal the CPUX86State fields.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};

/* One value slot per entry of whpx_register_names, in the same order. */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};

/*
 * The current implementation of instruction stepping sets the TF flag
 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
 *
 * This approach has a few limitations:
 * 1.
Stepping over a PUSHF/LAHF instruction will save the TF flag
 *    along with the other flags, possibly restoring it later. It would
 *    result in another INT1 when the flags are restored, triggering
 *    a stop in gdb that could be cleared by doing another step.
 *
 *    Stepping over a POPF/SAHF instruction will let it overwrite the
 *    TF flags, ending the stepping mode.
 *
 * 2. Stepping over an instruction raising an exception (e.g. INT, DIV,
 *    or anything that could result in a page fault) will save the flags
 *    to the stack, clear the TF flag, and let the guest execute the
 *    handler. Normally, the guest will restore the original flags,
 *    which will resume single-stepping.
 *
 * 3. Debuggers running on the guest may wish to set TF to do instruction
 *    stepping. INT1 events generated by it would be intercepted by us,
 *    as long as the gdb is connected to QEMU.
 *
 * In practice this means that:
 * 1. Stepping through flags-modifying instructions may cause gdb to
 *    continue or stop in unexpected places. This will be fully recoverable
 *    and will not crash the target.
 *
 * 2. Stepping over an instruction that triggers an exception will step
 *    over the exception handler, not into it.
 *
 * 3. Debugging the guest via gdb, while running a debugger on the guest
 *    at the same time may lead to unexpected effects. Removing all
 *    breakpoints set via QEMU will prevent any further interference
 *    with the guest-level debuggers.
 *
 * The limitations can be addressed as shown below:
 * 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
 *    stepping through them.
The exact semantics of the instructions is 191 * defined in the "Combined Volume Set of Intel 64 and IA-32 192 * Architectures Software Developer's Manuals", however it involves a 193 * fair amount of corner cases due to compatibility with real mode, 194 * virtual 8086 mode, and differences between 64-bit and 32-bit modes. 195 * 196 * 2. We could step into the guest's exception handlers using the following 197 * sequence: 198 * a. Temporarily enable catching of all exception types via 199 * whpx_set_exception_exit_bitmap(). 200 * b. Once an exception is intercepted, read the IDT/GDT and locate 201 * the original handler. 202 * c. Patch the original handler, injecting an INT3 at the beginning. 203 * d. Update the exception exit bitmap to only catch the 204 * WHvX64ExceptionTypeBreakpointTrap exception. 205 * e. Let the affected CPU run in the exclusive mode. 206 * f. Restore the original handler and the exception exit bitmap. 207 * Note that handling all corner cases related to IDT/GDT is harder 208 * than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a 209 * rough idea. 210 * 211 * 3. In order to properly support guest-level debugging in parallel with 212 * the QEMU-level debugging, we would need to be able to pass some INT1 213 * events to the guest. This could be done via the following methods: 214 * a. Using the WHvRegisterPendingEvent register. As of Windows 21H1, 215 * it seems to only work for interrupts and not software 216 * exceptions. 217 * b. Locating and patching the original handler by parsing IDT/GDT. 218 * This involves relatively complex logic outlined in the previous 219 * paragraph. 220 * c. Emulating the exception invocation (i.e. manually updating RIP, 221 * RFLAGS, and pushing the old values to stack). This is even more 222 * complicated than the previous option, since it involves checking 223 * CPL, gate attributes, and doing various adjustments depending 224 * on the current CPU mode, whether the CPL is changing, etc. 
 */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;

/* Per-vCPU accelerator state, reached through CPUState::accel. */
struct AccelCPUState {
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    /* Cached CR8 value (CR8 encoding, i.e. APIC TPR shifted right by 4) */
    uint64_t tpr;
    /* Cached IA32_APIC_BASE value */
    uint64_t apic_base;
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};

static bool whpx_allowed;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
/* XSAVE capabilities reported by the hypervisor at accelerator init */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;

/* True if the hypervisor reports XSAVE support for virtual processors. */
static bool whpx_has_xsave(void)
{
    return whpx_xsave_cap.XsaveSupport;
}

/*
 * Translate a QEMU segment descriptor cache entry into the WHPX segment
 * register format.  'v86' forces the fixed virtual-8086 mode attributes;
 * 'r86' (real mode) is currently accepted but has no effect.
 */
static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER hs;
    unsigned flags = qs->flags;

    hs.Base = qs->base;
    hs.Limit = qs->limit;
    hs.Selector = qs->selector;

    if (v86) {
        hs.Attributes = 0;
        hs.SegmentType = 3;
        hs.Present = 1;
        hs.DescriptorPrivilegeLevel = 3;
        hs.NonSystemSegment = 1;

    } else {
        hs.Attributes = (flags >> DESC_TYPE_SHIFT);

        if (r86) {
            /* hs.Base &= 0xfffff; */
        }
    }

    return hs;
}

/* Translate a WHPX segment register back into QEMU's descriptor cache. */
static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
{
    SegmentCache qs;

    qs.base = hs->Base;
    qs.limit = hs->Limit;
    qs.selector = hs->Selector;

    qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;

    return qs;
}

/* X64 Extended Control Registers: push env->xcr0 to the hypervisor. */
static void whpx_set_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    /* Silently skip when the hypervisor does not support XSAVE. */
    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    xcr0.Reg64 = cpu_env(cpu)->xcr0;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
    }
}

/* Write env->tsc to the vCPU. Returns 0 on success, -1 on failure. */
static int whpx_set_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {

        /*
         * Unable to suspend partition while setting TSC is not a fatal
         * error. It just increases the likelihood of TSC variance between
         * vCPUs and some guest OS are able to handle that just fine.
         */
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    tsc_val.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
        return -1;
    }

    return 0;
}

/*
 * The CR8 register in the CPU is mapped to the TPR register of the APIC,
 * however, they use a slightly different encoding. Specifically:
 *
 * APIC.TPR[bits 7:4] = CR8[bits 3:0]
 *
 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
 * and IA-32 Architectures Software Developer's Manual.
 *
 * The functions below translate the value of CR8 to TPR and vice versa.
 */

static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr >> 4;
}

static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 << 4;
}

/*
 * Push the QEMU CPU state (CPUX86State) into the WHPX virtual processor.
 * All registers listed in whpx_register_names are written in one batch;
 * heavyweight/side-effecting state (currently the TSC) is only written
 * when level >= WHPX_SET_RESET_STATE.
 */
static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        /* QEMU stores "empty" as 1 in fptags; WHPX FpTag uses "valid" = 1 */
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }

    return;
}

/* Read the vCPU's TSC into env->tsc. Returns 0 on success, -1 on failure. */
static int whpx_get_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
        return -1;
    }

    cpu_env(cpu)->tsc = tsc_val.Reg64;
    return 0;
}

/* X64 Extended Control Registers: read xcr0 back into env->xcr0. */
static void whpx_get_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name =
WHvX64RegisterXCr0;

    /* Silently skip when the hypervisor does not support XSAVE. */
    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
        return;
    }

    cpu_env(cpu)->xcr0 = xcr0.Reg64;
}

/*
 * Pull the full virtual-processor state out of WHPX into the QEMU
 * CPUX86State.  Mirror image of whpx_set_registers(): the same register
 * table (whpx_register_names) is walked in the same order.
 */
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        /* Keep the cached TSC while the VM is paused. */
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    /* Only propagate CR8 to the APIC when it actually changed. */
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    for (i = 0; i < 8; ++i) {
        /* WHPX FpTag uses "valid" = 1; QEMU fptags stores "empty" = 1 */
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    /* Only propagate the APIC base when it actually changed. */
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);

    return;
}

/* Instruction-emulator callback: forward a port I/O access to QEMU. */
static HRESULT CALLBACK whpx_emu_ioport_callback(
    void *ctx,
    WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
    MemTxAttrs attrs = { 0 };
    address_space_rw(&address_space_io, IoAccess->Port, attrs,
                     &IoAccess->Data, IoAccess->AccessSize,
                     IoAccess->Direction);
    return S_OK;
}

/* Instruction-emulator callback: forward an MMIO access to QEMU. */
static HRESULT CALLBACK whpx_emu_mmio_callback(
    void *ctx,
    WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
    cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
                           ma->Direction);
    return S_OK;
}

/* Instruction-emulator callback: read vCPU registers for the emulator. */
static HRESULT CALLBACK whpx_emu_getreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state
*whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor registers,"
                     " hr=%08lx", hr);
    }

    return hr;
}

/* Instruction-emulator callback: write vCPU registers for the emulator. */
static HRESULT CALLBACK whpx_emu_setreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    const WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor registers,"
                     " hr=%08lx", hr);
    }

    /*
     * The emulator just successfully wrote the register state. We clear the
     * dirty state so we avoid the double write on resume of the VP.
     */
    cpu->vcpu_dirty = false;

    return hr;
}

/* Instruction-emulator callback: translate a guest virtual address. */
static HRESULT CALLBACK whpx_emu_translate_callback(
    void *ctx,
    WHV_GUEST_VIRTUAL_ADDRESS Gva,
    WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
    WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
    WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;
    WHV_TRANSLATE_GVA_RESULT res;

    hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
                                      Gva, TranslateFlags, &res, Gpa);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
    } else {
        *TranslationResult = res.ResultCode;
    }

    return hr;
}

/* Callback table handed to the WHPX instruction emulator. */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};

/*
 * Emulate the instruction that caused an MMIO exit.
 * Returns 0 on success, -1 on failure.
 */
static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;

    hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate MMIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}

/*
 * Emulate the instruction that caused a port I/O exit.
 * Returns 0 on success, -1 on failure.
 */
static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;
    hr = whp_dispatch.WHvEmulatorTryIoEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate PortIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}

/*
 * Controls whether we should intercept various exceptions on the guest,
 * namely breakpoint/single-step events.
 *
 * The 'exceptions' argument accepts a bitmask, e.g:
 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
 */
static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_PARTITION_PROPERTY prop = { 0, };
    HRESULT hr;

    /* Skip the partition call when the bitmap is already as requested. */
    if (exceptions == whpx->exception_exit_bitmap) {
        return S_OK;
    }

    prop.ExceptionExitBitmap = exceptions;

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExceptionExitBitmap,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (SUCCEEDED(hr)) {
        /* Remember the committed value so redundant updates are elided. */
        whpx->exception_exit_bitmap = exceptions;
    }

    return hr;
}


/*
 * This function is called before/after stepping over a single instruction.
 * It will update the CPU registers to arm/disarm the instruction stepping
 * accordingly.
 */
static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
    bool set,
    uint64_t *exit_context_rflags)
{
    WHV_REGISTER_NAME reg_name;
    WHV_REGISTER_VALUE reg_value;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * If we are trying to step over a single instruction, we need to set the
     * TF bit in rflags. Otherwise, clear it.
     */
    reg_name = WHvX64RegisterRflags;
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
        return hr;
    }

    if (exit_context_rflags) {
        assert(*exit_context_rflags == reg_value.Reg64);
    }

    if (set) {
        /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
        reg_value.Reg64 |= TF_MASK;
    } else {
        reg_value.Reg64 &= ~TF_MASK;
    }

    if (exit_context_rflags) {
        /* Keep the caller's exit-context copy in sync with the new value. */
        *exit_context_rflags = reg_value.Reg64;
    }

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set rflags,"
                     " hr=%08lx",
                     hr);
        return hr;
    }

    reg_name = WHvRegisterInterruptState;
    reg_value.Reg64 = 0;

    /* Suspend delivery of hardware interrupts during single-stepping. */
    reg_value.InterruptState.InterruptShadow = set != 0;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set InterruptState,"
                     " hr=%08lx",
                     hr);
        return hr;
    }

    if (!set) {
        /*
         * We have just finished stepping over a single instruction,
         * and intercepted the INT1 generated by it.
         * We need to now hide the INT1 from the guest,
         * as it would not be expecting it.
         */

        reg_name = WHvX64RegisterPendingDebugException;
        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get pending debug exceptions,"
                         "hr=%08lx", hr);
            return hr;
        }

        if (reg_value.PendingDebugException.SingleStep) {
            reg_value.PendingDebugException.SingleStep = 0;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                &reg_name,
                1,
                &reg_value);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to clear pending debug exceptions,"
                             "hr=%08lx", hr);
                return hr;
            }
        }

    }

    return S_OK;
}

/* Tries to find a breakpoint at the specified address. */
static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
{
    struct whpx_state *whpx = &whpx_global;
    int i;

    if (whpx->breakpoints.breakpoints) {
        for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
            if (address == whpx->breakpoints.breakpoints->data[i].address) {
                return &whpx->breakpoints.breakpoints->data[i];
            }
        }
    }

    return NULL;
}

/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;

/*
 * The WHPX QEMU backend implements breakpoints by writing the INT1
 * instruction into memory (ignoring the DRx registers). This raises a few
 * issues that need to be carefully handled:
 *
 * 1.
Although unlikely, other parts of QEMU may set multiple breakpoints 1116 * at the same location, and later remove them in arbitrary order. 1117 * This should not cause memory corruption, and should only remove the 1118 * physical breakpoint instruction when the last QEMU breakpoint is gone. 1119 * 1120 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid 1121 * physical location. Hence, physically adding/removing a breakpoint can 1122 * theoretically fail at any time. We need to keep track of it. 1123 * 1124 * The function below rebuilds a list of low-level breakpoints (one per 1125 * address, tracking the original instruction and any errors) from the list of 1126 * high-level breakpoints (set via cpu_breakpoint_insert()). 1127 * 1128 * In order to optimize performance, this function stores the list of 1129 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the 1130 * low-level ones, so that it won't be re-invoked until these breakpoints 1131 * change. 1132 * 1133 * Note that this function decides which breakpoints should be inserted into, 1134 * memory, but doesn't actually do it. The memory accessing is done in 1135 * whpx_apply_breakpoints(). 1136 */ 1137 static void whpx_translate_cpu_breakpoints( 1138 struct whpx_breakpoints *breakpoints, 1139 CPUState *cpu, 1140 int cpu_breakpoint_count) 1141 { 1142 CPUBreakpoint *bp; 1143 int cpu_bp_index = 0; 1144 1145 breakpoints->original_addresses = 1146 g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count); 1147 1148 breakpoints->original_address_count = cpu_breakpoint_count; 1149 1150 int max_breakpoints = cpu_breakpoint_count + 1151 (breakpoints->breakpoints ? 
breakpoints->breakpoints->used : 0); 1152 1153 struct whpx_breakpoint_collection *new_breakpoints = 1154 g_malloc0(sizeof(struct whpx_breakpoint_collection) 1155 + max_breakpoints * sizeof(struct whpx_breakpoint)); 1156 1157 new_breakpoints->allocated = max_breakpoints; 1158 new_breakpoints->used = 0; 1159 1160 /* 1161 * 1. Preserve all old breakpoints that could not be automatically 1162 * cleared when the CPU got stopped. 1163 */ 1164 if (breakpoints->breakpoints) { 1165 int i; 1166 for (i = 0; i < breakpoints->breakpoints->used; i++) { 1167 if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) { 1168 new_breakpoints->data[new_breakpoints->used++] = 1169 breakpoints->breakpoints->data[i]; 1170 } 1171 } 1172 } 1173 1174 /* 2. Map all CPU breakpoints to WHPX breakpoints */ 1175 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1176 int i; 1177 bool found = false; 1178 1179 /* This will be used to detect changed CPU breakpoints later. */ 1180 breakpoints->original_addresses[cpu_bp_index++] = bp->pc; 1181 1182 for (i = 0; i < new_breakpoints->used; i++) { 1183 /* 1184 * WARNING: This loop has O(N^2) complexity, where N is the 1185 * number of breakpoints. It should not be a bottleneck in 1186 * real-world scenarios, since it only needs to run once after 1187 * the breakpoints have been modified. 1188 * If this ever becomes a concern, it can be optimized by storing 1189 * high-level breakpoint objects in a tree or hash map. 1190 */ 1191 1192 if (new_breakpoints->data[i].address == bp->pc) { 1193 /* There was already a breakpoint at this address. 
*/ 1194 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) { 1195 new_breakpoints->data[i].state = WHPX_BP_SET; 1196 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) { 1197 new_breakpoints->data[i].state = WHPX_BP_SET_PENDING; 1198 } 1199 1200 found = true; 1201 break; 1202 } 1203 } 1204 1205 if (!found && new_breakpoints->used < new_breakpoints->allocated) { 1206 /* No WHPX breakpoint at this address. Create one. */ 1207 new_breakpoints->data[new_breakpoints->used].address = bp->pc; 1208 new_breakpoints->data[new_breakpoints->used].state = 1209 WHPX_BP_SET_PENDING; 1210 new_breakpoints->used++; 1211 } 1212 } 1213 1214 /* 1215 * Free the previous breakpoint list. This can be optimized by keeping 1216 * it as shadow buffer for the next computation instead of freeing 1217 * it immediately. 1218 */ 1219 g_free(breakpoints->breakpoints); 1220 1221 breakpoints->breakpoints = new_breakpoints; 1222 } 1223 1224 /* 1225 * Physically inserts/removes the breakpoints by reading and writing the 1226 * physical memory, keeping a track of the failed attempts. 1227 * 1228 * Passing resuming=true will try to set all previously unset breakpoints. 1229 * Passing resuming=false will remove all inserted ones. 1230 */ 1231 static void whpx_apply_breakpoints( 1232 struct whpx_breakpoint_collection *breakpoints, 1233 CPUState *cpu, 1234 bool resuming) 1235 { 1236 int i, rc; 1237 if (!breakpoints) { 1238 return; 1239 } 1240 1241 for (i = 0; i < breakpoints->used; i++) { 1242 /* Decide what to do right now based on the last known state. 
*/ 1243 WhpxBreakpointState state = breakpoints->data[i].state; 1244 switch (state) { 1245 case WHPX_BP_CLEARED: 1246 if (resuming) { 1247 state = WHPX_BP_SET_PENDING; 1248 } 1249 break; 1250 case WHPX_BP_SET_PENDING: 1251 if (!resuming) { 1252 state = WHPX_BP_CLEARED; 1253 } 1254 break; 1255 case WHPX_BP_SET: 1256 if (!resuming) { 1257 state = WHPX_BP_CLEAR_PENDING; 1258 } 1259 break; 1260 case WHPX_BP_CLEAR_PENDING: 1261 if (resuming) { 1262 state = WHPX_BP_SET; 1263 } 1264 break; 1265 } 1266 1267 if (state == WHPX_BP_SET_PENDING) { 1268 /* Remember the original instruction. */ 1269 rc = cpu_memory_rw_debug(cpu, 1270 breakpoints->data[i].address, 1271 &breakpoints->data[i].original_instruction, 1272 1, 1273 false); 1274 1275 if (!rc) { 1276 /* Write the breakpoint instruction. */ 1277 rc = cpu_memory_rw_debug(cpu, 1278 breakpoints->data[i].address, 1279 (void *)&whpx_breakpoint_instruction, 1280 1, 1281 true); 1282 } 1283 1284 if (!rc) { 1285 state = WHPX_BP_SET; 1286 } 1287 1288 } 1289 1290 if (state == WHPX_BP_CLEAR_PENDING) { 1291 /* Restore the original instruction. */ 1292 rc = cpu_memory_rw_debug(cpu, 1293 breakpoints->data[i].address, 1294 &breakpoints->data[i].original_instruction, 1295 1, 1296 true); 1297 1298 if (!rc) { 1299 state = WHPX_BP_CLEARED; 1300 } 1301 } 1302 1303 breakpoints->data[i].state = state; 1304 } 1305 } 1306 1307 /* 1308 * This function is called when the a VCPU is about to start and no other 1309 * VCPUs have been started so far. Since the VCPU start order could be 1310 * arbitrary, it doesn't have to be VCPU#0. 1311 * 1312 * It is used to commit the breakpoints into memory, and configure WHPX 1313 * to intercept debug exceptions. 1314 * 1315 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or 1316 * more VCPUs are already running, so this is the best place to do it. 
1317 */ 1318 static int whpx_first_vcpu_starting(CPUState *cpu) 1319 { 1320 struct whpx_state *whpx = &whpx_global; 1321 HRESULT hr; 1322 1323 g_assert(bql_locked()); 1324 1325 if (!QTAILQ_EMPTY(&cpu->breakpoints) || 1326 (whpx->breakpoints.breakpoints && 1327 whpx->breakpoints.breakpoints->used)) { 1328 CPUBreakpoint *bp; 1329 int i = 0; 1330 bool update_pending = false; 1331 1332 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1333 if (i >= whpx->breakpoints.original_address_count || 1334 bp->pc != whpx->breakpoints.original_addresses[i]) { 1335 update_pending = true; 1336 } 1337 1338 i++; 1339 } 1340 1341 if (i != whpx->breakpoints.original_address_count) { 1342 update_pending = true; 1343 } 1344 1345 if (update_pending) { 1346 /* 1347 * The CPU breakpoints have changed since the last call to 1348 * whpx_translate_cpu_breakpoints(). WHPX breakpoints must 1349 * now be recomputed. 1350 */ 1351 whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i); 1352 } 1353 1354 /* Actually insert the breakpoints into the memory. */ 1355 whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true); 1356 } 1357 1358 uint64_t exception_mask; 1359 if (whpx->step_pending || 1360 (whpx->breakpoints.breakpoints && 1361 whpx->breakpoints.breakpoints->used)) { 1362 /* 1363 * We are either attempting to single-step one or more CPUs, or 1364 * have one or more breakpoints enabled. Both require intercepting 1365 * the WHvX64ExceptionTypeBreakpointTrap exception. 1366 */ 1367 1368 exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault; 1369 } else { 1370 /* Let the guest handle all exceptions. */ 1371 exception_mask = 0; 1372 } 1373 1374 hr = whpx_set_exception_exit_bitmap(exception_mask); 1375 if (!SUCCEEDED(hr)) { 1376 error_report("WHPX: Failed to update exception exit mask," 1377 "hr=%08lx.", hr); 1378 return 1; 1379 } 1380 1381 return 0; 1382 } 1383 1384 /* 1385 * This function is called when the last VCPU has finished running. 
 * It is used to remove any previously set breakpoints from memory.
 */
static int whpx_last_vcpu_stopping(CPUState *cpu)
{
    whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
    return 0;
}

/* Returns the address of the next instruction that is about to be executed. */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
    if (cpu->vcpu_dirty) {
        /* The CPU registers have been modified by other parts of QEMU. */
        return cpu_env(cpu)->eip;
    } else if (exit_context_valid) {
        /*
         * The CPU registers have been modified neither by other parts
         * of QEMU, nor by this port calling WHvSetVirtualProcessorRegisters().
         * This is the most common case.
         */
        AccelCPUState *vcpu = cpu->accel;
        return vcpu->exit_ctx.VpContext.Rip;
    } else {
        /*
         * The CPU registers have been modified by a call to
         * WHvSetVirtualProcessorRegisters() and must be re-queried from
         * the target.
         */
        WHV_REGISTER_VALUE reg_value;
        WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
        HRESULT hr;
        struct whpx_state *whpx = &whpx_global;

        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get PC, hr=%08lx", hr);
            return 0;
        }

        return reg_value.Reg64;
    }
}

/*
 * Handles a guest HLT. Returns 1 (and marks the VCPU halted) when no
 * interrupt can currently wake the guest, 0 when execution should resume.
 */
static int whpx_handle_halt(CPUState *cpu)
{
    int ret = 0;

    bql_lock();
    /* Stay halted unless a deliverable hard interrupt or an NMI is pending. */
    if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
          (cpu_env(cpu)->eflags & IF_MASK)) &&
        !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }
    bql_unlock();

    return ret;
}

/*
 * Prepares the VCPU for the next WHvRunVirtualProcessor() call: injects
 * pending NMIs/interrupts, syncs the TPR into CR8, and registers for an
 * interrupt-window notification when an interrupt cannot be injected yet.
 * All accumulated register changes are pushed in a single WHv call.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    bql_lock();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        /* NOTE(review): SMI is acknowledged but not delivered here. */
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        /* APIC-in-platform: deliver the PIC interrupt as an ExtInt event. */
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
    }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }

    return;
}

/*
 * Mirrors the post-exit VP context back into QEMU's state: RFLAGS, the
 * TPR (propagated to the APIC), and the interruption/interrupt-shadow
 * flags used by the next pre-run pass.
 */
static void whpx_vcpu_post_run(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
    if (vcpu->tpr != tpr) {
        vcpu->tpr = tpr;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;

    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;

    return;
}

/*
 * Services asynchronous VCPU events (INIT, SIPI, TPR reports, APIC poll)
 * before entering the guest; may unhalt the CPU.
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;

    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    /* A deliverable hard interrupt or an NMI wakes a halted CPU. */
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }

    return;
}

/*
 * Runs the VCPU until an event that must be handled by QEMU occurs.
 * Dispatches each WHvRunVirtualProcessor() exit reason; handles the special
 * case of stepping over an active software breakpoint in exclusive mode.
 * Returns non-zero only on a fatal error (callers abort).
 */
static int whpx_vcpu_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    struct whpx_breakpoint *stepped_over_bp = NULL;
    WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
    int ret;

    g_assert(bql_locked());

    if (whpx->running_cpus++ == 0) {
        /* Insert breakpoints into memory, update exception exit bitmap. */
        ret = whpx_first_vcpu_starting(cpu);
        if (ret != 0) {
            return ret;
        }
    }

    if (whpx->breakpoints.breakpoints &&
        whpx->breakpoints.breakpoints->used > 0)
    {
        uint64_t pc = whpx_vcpu_get_pc(cpu, true);
        stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
        if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
            stepped_over_bp = NULL;
        }

        if (stepped_over_bp) {
            /*
             * We are trying to run the instruction overwritten by an active
             * breakpoint. We will temporarily disable the breakpoint, suspend
             * other CPUs, and step over the instruction.
             */
            exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
        }
    }

    if (exclusive_step_mode == WHPX_STEP_NONE) {
        whpx_vcpu_process_async_events(cpu);
        if (cpu->halted && !whpx_apic_in_platform()) {
            cpu->exception_index = EXCP_HLT;
            qatomic_set(&cpu->exit_request, false);
            return 0;
        }
    }

    bql_unlock();

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        start_exclusive();
        g_assert(cpu == current_cpu);
        g_assert(!cpu->running);
        cpu->running = true;

        hr = whpx_set_exception_exit_bitmap(
            1UL << WHvX64ExceptionTypeDebugTrapOrFault);
        if (!SUCCEEDED(hr)) {
            /*
             * NOTE(review): returns while still in exclusive context and
             * with the BQL released — confirm this is only reachable on a
             * fatal path where the caller aborts.
             */
            error_report("WHPX: Failed to update exception exit mask, "
                         "hr=%08lx.", hr);
            return 1;
        }

        if (stepped_over_bp) {
            /* Temporarily disable the triggered breakpoint. */
            cpu_memory_rw_debug(cpu,
                stepped_over_bp->address,
                &stepped_over_bp->original_instruction,
                1,
                true);
        }
    } else {
        cpu_exec_start(cpu);
    }

    do {
        if (cpu->vcpu_dirty) {
            whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        if (exclusive_step_mode == WHPX_STEP_NONE) {
            whpx_vcpu_pre_run(cpu);

            if (qatomic_read(&cpu->exit_request)) {
                whpx_vcpu_kick(cpu);
            }
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu, true, NULL);
        }

        hr = whp_dispatch.WHvRunVirtualProcessor(
            whpx->partition, cpu->cpu_index,
            &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to exec a virtual processor,"
                         " hr=%08lx", hr);
            ret = -1;
            break;
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu,
                false,
                &vcpu->exit_ctx.VpContext.Rflags);
        }

        whpx_vcpu_post_run(cpu);

        switch (vcpu->exit_ctx.ExitReason) {
        case WHvRunVpExitReasonMemoryAccess:
            ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
            break;

        case WHvRunVpExitReasonX64IoPortAccess:
            ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
            break;

        case WHvRunVpExitReasonX64InterruptWindow:
            vcpu->ready_for_pic_interrupt = 1;
            vcpu->window_registered = 0;
            ret = 0;
            break;

        case WHvRunVpExitReasonX64ApicEoi:
            /* NOTE(review): ret is not set here; the loop reuses the
             * previous iteration's value — confirm intentional. */
            assert(whpx_apic_in_platform());
            ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
            break;

        case WHvRunVpExitReasonX64Halt:
            /*
             * WARNING: as of build 19043.1526 (21H1), this exit reason is no
             * longer used.
             */
            ret = whpx_handle_halt(cpu);
            break;

        case WHvRunVpExitReasonX64ApicInitSipiTrap: {
            WHV_INTERRUPT_CONTROL ipi = {0};
            uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
            uint32_t delivery_mode =
                (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
            int dest_shorthand =
                (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
            bool broadcast = false;
            bool include_self = false;
            uint32_t i;

            /* We only registered for INIT and SIPI exits. */
            if ((delivery_mode != APIC_DM_INIT) &&
                (delivery_mode != APIC_DM_SIPI)) {
                error_report(
                    "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
                break;
            }

            if (delivery_mode == APIC_DM_INIT) {
                ipi.Type = WHvX64InterruptTypeInit;
            } else {
                ipi.Type = WHvX64InterruptTypeSipi;
            }

            ipi.DestinationMode =
                ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
                    WHvX64InterruptDestinationModeLogical :
                    WHvX64InterruptDestinationModePhysical;

            ipi.TriggerMode =
                ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
                    WHvX64InterruptTriggerModeLevel :
                    WHvX64InterruptTriggerModeEdge;

            ipi.Vector = icr & APIC_VECTOR_MASK;
            switch (dest_shorthand) {
            /* no shorthand. Bits 56-63 contain the destination. */
            case 0:
                ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report("WHPX: Failed to request interrupt  hr=%08lx",
                        hr);
                }

                break;

            /* self */
            case 1:
                include_self = true;
                break;

            /* broadcast, including self */
            case 2:
                broadcast = true;
                include_self = true;
                break;

            /* broadcast, excluding self */
            case 3:
                broadcast = true;
                break;
            }

            if (!broadcast && !include_self) {
                break;
            }

            for (i = 0; i <= max_vcpu_index; i++) {
                if (i == cpu->cpu_index && !include_self) {
                    continue;
                }

                /*
                 * Assuming that APIC Ids are identity mapped since
                 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
                 * are not handled yet and the hypervisor doesn't allow the
                 * guest to modify the APIC ID.
                 */
                ipi.Destination = i;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report(
                        "WHPX: Failed to request SIPI for %d,  hr=%08lx",
                        i, hr);
                }
            }

            break;
        }

        case WHvRunVpExitReasonCanceled:
            if (exclusive_step_mode != WHPX_STEP_NONE) {
                /*
                 * We are trying to step over a single instruction, and
                 * likely got a request to stop from another thread.
                 * Delay it until we are done stepping
                 * over.
                 */
                ret = 0;
            } else {
                cpu->exception_index = EXCP_INTERRUPT;
                ret = 1;
            }
            break;
        case WHvRunVpExitReasonX64MsrAccess: {
            WHV_REGISTER_VALUE reg_values[3] = {0};
            WHV_REGISTER_NAME reg_names[3];
            UINT32 reg_count;

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRdx;

            /* Advance RIP past the rdmsr/wrmsr instruction. */
            reg_values[0].Reg64 =
                vcpu->exit_ctx.VpContext.Rip +
                vcpu->exit_ctx.VpContext.InstructionLength;

            /*
             * For all unsupported MSR access we:
             *     ignore writes
             *     return 0 on read.
             */
            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
                        1 : 3;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                reg_names, reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set MsrAccess state "
                             " registers, hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonX64Cpuid: {
            WHV_REGISTER_VALUE reg_values[5];
            WHV_REGISTER_NAME reg_names[5];
            UINT32 reg_count = 5;
            UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
            X86CPU *x86_cpu = X86_CPU(cpu);
            CPUX86State *env = &x86_cpu->env;

            memset(reg_values, 0, sizeof(reg_values));

            rip = vcpu->exit_ctx.VpContext.Rip +
                  vcpu->exit_ctx.VpContext.InstructionLength;
            cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;

            /*
             * Ideally, these should be supplied to the hypervisor during VCPU
             * initialization and it should be able to satisfy this request.
             * But, currently, WHPX doesn't support setting CPUID values in the
             * hypervisor once the partition has been setup, which is too late
             * since VCPUs are realized later. For now, use the values from
             * QEMU to satisfy these requests, until WHPX adds support for
             * being able to set these values in the hypervisor at runtime.
             */
            cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
                (UINT32 *)&rcx, (UINT32 *)&rdx);
            switch (cpuid_fn) {
            case 0x40000000:
                /* Expose the vmware cpu frequency cpuid leaf */
                rax = 0x40000010;
                rbx = rcx = rdx = 0;
                break;

            case 0x40000010:
                rax = env->tsc_khz;
                rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
                rcx = rdx = 0;
                break;

            case 0x80000001:
                /* Remove any support of OSVW */
                rcx &= ~CPUID_EXT3_OSVW;
                break;
            }

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRcx;
            reg_names[3] = WHvX64RegisterRdx;
            reg_names[4] = WHvX64RegisterRbx;

            reg_values[0].Reg64 = rip;
            reg_values[1].Reg64 = rax;
            reg_values[2].Reg64 = rcx;
            reg_values[3].Reg64 = rdx;
            reg_values[4].Reg64 = rbx;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition, cpu->cpu_index,
                reg_names,
                reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set CpuidAccess state registers,"
                             " hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonException:
            whpx_get_registers(cpu);

            if ((vcpu->exit_ctx.VpException.ExceptionType ==
                 WHvX64ExceptionTypeDebugTrapOrFault) &&
                (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
                (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
                 whpx_breakpoint_instruction)) {
                /* Stopped at a software breakpoint. */
                cpu->exception_index = EXCP_DEBUG;
            } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
                        WHvX64ExceptionTypeDebugTrapOrFault) &&
                       !cpu->singlestep_enabled) {
                /*
                 * Just finished stepping over a breakpoint, but the
                 * gdb does not expect us to do single-stepping.
                 * Don't do anything special.
                 */
                cpu->exception_index = EXCP_INTERRUPT;
            } else {
                /* Another exception or debug event. Report it to GDB. */
                cpu->exception_index = EXCP_DEBUG;
            }

            ret = 1;
            break;
        case WHvRunVpExitReasonNone:
        case WHvRunVpExitReasonUnrecoverableException:
        case WHvRunVpExitReasonInvalidVpRegisterValue:
        case WHvRunVpExitReasonUnsupportedFeature:
        default:
            error_report("WHPX: Unexpected VP exit code %d",
                         vcpu->exit_ctx.ExitReason);
            whpx_get_registers(cpu);
            bql_lock();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            bql_unlock();
            break;
        }

    } while (!ret);

    if (stepped_over_bp) {
        /* Restore the breakpoint we stepped over */
        cpu_memory_rw_debug(cpu,
            stepped_over_bp->address,
            (void *)&whpx_breakpoint_instruction,
            1,
            true);
    }

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        g_assert(cpu_in_exclusive_context(cpu));
        cpu->running = false;
        end_exclusive();

        exclusive_step_mode = WHPX_STEP_NONE;
    } else {
        cpu_exec_end(cpu);
    }

    bql_lock();
    current_cpu = cpu;

    if (--whpx->running_cpus == 0) {
        /* Remove breakpoints from memory while no VCPU is running. */
        whpx_last_vcpu_stopping(cpu);
    }

    qatomic_set(&cpu->exit_request, false);

    return ret < 0;
}

/* Pull registers from the hypervisor into QEMU's state (run_on_cpu body). */
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (!cpu->vcpu_dirty) {
        whpx_get_registers(cpu);
        cpu->vcpu_dirty = true;
    }
}

/* Push QEMU's reset-time state into the hypervisor (run_on_cpu body). */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->vcpu_dirty = false;
}

/* Push QEMU's full state into the hypervisor (run_on_cpu body). */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->vcpu_dirty = false;
}

static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
run_on_cpu_data arg) 2088 { 2089 cpu->vcpu_dirty = true; 2090 } 2091 2092 /* 2093 * CPU support. 2094 */ 2095 2096 void whpx_cpu_synchronize_state(CPUState *cpu) 2097 { 2098 if (!cpu->vcpu_dirty) { 2099 run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL); 2100 } 2101 } 2102 2103 void whpx_cpu_synchronize_post_reset(CPUState *cpu) 2104 { 2105 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2106 } 2107 2108 void whpx_cpu_synchronize_post_init(CPUState *cpu) 2109 { 2110 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2111 } 2112 2113 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu) 2114 { 2115 run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2116 } 2117 2118 void whpx_cpu_synchronize_pre_resume(bool step_pending) 2119 { 2120 whpx_global.step_pending = step_pending; 2121 } 2122 2123 /* 2124 * Vcpu support. 2125 */ 2126 2127 static Error *whpx_migration_blocker; 2128 2129 static void whpx_cpu_update_state(void *opaque, bool running, RunState state) 2130 { 2131 CPUX86State *env = opaque; 2132 2133 if (running) { 2134 env->tsc_valid = false; 2135 } 2136 } 2137 2138 int whpx_init_vcpu(CPUState *cpu) 2139 { 2140 HRESULT hr; 2141 struct whpx_state *whpx = &whpx_global; 2142 AccelCPUState *vcpu = NULL; 2143 Error *local_error = NULL; 2144 X86CPU *x86_cpu = X86_CPU(cpu); 2145 CPUX86State *env = &x86_cpu->env; 2146 UINT64 freq = 0; 2147 int ret; 2148 2149 /* Add migration blockers for all unsupported features of the 2150 * Windows Hypervisor Platform 2151 */ 2152 if (whpx_migration_blocker == NULL) { 2153 error_setg(&whpx_migration_blocker, 2154 "State blocked due to non-migratable CPUID feature support," 2155 "dirty memory tracking support, and XSAVE/XRSTOR support"); 2156 2157 if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) { 2158 error_report_err(local_error); 2159 ret = -EINVAL; 2160 goto error; 2161 } 2162 } 2163 2164 vcpu = g_new0(AccelCPUState, 1); 2165 2166 hr = 
whp_dispatch.WHvEmulatorCreateEmulator( 2167 &whpx_emu_callbacks, 2168 &vcpu->emulator); 2169 if (FAILED(hr)) { 2170 error_report("WHPX: Failed to setup instruction completion support," 2171 " hr=%08lx", hr); 2172 ret = -EINVAL; 2173 goto error; 2174 } 2175 2176 hr = whp_dispatch.WHvCreateVirtualProcessor( 2177 whpx->partition, cpu->cpu_index, 0); 2178 if (FAILED(hr)) { 2179 error_report("WHPX: Failed to create a virtual processor," 2180 " hr=%08lx", hr); 2181 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2182 ret = -EINVAL; 2183 goto error; 2184 } 2185 2186 /* 2187 * vcpu's TSC frequency is either specified by user, or use the value 2188 * provided by Hyper-V if the former is not present. In the latter case, we 2189 * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC 2190 * frequency can be migrated later via this field. 2191 */ 2192 if (!env->tsc_khz) { 2193 hr = whp_dispatch.WHvGetCapability( 2194 WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq), 2195 NULL); 2196 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2197 if (FAILED(hr)) { 2198 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr); 2199 } else { 2200 env->tsc_khz = freq / 1000; /* Hz to KHz */ 2201 } 2202 } 2203 } 2204 2205 env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY; 2206 hr = whp_dispatch.WHvGetCapability( 2207 WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL); 2208 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2209 if (FAILED(hr)) { 2210 printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr); 2211 } else { 2212 env->apic_bus_freq = freq; 2213 } 2214 } 2215 2216 /* 2217 * If the vmware cpuid frequency leaf option is set, and we have a valid 2218 * tsc value, trap the corresponding cpuid's. 
2219 */ 2220 if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) { 2221 UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010}; 2222 2223 hr = whp_dispatch.WHvSetPartitionProperty( 2224 whpx->partition, 2225 WHvPartitionPropertyCodeCpuidExitList, 2226 cpuidExitList, 2227 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2228 2229 if (FAILED(hr)) { 2230 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2231 hr); 2232 ret = -EINVAL; 2233 goto error; 2234 } 2235 } 2236 2237 vcpu->interruptable = true; 2238 cpu->vcpu_dirty = true; 2239 cpu->accel = vcpu; 2240 max_vcpu_index = max(max_vcpu_index, cpu->cpu_index); 2241 qemu_add_vm_change_state_handler(whpx_cpu_update_state, env); 2242 2243 return 0; 2244 2245 error: 2246 g_free(vcpu); 2247 2248 return ret; 2249 } 2250 2251 int whpx_vcpu_exec(CPUState *cpu) 2252 { 2253 int ret; 2254 int fatal; 2255 2256 for (;;) { 2257 if (cpu->exception_index >= EXCP_INTERRUPT) { 2258 ret = cpu->exception_index; 2259 cpu->exception_index = -1; 2260 break; 2261 } 2262 2263 fatal = whpx_vcpu_run(cpu); 2264 2265 if (fatal) { 2266 error_report("WHPX: Failed to exec a virtual processor"); 2267 abort(); 2268 } 2269 } 2270 2271 return ret; 2272 } 2273 2274 void whpx_destroy_vcpu(CPUState *cpu) 2275 { 2276 struct whpx_state *whpx = &whpx_global; 2277 AccelCPUState *vcpu = cpu->accel; 2278 2279 whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index); 2280 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2281 g_free(cpu->accel); 2282 return; 2283 } 2284 2285 void whpx_vcpu_kick(CPUState *cpu) 2286 { 2287 struct whpx_state *whpx = &whpx_global; 2288 whp_dispatch.WHvCancelRunVirtualProcessor( 2289 whpx->partition, cpu->cpu_index, 0); 2290 } 2291 2292 /* 2293 * Memory support. 
 */

/*
 * Map or unmap a guest-physical range in the WHPX partition.
 * @add: non-zero to map, zero to unmap; @rom: non-zero maps read/execute
 * only (no write flag).  Failures are reported but not propagated —
 * callers (the memory listener) have no error path.
 */
static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    /*
    if (add) {
        printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
               (void*)start_pa, (void*)size, host_va,
               (rom ? "ROM" : "RAM"), name);
    } else {
        printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n",
               (void*)start_pa, (void*)size, host_va, name);
    }
    */

    if (add) {
        hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
                                         host_va,
                                         start_pa,
                                         size,
                                         (WHvMapGpaRangeFlagRead |
                                          WHvMapGpaRangeFlagExecute |
                                          (rom ? 0 : WHvMapGpaRangeFlagWrite)));
    } else {
        hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
                                           start_pa,
                                           size);
    }

    if (FAILED(hr)) {
        error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                     " Host:%p, hr=%08lx",
                     (add ? "MAP" : "UNMAP"), name,
                     (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
    }
}

/*
 * Translate a MemoryRegionSection into a page-aligned GPA mapping and
 * apply it.  Non-RAM regions and sections smaller than a host page after
 * alignment are ignored.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int delta;
    uint64_t host_va;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    /* delta = bytes needed to round start_pa up to the next host-page
     * boundary (0 if already aligned, thanks to the second mask). */
    delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
    delta &= ~qemu_real_host_page_mask();
    if (delta > size) {
        return;
    }
    start_pa += delta;
    size -= delta;
    /* Truncate the length down to a whole number of host pages. */
    size &= qemu_real_host_page_mask();
    if (!size || (start_pa & ~qemu_real_host_page_mask())) {
        return;
    }

    /* Host virtual address corresponding to the aligned start. */
    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + delta;

    whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}

/* MemoryListener callback: a section was added to the address space. */
static void whpx_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}

/* MemoryListener callback: a section was removed from the address space. */
static void whpx_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}

/* No transaction batching is done; these hooks are intentionally empty. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}

static void whpx_transaction_commit(MemoryListener *listener)
{
}

/*
 * Dirty-page tracking is not supported by this backend, so conservatively
 * mark the whole RAM section dirty on every sync.
 */
static void whpx_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    memory_region_set_dirty(mr, 0, int128_get64(section->size));
}

static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
};

/* Register the listener on the system memory address space. */
static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}

/*
 * Load the functions from the given library, using the given handle. If a
 * handle is provided, it is used, otherwise the library is opened. The
 * handle will be updated on return with the opened one.
2420 */ 2421 static bool load_whp_dispatch_fns(HMODULE *handle, 2422 WHPFunctionList function_list) 2423 { 2424 HMODULE hLib = *handle; 2425 2426 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll" 2427 #define WINHV_EMULATION_DLL "WinHvEmulation.dll" 2428 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \ 2429 whp_dispatch.function_name = \ 2430 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2431 2432 #define WHP_LOAD_FIELD(return_type, function_name, signature) \ 2433 whp_dispatch.function_name = \ 2434 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2435 if (!whp_dispatch.function_name) { \ 2436 error_report("Could not load function %s", #function_name); \ 2437 goto error; \ 2438 } \ 2439 2440 #define WHP_LOAD_LIB(lib_name, handle_lib) \ 2441 if (!handle_lib) { \ 2442 handle_lib = LoadLibrary(lib_name); \ 2443 if (!handle_lib) { \ 2444 error_report("Could not load library %s.", lib_name); \ 2445 goto error; \ 2446 } \ 2447 } \ 2448 2449 switch (function_list) { 2450 case WINHV_PLATFORM_FNS_DEFAULT: 2451 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2452 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD) 2453 break; 2454 2455 case WINHV_EMULATION_FNS_DEFAULT: 2456 WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib) 2457 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD) 2458 break; 2459 2460 case WINHV_PLATFORM_FNS_SUPPLEMENTAL: 2461 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2462 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL) 2463 break; 2464 } 2465 2466 *handle = hLib; 2467 return true; 2468 2469 error: 2470 if (hLib) { 2471 FreeLibrary(hLib); 2472 } 2473 2474 return false; 2475 } 2476 2477 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v, 2478 const char *name, void *opaque, 2479 Error **errp) 2480 { 2481 struct whpx_state *whpx = &whpx_global; 2482 OnOffSplit mode; 2483 2484 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 2485 return; 2486 } 2487 2488 switch (mode) { 2489 case ON_OFF_SPLIT_ON: 2490 
whpx->kernel_irqchip_allowed = true; 2491 whpx->kernel_irqchip_required = true; 2492 break; 2493 2494 case ON_OFF_SPLIT_OFF: 2495 whpx->kernel_irqchip_allowed = false; 2496 whpx->kernel_irqchip_required = false; 2497 break; 2498 2499 case ON_OFF_SPLIT_SPLIT: 2500 error_setg(errp, "WHPX: split irqchip currently not supported"); 2501 error_append_hint(errp, 2502 "Try without kernel-irqchip or with kernel-irqchip=on|off"); 2503 break; 2504 2505 default: 2506 /* 2507 * The value was checked in visit_type_OnOffSplit() above. If 2508 * we get here, then something is wrong in QEMU. 2509 */ 2510 abort(); 2511 } 2512 } 2513 2514 /* 2515 * Partition support 2516 */ 2517 2518 static int whpx_accel_init(MachineState *ms) 2519 { 2520 struct whpx_state *whpx; 2521 int ret; 2522 HRESULT hr; 2523 WHV_CAPABILITY whpx_cap; 2524 UINT32 whpx_cap_size; 2525 WHV_PARTITION_PROPERTY prop; 2526 UINT32 cpuidExitList[] = {1, 0x80000001}; 2527 WHV_CAPABILITY_FEATURES features = {0}; 2528 2529 whpx = &whpx_global; 2530 2531 if (!init_whp_dispatch()) { 2532 ret = -ENOSYS; 2533 goto error; 2534 } 2535 2536 whpx->mem_quota = ms->ram_size; 2537 2538 hr = whp_dispatch.WHvGetCapability( 2539 WHvCapabilityCodeHypervisorPresent, &whpx_cap, 2540 sizeof(whpx_cap), &whpx_cap_size); 2541 if (FAILED(hr) || !whpx_cap.HypervisorPresent) { 2542 error_report("WHPX: No accelerator found, hr=%08lx", hr); 2543 ret = -ENOSPC; 2544 goto error; 2545 } 2546 2547 hr = whp_dispatch.WHvGetCapability( 2548 WHvCapabilityCodeFeatures, &features, sizeof(features), NULL); 2549 if (FAILED(hr)) { 2550 error_report("WHPX: Failed to query capabilities, hr=%08lx", hr); 2551 ret = -EINVAL; 2552 goto error; 2553 } 2554 2555 hr = whp_dispatch.WHvCreatePartition(&whpx->partition); 2556 if (FAILED(hr)) { 2557 error_report("WHPX: Failed to create partition, hr=%08lx", hr); 2558 ret = -EINVAL; 2559 goto error; 2560 } 2561 2562 /* 2563 * Query the XSAVE capability of the partition. Any error here is not 2564 * considered fatal. 
2565 */ 2566 hr = whp_dispatch.WHvGetPartitionProperty( 2567 whpx->partition, 2568 WHvPartitionPropertyCodeProcessorXsaveFeatures, 2569 &whpx_xsave_cap, 2570 sizeof(whpx_xsave_cap), 2571 &whpx_cap_size); 2572 2573 /* 2574 * Windows version which don't support this property will return with the 2575 * specific error code. 2576 */ 2577 if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) { 2578 error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr); 2579 } 2580 2581 if (!whpx_has_xsave()) { 2582 printf("WHPX: Partition is not XSAVE capable\n"); 2583 } 2584 2585 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2586 prop.ProcessorCount = ms->smp.cpus; 2587 hr = whp_dispatch.WHvSetPartitionProperty( 2588 whpx->partition, 2589 WHvPartitionPropertyCodeProcessorCount, 2590 &prop, 2591 sizeof(WHV_PARTITION_PROPERTY)); 2592 2593 if (FAILED(hr)) { 2594 error_report("WHPX: Failed to set partition processor count to %u," 2595 " hr=%08lx", prop.ProcessorCount, hr); 2596 ret = -EINVAL; 2597 goto error; 2598 } 2599 2600 /* 2601 * Error out if WHP doesn't support apic emulation and user is requiring 2602 * it. 2603 */ 2604 if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation || 2605 !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) { 2606 error_report("WHPX: kernel irqchip requested, but unavailable. 
" 2607 "Try without kernel-irqchip or with kernel-irqchip=off"); 2608 ret = -EINVAL; 2609 goto error; 2610 } 2611 2612 if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation && 2613 whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) { 2614 WHV_X64_LOCAL_APIC_EMULATION_MODE mode = 2615 WHvX64LocalApicEmulationModeXApic; 2616 printf("WHPX: setting APIC emulation mode in the hypervisor\n"); 2617 hr = whp_dispatch.WHvSetPartitionProperty( 2618 whpx->partition, 2619 WHvPartitionPropertyCodeLocalApicEmulationMode, 2620 &mode, 2621 sizeof(mode)); 2622 if (FAILED(hr)) { 2623 error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr); 2624 if (whpx->kernel_irqchip_required) { 2625 error_report("WHPX: kernel irqchip requested, but unavailable"); 2626 ret = -EINVAL; 2627 goto error; 2628 } 2629 } else { 2630 whpx->apic_in_platform = true; 2631 } 2632 } 2633 2634 /* Register for MSR and CPUID exits */ 2635 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2636 prop.ExtendedVmExits.X64MsrExit = 1; 2637 prop.ExtendedVmExits.X64CpuidExit = 1; 2638 prop.ExtendedVmExits.ExceptionExit = 1; 2639 if (whpx_apic_in_platform()) { 2640 prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1; 2641 } 2642 2643 hr = whp_dispatch.WHvSetPartitionProperty( 2644 whpx->partition, 2645 WHvPartitionPropertyCodeExtendedVmExits, 2646 &prop, 2647 sizeof(WHV_PARTITION_PROPERTY)); 2648 if (FAILED(hr)) { 2649 error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr); 2650 ret = -EINVAL; 2651 goto error; 2652 } 2653 2654 hr = whp_dispatch.WHvSetPartitionProperty( 2655 whpx->partition, 2656 WHvPartitionPropertyCodeCpuidExitList, 2657 cpuidExitList, 2658 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2659 2660 if (FAILED(hr)) { 2661 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2662 hr); 2663 ret = -EINVAL; 2664 goto error; 2665 } 2666 2667 /* 2668 * We do not want to intercept any exceptions from the guest, 2669 * until we actually start debugging 
with gdb. 2670 */ 2671 whpx->exception_exit_bitmap = -1; 2672 hr = whpx_set_exception_exit_bitmap(0); 2673 2674 if (FAILED(hr)) { 2675 error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr); 2676 ret = -EINVAL; 2677 goto error; 2678 } 2679 2680 hr = whp_dispatch.WHvSetupPartition(whpx->partition); 2681 if (FAILED(hr)) { 2682 error_report("WHPX: Failed to setup partition, hr=%08lx", hr); 2683 ret = -EINVAL; 2684 goto error; 2685 } 2686 2687 whpx_memory_init(); 2688 2689 printf("Windows Hypervisor Platform accelerator is operational\n"); 2690 return 0; 2691 2692 error: 2693 2694 if (NULL != whpx->partition) { 2695 whp_dispatch.WHvDeletePartition(whpx->partition); 2696 whpx->partition = NULL; 2697 } 2698 2699 return ret; 2700 } 2701 2702 int whpx_enabled(void) 2703 { 2704 return whpx_allowed; 2705 } 2706 2707 bool whpx_apic_in_platform(void) { 2708 return whpx_global.apic_in_platform; 2709 } 2710 2711 static void whpx_accel_class_init(ObjectClass *oc, void *data) 2712 { 2713 AccelClass *ac = ACCEL_CLASS(oc); 2714 ac->name = "WHPX"; 2715 ac->init_machine = whpx_accel_init; 2716 ac->allowed = &whpx_allowed; 2717 2718 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 2719 NULL, whpx_set_kernel_irqchip, 2720 NULL, NULL); 2721 object_class_property_set_description(oc, "kernel-irqchip", 2722 "Configure WHPX in-kernel irqchip"); 2723 } 2724 2725 static void whpx_accel_instance_init(Object *obj) 2726 { 2727 struct whpx_state *whpx = &whpx_global; 2728 2729 memset(whpx, 0, sizeof(struct whpx_state)); 2730 /* Turn on kernel-irqchip, by default */ 2731 whpx->kernel_irqchip_allowed = true; 2732 } 2733 2734 static const TypeInfo whpx_accel_type = { 2735 .name = ACCEL_CLASS_NAME("whpx"), 2736 .parent = TYPE_ACCEL, 2737 .instance_init = whpx_accel_instance_init, 2738 .class_init = whpx_accel_class_init, 2739 }; 2740 2741 static void whpx_type_init(void) 2742 { 2743 type_register_static(&whpx_accel_type); 2744 } 2745 2746 bool init_whp_dispatch(void) 
2747 { 2748 if (whp_dispatch_initialized) { 2749 return true; 2750 } 2751 2752 if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) { 2753 goto error; 2754 } 2755 2756 if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) { 2757 goto error; 2758 } 2759 2760 assert(load_whp_dispatch_fns(&hWinHvPlatform, 2761 WINHV_PLATFORM_FNS_SUPPLEMENTAL)); 2762 whp_dispatch_initialized = true; 2763 2764 return true; 2765 error: 2766 if (hWinHvPlatform) { 2767 FreeLibrary(hWinHvPlatform); 2768 } 2769 2770 if (hWinHvEmulation) { 2771 FreeLibrary(hWinHvEmulation); 2772 } 2773 2774 return false; 2775 } 2776 2777 type_init(whpx_type_init); 2778