1 /* 2 * QEMU Windows Hypervisor Platform accelerator (WHPX) 3 * 4 * Copyright Microsoft Corp. 2017 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2 or later. 7 * See the COPYING file in the top-level directory. 8 * 9 */ 10 11 #include "qemu/osdep.h" 12 #include "cpu.h" 13 #include "exec/address-spaces.h" 14 #include "exec/ioport.h" 15 #include "exec/gdbstub.h" 16 #include "qemu/accel.h" 17 #include "sysemu/whpx.h" 18 #include "sysemu/cpus.h" 19 #include "sysemu/runstate.h" 20 #include "qemu/main-loop.h" 21 #include "hw/boards.h" 22 #include "hw/i386/ioapic.h" 23 #include "hw/i386/apic_internal.h" 24 #include "qemu/error-report.h" 25 #include "qapi/error.h" 26 #include "qapi/qapi-types-common.h" 27 #include "qapi/qapi-visit-common.h" 28 #include "migration/blocker.h" 29 #include <winerror.h> 30 31 #include "whpx-internal.h" 32 #include "whpx-accel-ops.h" 33 34 #include <WinHvPlatform.h> 35 #include <WinHvEmulation.h> 36 37 #define HYPERV_APIC_BUS_FREQUENCY (200000000ULL) 38 39 static const WHV_REGISTER_NAME whpx_register_names[] = { 40 41 /* X64 General purpose registers */ 42 WHvX64RegisterRax, 43 WHvX64RegisterRcx, 44 WHvX64RegisterRdx, 45 WHvX64RegisterRbx, 46 WHvX64RegisterRsp, 47 WHvX64RegisterRbp, 48 WHvX64RegisterRsi, 49 WHvX64RegisterRdi, 50 WHvX64RegisterR8, 51 WHvX64RegisterR9, 52 WHvX64RegisterR10, 53 WHvX64RegisterR11, 54 WHvX64RegisterR12, 55 WHvX64RegisterR13, 56 WHvX64RegisterR14, 57 WHvX64RegisterR15, 58 WHvX64RegisterRip, 59 WHvX64RegisterRflags, 60 61 /* X64 Segment registers */ 62 WHvX64RegisterEs, 63 WHvX64RegisterCs, 64 WHvX64RegisterSs, 65 WHvX64RegisterDs, 66 WHvX64RegisterFs, 67 WHvX64RegisterGs, 68 WHvX64RegisterLdtr, 69 WHvX64RegisterTr, 70 71 /* X64 Table registers */ 72 WHvX64RegisterIdtr, 73 WHvX64RegisterGdtr, 74 75 /* X64 Control Registers */ 76 WHvX64RegisterCr0, 77 WHvX64RegisterCr2, 78 WHvX64RegisterCr3, 79 WHvX64RegisterCr4, 80 WHvX64RegisterCr8, 81 82 /* X64 Debug Registers */ 83 /* 84 * WHvX64RegisterDr0, 85 * WHvX64RegisterDr1, 86 * WHvX64RegisterDr2, 87 * WHvX64RegisterDr3, 88 * WHvX64RegisterDr6, 89 * WHvX64RegisterDr7, 90 */ 91 92 /* X64 Floating Point and Vector Registers */ 93 WHvX64RegisterXmm0, 94 WHvX64RegisterXmm1, 95 WHvX64RegisterXmm2, 96 WHvX64RegisterXmm3, 97 WHvX64RegisterXmm4, 98 WHvX64RegisterXmm5, 99 WHvX64RegisterXmm6, 100 WHvX64RegisterXmm7, 101 WHvX64RegisterXmm8, 102 WHvX64RegisterXmm9, 103 WHvX64RegisterXmm10, 104 WHvX64RegisterXmm11, 105 WHvX64RegisterXmm12, 106 WHvX64RegisterXmm13, 107 WHvX64RegisterXmm14, 108 WHvX64RegisterXmm15, 109 WHvX64RegisterFpMmx0, 110 WHvX64RegisterFpMmx1, 111 WHvX64RegisterFpMmx2, 112 WHvX64RegisterFpMmx3, 113 WHvX64RegisterFpMmx4, 114 WHvX64RegisterFpMmx5, 115 WHvX64RegisterFpMmx6, 116 WHvX64RegisterFpMmx7, 117 WHvX64RegisterFpControlStatus, 118 WHvX64RegisterXmmControlStatus, 119 120 /* X64 MSRs */ 121 WHvX64RegisterEfer, 122 #ifdef TARGET_X86_64 123 WHvX64RegisterKernelGsBase, 124 #endif 125 WHvX64RegisterApicBase, 126 /* WHvX64RegisterPat, */ 127 WHvX64RegisterSysenterCs, 128 WHvX64RegisterSysenterEip, 129 WHvX64RegisterSysenterEsp, 130 WHvX64RegisterStar, 131 #ifdef TARGET_X86_64 132 WHvX64RegisterLstar, 133 WHvX64RegisterCstar, 134 WHvX64RegisterSfmask, 135 #endif 136 137 /* Interrupt / Event Registers */ 138 /* 139 * WHvRegisterPendingInterruption, 140 * WHvRegisterInterruptState, 141 * WHvRegisterPendingEvent0, 142 * WHvRegisterPendingEvent1 143 * WHvX64RegisterDeliverabilityNotifications, 144 */ 145 }; 146 147 struct whpx_register_set { 148 WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)]; 149 }; 150 151 /* 152 * The current implementation of instruction stepping sets the TF flag 153 * in RFLAGS, causing the CPU to raise an INT1 after each instruction. 154 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception. 155 * 156 * This approach has a few limitations: 157 * 1. Stepping over a PUSHF/SAHF instruction will save the TF flag 158 * along with the other flags, possibly restoring it later. It would 159 * result in another INT1 when the flags are restored, triggering 160 * a stop in gdb that could be cleared by doing another step. 161 * 162 * Stepping over a POPF/LAHF instruction will let it overwrite the 163 * TF flags, ending the stepping mode. 164 * 165 * 2. Stepping over an instruction raising an exception (e.g. INT, DIV, 166 * or anything that could result in a page fault) will save the flags 167 * to the stack, clear the TF flag, and let the guest execute the 168 * handler. Normally, the guest will restore the original flags, 169 * that will continue single-stepping. 170 * 171 * 3. Debuggers running on the guest may wish to set TF to do instruction 172 * stepping. INT1 events generated by it would be intercepted by us, 173 * as long as the gdb is connected to QEMU. 174 * 175 * In practice this means that: 176 * 1. Stepping through flags-modifying instructions may cause gdb to 177 * continue or stop in unexpected places. This will be fully recoverable 178 * and will not crash the target. 179 * 180 * 2. Stepping over an instruction that triggers an exception will step 181 * over the exception handler, not into it. 182 * 183 * 3. Debugging the guest via gdb, while running debugger on the guest 184 * at the same time may lead to unexpected effects. Removing all 185 * breakpoints set via QEMU will prevent any further interference 186 * with the guest-level debuggers. 187 * 188 * The limitations can be addressed as shown below: 189 * 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of 190 * stepping through them. The exact semantics of the instructions is 191 * defined in the "Combined Volume Set of Intel 64 and IA-32 192 * Architectures Software Developer's Manuals", however it involves a 193 * fair amount of corner cases due to compatibility with real mode, 194 * virtual 8086 mode, and differences between 64-bit and 32-bit modes. 195 * 196 * 2. We could step into the guest's exception handlers using the following 197 * sequence: 198 * a. Temporarily enable catching of all exception types via 199 * whpx_set_exception_exit_bitmap(). 200 * b. Once an exception is intercepted, read the IDT/GDT and locate 201 * the original handler. 202 * c. Patch the original handler, injecting an INT3 at the beginning. 203 * d. Update the exception exit bitmap to only catch the 204 * WHvX64ExceptionTypeBreakpointTrap exception. 205 * e. Let the affected CPU run in the exclusive mode. 206 * f. Restore the original handler and the exception exit bitmap. 207 * Note that handling all corner cases related to IDT/GDT is harder 208 * than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a 209 * rough idea. 210 * 211 * 3. In order to properly support guest-level debugging in parallel with 212 * the QEMU-level debugging, we would need to be able to pass some INT1 213 * events to the guest. This could be done via the following methods: 214 * a. Using the WHvRegisterPendingEvent register. As of Windows 21H1, 215 * it seems to only work for interrupts and not software 216 * exceptions. 217 * b. Locating and patching the original handler by parsing IDT/GDT. 218 * This involves relatively complex logic outlined in the previous 219 * paragraph. 220 * c. Emulating the exception invocation (i.e. manually updating RIP, 221 * RFLAGS, and pushing the old values to stack). This is even more 222 * complicated than the previous option, since it involves checking 223 * CPL, gate attributes, and doing various adjustments depending 224 * on the current CPU mode, whether the CPL is changing, etc. 225 */ 226 typedef enum WhpxStepMode { 227 WHPX_STEP_NONE = 0, 228 /* Halt other VCPUs */ 229 WHPX_STEP_EXCLUSIVE, 230 } WhpxStepMode; 231 232 struct whpx_vcpu { 233 WHV_EMULATOR_HANDLE emulator; 234 bool window_registered; 235 bool interruptable; 236 bool ready_for_pic_interrupt; 237 uint64_t tpr; 238 uint64_t apic_base; 239 bool interruption_pending; 240 241 /* Must be the last field as it may have a tail */ 242 WHV_RUN_VP_EXIT_CONTEXT exit_ctx; 243 }; 244 245 static bool whpx_allowed; 246 static bool whp_dispatch_initialized; 247 static HMODULE hWinHvPlatform, hWinHvEmulation; 248 static uint32_t max_vcpu_index; 249 struct whpx_state whpx_global; 250 struct WHPDispatch whp_dispatch; 251 252 253 /* 254 * VP support 255 */ 256 257 static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu) 258 { 259 return (struct whpx_vcpu *)cpu->hax_vcpu; 260 } 261 262 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86, 263 int r86) 264 { 265 WHV_X64_SEGMENT_REGISTER hs; 266 unsigned flags = qs->flags; 267 268 hs.Base = qs->base; 269 hs.Limit = qs->limit; 270 hs.Selector = qs->selector; 271 272 if (v86) { 273 hs.Attributes = 0; 274 hs.SegmentType = 3; 275 hs.Present = 1; 276 hs.DescriptorPrivilegeLevel = 3; 277 hs.NonSystemSegment = 1; 278 279 } else { 280 hs.Attributes = (flags >> DESC_TYPE_SHIFT); 281 282 if (r86) { 283 /* hs.Base &= 0xfffff; */ 284 } 285 } 286 287 return hs; 288 } 289 290 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs) 291 { 292 SegmentCache qs; 293 294 qs.base = hs->Base; 295 qs.limit = hs->Limit; 296 qs.selector = hs->Selector; 297 298 qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT; 299 300 return qs; 301 } 302 303 static int whpx_set_tsc(CPUState *cpu) 304 { 305 CPUX86State *env = cpu->env_ptr; 306 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 307 WHV_REGISTER_VALUE tsc_val; 308 HRESULT hr; 309 struct whpx_state *whpx = &whpx_global; 310 311 /* 312 * Suspend the partition prior to setting the TSC to reduce the variance 313 * in TSC across vCPUs. When the first vCPU runs post suspend, the 314 * partition is automatically resumed. 315 */ 316 if (whp_dispatch.WHvSuspendPartitionTime) { 317 318 /* 319 * Unable to suspend partition while setting TSC is not a fatal 320 * error. It just increases the likelihood of TSC variance between 321 * vCPUs and some guest OS are able to handle that just fine. 322 */ 323 hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition); 324 if (FAILED(hr)) { 325 warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr); 326 } 327 } 328 329 tsc_val.Reg64 = env->tsc; 330 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 331 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 332 if (FAILED(hr)) { 333 error_report("WHPX: Failed to set TSC, hr=%08lx", hr); 334 return -1; 335 } 336 337 return 0; 338 } 339 340 /* 341 * The CR8 register in the CPU is mapped to the TPR register of the APIC, 342 * however, they use a slightly different encoding. Specifically: 343 * 344 * APIC.TPR[bits 7:4] = CR8[bits 3:0] 345 * 346 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64 347 * and IA-32 Architectures Software Developer's Manual. 348 */ 349 350 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr) 351 { 352 return tpr >> 4; 353 } 354 355 static void whpx_set_registers(CPUState *cpu, int level) 356 { 357 struct whpx_state *whpx = &whpx_global; 358 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 359 CPUX86State *env = cpu->env_ptr; 360 X86CPU *x86_cpu = X86_CPU(cpu); 361 struct whpx_register_set vcxt; 362 HRESULT hr; 363 int idx; 364 int idx_next; 365 int i; 366 int v86, r86; 367 368 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 369 370 /* 371 * Following MSRs have side effects on the guest or are too heavy for 372 * runtime. Limit them to full state update. 373 */ 374 if (level >= WHPX_SET_RESET_STATE) { 375 whpx_set_tsc(cpu); 376 } 377 378 memset(&vcxt, 0, sizeof(struct whpx_register_set)); 379 380 v86 = (env->eflags & VM_MASK); 381 r86 = !(env->cr[0] & CR0_PE_MASK); 382 383 vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); 384 vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state); 385 386 idx = 0; 387 388 /* Indexes for first 16 registers match between HV and QEMU definitions */ 389 idx_next = 16; 390 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 391 vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx]; 392 } 393 idx = idx_next; 394 395 /* Same goes for RIP and RFLAGS */ 396 assert(whpx_register_names[idx] == WHvX64RegisterRip); 397 vcxt.values[idx++].Reg64 = env->eip; 398 399 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 400 vcxt.values[idx++].Reg64 = env->eflags; 401 402 /* Translate 6+4 segment registers. HV and QEMU order matches */ 403 assert(idx == WHvX64RegisterEs); 404 for (i = 0; i < 6; i += 1, idx += 1) { 405 vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86); 406 } 407 408 assert(idx == WHvX64RegisterLdtr); 409 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0); 410 411 assert(idx == WHvX64RegisterTr); 412 vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0); 413 414 assert(idx == WHvX64RegisterIdtr); 415 vcxt.values[idx].Table.Base = env->idt.base; 416 vcxt.values[idx].Table.Limit = env->idt.limit; 417 idx += 1; 418 419 assert(idx == WHvX64RegisterGdtr); 420 vcxt.values[idx].Table.Base = env->gdt.base; 421 vcxt.values[idx].Table.Limit = env->gdt.limit; 422 idx += 1; 423 424 /* CR0, 2, 3, 4, 8 */ 425 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 426 vcxt.values[idx++].Reg64 = env->cr[0]; 427 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 428 vcxt.values[idx++].Reg64 = env->cr[2]; 429 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 430 vcxt.values[idx++].Reg64 = env->cr[3]; 431 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 432 vcxt.values[idx++].Reg64 = env->cr[4]; 433 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 434 vcxt.values[idx++].Reg64 = vcpu->tpr; 435 436 /* 8 Debug Registers - Skipped */ 437 438 /* 16 XMM registers */ 439 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 440 idx_next = idx + 16; 441 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 442 vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0); 443 vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1); 444 } 445 idx = idx_next; 446 447 /* 8 FP registers */ 448 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 449 for (i = 0; i < 8; i += 1, idx += 1) { 450 vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0); 451 /* vcxt.values[idx].Fp.AsUINT128.High64 = 452 env->fpregs[i].mmx.MMX_Q(1); 453 */ 454 } 455 456 /* FP control status register */ 457 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 458 vcxt.values[idx].FpControlStatus.FpControl = env->fpuc; 459 vcxt.values[idx].FpControlStatus.FpStatus = 460 (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; 461 vcxt.values[idx].FpControlStatus.FpTag = 0; 462 for (i = 0; i < 8; ++i) { 463 vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i; 464 } 465 vcxt.values[idx].FpControlStatus.Reserved = 0; 466 vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop; 467 vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip; 468 idx += 1; 469 470 /* XMM control status register */ 471 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 472 vcxt.values[idx].XmmControlStatus.LastFpRdp = 0; 473 vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr; 474 vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff; 475 idx += 1; 476 477 /* MSRs */ 478 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 479 vcxt.values[idx++].Reg64 = env->efer; 480 #ifdef TARGET_X86_64 481 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 482 vcxt.values[idx++].Reg64 = env->kernelgsbase; 483 #endif 484 485 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 486 vcxt.values[idx++].Reg64 = vcpu->apic_base; 487 488 /* WHvX64RegisterPat - Skipped */ 489 490 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 491 vcxt.values[idx++].Reg64 = env->sysenter_cs; 492 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 493 vcxt.values[idx++].Reg64 = env->sysenter_eip; 494 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 495 vcxt.values[idx++].Reg64 = env->sysenter_esp; 496 assert(whpx_register_names[idx] == WHvX64RegisterStar); 497 vcxt.values[idx++].Reg64 = env->star; 498 #ifdef TARGET_X86_64 499 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 500 vcxt.values[idx++].Reg64 = env->lstar; 501 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 502 vcxt.values[idx++].Reg64 = env->cstar; 503 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 504 vcxt.values[idx++].Reg64 = env->fmask; 505 #endif 506 507 /* Interrupt / Event Registers - Skipped */ 508 509 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 510 511 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 512 whpx->partition, cpu->cpu_index, 513 whpx_register_names, 514 RTL_NUMBER_OF(whpx_register_names), 515 &vcxt.values[0]); 516 517 if (FAILED(hr)) { 518 error_report("WHPX: Failed to set virtual processor context, hr=%08lx", 519 hr); 520 } 521 522 return; 523 } 524 525 static int whpx_get_tsc(CPUState *cpu) 526 { 527 CPUX86State *env = cpu->env_ptr; 528 WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; 529 WHV_REGISTER_VALUE tsc_val; 530 HRESULT hr; 531 struct whpx_state *whpx = &whpx_global; 532 533 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 534 whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); 535 if (FAILED(hr)) { 536 error_report("WHPX: Failed to get TSC, hr=%08lx", hr); 537 return -1; 538 } 539 540 env->tsc = tsc_val.Reg64; 541 return 0; 542 } 543 544 static void whpx_get_registers(CPUState *cpu) 545 { 546 struct whpx_state *whpx = &whpx_global; 547 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 548 CPUX86State *env = cpu->env_ptr; 549 X86CPU *x86_cpu = X86_CPU(cpu); 550 struct whpx_register_set vcxt; 551 uint64_t tpr, apic_base; 552 HRESULT hr; 553 int idx; 554 int idx_next; 555 int i; 556 557 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 558 559 if (!env->tsc_valid) { 560 whpx_get_tsc(cpu); 561 env->tsc_valid = !runstate_is_running(); 562 } 563 564 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 565 whpx->partition, cpu->cpu_index, 566 whpx_register_names, 567 RTL_NUMBER_OF(whpx_register_names), 568 &vcxt.values[0]); 569 if (FAILED(hr)) { 570 error_report("WHPX: Failed to get virtual processor context, hr=%08lx", 571 hr); 572 } 573 574 if (whpx_apic_in_platform()) { 575 /* 576 * Fetch the TPR value from the emulated APIC. It may get overwritten 577 * below with the value from CR8 returned by 578 * WHvGetVirtualProcessorRegisters(). 579 */ 580 whpx_apic_get(x86_cpu->apic_state); 581 vcpu->tpr = whpx_apic_tpr_to_cr8( 582 cpu_get_apic_tpr(x86_cpu->apic_state)); 583 } 584 585 idx = 0; 586 587 /* Indexes for first 16 registers match between HV and QEMU definitions */ 588 idx_next = 16; 589 for (idx = 0; idx < CPU_NB_REGS; idx += 1) { 590 env->regs[idx] = vcxt.values[idx].Reg64; 591 } 592 idx = idx_next; 593 594 /* Same goes for RIP and RFLAGS */ 595 assert(whpx_register_names[idx] == WHvX64RegisterRip); 596 env->eip = vcxt.values[idx++].Reg64; 597 assert(whpx_register_names[idx] == WHvX64RegisterRflags); 598 env->eflags = vcxt.values[idx++].Reg64; 599 600 /* Translate 6+4 segment registers. HV and QEMU order matches */ 601 assert(idx == WHvX64RegisterEs); 602 for (i = 0; i < 6; i += 1, idx += 1) { 603 env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment); 604 } 605 606 assert(idx == WHvX64RegisterLdtr); 607 env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment); 608 assert(idx == WHvX64RegisterTr); 609 env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment); 610 assert(idx == WHvX64RegisterIdtr); 611 env->idt.base = vcxt.values[idx].Table.Base; 612 env->idt.limit = vcxt.values[idx].Table.Limit; 613 idx += 1; 614 assert(idx == WHvX64RegisterGdtr); 615 env->gdt.base = vcxt.values[idx].Table.Base; 616 env->gdt.limit = vcxt.values[idx].Table.Limit; 617 idx += 1; 618 619 /* CR0, 2, 3, 4, 8 */ 620 assert(whpx_register_names[idx] == WHvX64RegisterCr0); 621 env->cr[0] = vcxt.values[idx++].Reg64; 622 assert(whpx_register_names[idx] == WHvX64RegisterCr2); 623 env->cr[2] = vcxt.values[idx++].Reg64; 624 assert(whpx_register_names[idx] == WHvX64RegisterCr3); 625 env->cr[3] = vcxt.values[idx++].Reg64; 626 assert(whpx_register_names[idx] == WHvX64RegisterCr4); 627 env->cr[4] = vcxt.values[idx++].Reg64; 628 assert(whpx_register_names[idx] == WHvX64RegisterCr8); 629 tpr = vcxt.values[idx++].Reg64; 630 if (tpr != vcpu->tpr) { 631 vcpu->tpr = tpr; 632 cpu_set_apic_tpr(x86_cpu->apic_state, tpr); 633 } 634 635 /* 8 Debug Registers - Skipped */ 636 637 /* 16 XMM registers */ 638 assert(whpx_register_names[idx] == WHvX64RegisterXmm0); 639 idx_next = idx + 16; 640 for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { 641 env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64; 642 env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64; 643 } 644 idx = idx_next; 645 646 /* 8 FP registers */ 647 assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); 648 for (i = 0; i < 8; i += 1, idx += 1) { 649 env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64; 650 /* env->fpregs[i].mmx.MMX_Q(1) = 651 vcxt.values[idx].Fp.AsUINT128.High64; 652 */ 653 } 654 655 /* FP control status register */ 656 assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); 657 env->fpuc = vcxt.values[idx].FpControlStatus.FpControl; 658 env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7; 659 env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800; 660 for (i = 0; i < 8; ++i) { 661 env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1); 662 } 663 env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp; 664 env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip; 665 idx += 1; 666 667 /* XMM control status register */ 668 assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); 669 env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl; 670 idx += 1; 671 672 /* MSRs */ 673 assert(whpx_register_names[idx] == WHvX64RegisterEfer); 674 env->efer = vcxt.values[idx++].Reg64; 675 #ifdef TARGET_X86_64 676 assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); 677 env->kernelgsbase = vcxt.values[idx++].Reg64; 678 #endif 679 680 assert(whpx_register_names[idx] == WHvX64RegisterApicBase); 681 apic_base = vcxt.values[idx++].Reg64; 682 if (apic_base != vcpu->apic_base) { 683 vcpu->apic_base = apic_base; 684 cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base); 685 } 686 687 /* WHvX64RegisterPat - Skipped */ 688 689 assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); 690 env->sysenter_cs = vcxt.values[idx++].Reg64; 691 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); 692 env->sysenter_eip = vcxt.values[idx++].Reg64; 693 assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); 694 env->sysenter_esp = vcxt.values[idx++].Reg64; 695 assert(whpx_register_names[idx] == WHvX64RegisterStar); 696 env->star = vcxt.values[idx++].Reg64; 697 #ifdef TARGET_X86_64 698 assert(whpx_register_names[idx] == WHvX64RegisterLstar); 699 env->lstar = vcxt.values[idx++].Reg64; 700 assert(whpx_register_names[idx] == WHvX64RegisterCstar); 701 env->cstar = vcxt.values[idx++].Reg64; 702 assert(whpx_register_names[idx] == WHvX64RegisterSfmask); 703 env->fmask = vcxt.values[idx++].Reg64; 704 #endif 705 706 /* Interrupt / Event Registers - Skipped */ 707 708 assert(idx == RTL_NUMBER_OF(whpx_register_names)); 709 710 if (whpx_apic_in_platform()) { 711 whpx_apic_get(x86_cpu->apic_state); 712 } 713 714 x86_update_hflags(env); 715 716 return; 717 } 718 719 static HRESULT CALLBACK whpx_emu_ioport_callback( 720 void *ctx, 721 WHV_EMULATOR_IO_ACCESS_INFO *IoAccess) 722 { 723 MemTxAttrs attrs = { 0 }; 724 address_space_rw(&address_space_io, IoAccess->Port, attrs, 725 &IoAccess->Data, IoAccess->AccessSize, 726 IoAccess->Direction); 727 return S_OK; 728 } 729 730 static HRESULT CALLBACK whpx_emu_mmio_callback( 731 void *ctx, 732 WHV_EMULATOR_MEMORY_ACCESS_INFO *ma) 733 { 734 cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize, 735 ma->Direction); 736 return S_OK; 737 } 738 739 static HRESULT CALLBACK whpx_emu_getreg_callback( 740 void *ctx, 741 const WHV_REGISTER_NAME *RegisterNames, 742 UINT32 RegisterCount, 743 WHV_REGISTER_VALUE *RegisterValues) 744 { 745 HRESULT hr; 746 struct whpx_state *whpx = &whpx_global; 747 CPUState *cpu = (CPUState *)ctx; 748 749 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 750 whpx->partition, cpu->cpu_index, 751 RegisterNames, RegisterCount, 752 RegisterValues); 753 if (FAILED(hr)) { 754 error_report("WHPX: Failed to get virtual processor registers," 755 " hr=%08lx", hr); 756 } 757 758 return hr; 759 } 760 761 static HRESULT CALLBACK whpx_emu_setreg_callback( 762 void *ctx, 763 const WHV_REGISTER_NAME *RegisterNames, 764 UINT32 RegisterCount, 765 const WHV_REGISTER_VALUE *RegisterValues) 766 { 767 HRESULT hr; 768 struct whpx_state *whpx = &whpx_global; 769 CPUState *cpu = (CPUState *)ctx; 770 771 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 772 whpx->partition, cpu->cpu_index, 773 RegisterNames, RegisterCount, 774 RegisterValues); 775 if (FAILED(hr)) { 776 error_report("WHPX: Failed to set virtual processor registers," 777 " hr=%08lx", hr); 778 } 779 780 /* 781 * The emulator just successfully wrote the register state. We clear the 782 * dirty state so we avoid the double write on resume of the VP. 783 */ 784 cpu->vcpu_dirty = false; 785 786 return hr; 787 } 788 789 static HRESULT CALLBACK whpx_emu_translate_callback( 790 void *ctx, 791 WHV_GUEST_VIRTUAL_ADDRESS Gva, 792 WHV_TRANSLATE_GVA_FLAGS TranslateFlags, 793 WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult, 794 WHV_GUEST_PHYSICAL_ADDRESS *Gpa) 795 { 796 HRESULT hr; 797 struct whpx_state *whpx = &whpx_global; 798 CPUState *cpu = (CPUState *)ctx; 799 WHV_TRANSLATE_GVA_RESULT res; 800 801 hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index, 802 Gva, TranslateFlags, &res, Gpa); 803 if (FAILED(hr)) { 804 error_report("WHPX: Failed to translate GVA, hr=%08lx", hr); 805 } else { 806 *TranslationResult = res.ResultCode; 807 } 808 809 return hr; 810 } 811 812 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = { 813 .Size = sizeof(WHV_EMULATOR_CALLBACKS), 814 .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback, 815 .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback, 816 .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback, 817 .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback, 818 .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback, 819 }; 820 821 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx) 822 { 823 HRESULT hr; 824 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 825 WHV_EMULATOR_STATUS emu_status; 826 827 hr = whp_dispatch.WHvEmulatorTryMmioEmulation( 828 vcpu->emulator, cpu, 829 &vcpu->exit_ctx.VpContext, ctx, 830 &emu_status); 831 if (FAILED(hr)) { 832 error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr); 833 return -1; 834 } 835 836 if (!emu_status.EmulationSuccessful) { 837 error_report("WHPX: Failed to emulate MMIO access with" 838 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 839 return -1; 840 } 841 842 return 0; 843 } 844 845 static int whpx_handle_portio(CPUState *cpu, 846 WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx) 847 { 848 HRESULT hr; 849 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 850 WHV_EMULATOR_STATUS emu_status; 851 852 hr = whp_dispatch.WHvEmulatorTryIoEmulation( 853 vcpu->emulator, cpu, 854 &vcpu->exit_ctx.VpContext, ctx, 855 &emu_status); 856 if (FAILED(hr)) { 857 error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr); 858 return -1; 859 } 860 861 if (!emu_status.EmulationSuccessful) { 862 error_report("WHPX: Failed to emulate PortIO access with" 863 " EmulatorReturnStatus: %u", emu_status.AsUINT32); 864 return -1; 865 } 866 867 return 0; 868 } 869 870 /* 871 * Controls whether we should intercept various exceptions on the guest, 872 * namely breakpoint/single-step events. 873 * 874 * The 'exceptions' argument accepts a bitmask, e.g: 875 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...) 876 */ 877 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions) 878 { 879 struct whpx_state *whpx = &whpx_global; 880 WHV_PARTITION_PROPERTY prop = { 0, }; 881 HRESULT hr; 882 883 if (exceptions == whpx->exception_exit_bitmap) { 884 return S_OK; 885 } 886 887 prop.ExceptionExitBitmap = exceptions; 888 889 hr = whp_dispatch.WHvSetPartitionProperty( 890 whpx->partition, 891 WHvPartitionPropertyCodeExceptionExitBitmap, 892 &prop, 893 sizeof(WHV_PARTITION_PROPERTY)); 894 895 if (SUCCEEDED(hr)) { 896 whpx->exception_exit_bitmap = exceptions; 897 } 898 899 return hr; 900 } 901 902 903 /* 904 * This function is called before/after stepping over a single instruction. 905 * It will update the CPU registers to arm/disarm the instruction stepping 906 * accordingly. 907 */ 908 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu, 909 bool set, 910 uint64_t *exit_context_rflags) 911 { 912 WHV_REGISTER_NAME reg_name; 913 WHV_REGISTER_VALUE reg_value; 914 HRESULT hr; 915 struct whpx_state *whpx = &whpx_global; 916 917 /* 918 * If we are trying to step over a single instruction, we need to set the 919 * TF bit in rflags. Otherwise, clear it. 920 */ 921 reg_name = WHvX64RegisterRflags; 922 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 923 whpx->partition, 924 cpu->cpu_index, 925 ®_name, 926 1, 927 ®_value); 928 929 if (FAILED(hr)) { 930 error_report("WHPX: Failed to get rflags, hr=%08lx", hr); 931 return hr; 932 } 933 934 if (exit_context_rflags) { 935 assert(*exit_context_rflags == reg_value.Reg64); 936 } 937 938 if (set) { 939 /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */ 940 reg_value.Reg64 |= TF_MASK; 941 } else { 942 reg_value.Reg64 &= ~TF_MASK; 943 } 944 945 if (exit_context_rflags) { 946 *exit_context_rflags = reg_value.Reg64; 947 } 948 949 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 950 whpx->partition, 951 cpu->cpu_index, 952 ®_name, 953 1, 954 ®_value); 955 956 if (FAILED(hr)) { 957 error_report("WHPX: Failed to set rflags," 958 " hr=%08lx", 959 hr); 960 return hr; 961 } 962 963 reg_name = WHvRegisterInterruptState; 964 reg_value.Reg64 = 0; 965 966 /* Suspend delivery of hardware interrupts during single-stepping. */ 967 reg_value.InterruptState.InterruptShadow = set != 0; 968 969 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 970 whpx->partition, 971 cpu->cpu_index, 972 ®_name, 973 1, 974 ®_value); 975 976 if (FAILED(hr)) { 977 error_report("WHPX: Failed to set InterruptState," 978 " hr=%08lx", 979 hr); 980 return hr; 981 } 982 983 if (!set) { 984 /* 985 * We have just finished stepping over a single instruction, 986 * and intercepted the INT1 generated by it. 987 * We need to now hide the INT1 from the guest, 988 * as it would not be expecting it. 989 */ 990 991 reg_name = WHvX64RegisterPendingDebugException; 992 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 993 whpx->partition, 994 cpu->cpu_index, 995 ®_name, 996 1, 997 ®_value); 998 999 if (FAILED(hr)) { 1000 error_report("WHPX: Failed to get pending debug exceptions," 1001 "hr=%08lx", hr); 1002 return hr; 1003 } 1004 1005 if (reg_value.PendingDebugException.SingleStep) { 1006 reg_value.PendingDebugException.SingleStep = 0; 1007 1008 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1009 whpx->partition, 1010 cpu->cpu_index, 1011 ®_name, 1012 1, 1013 ®_value); 1014 1015 if (FAILED(hr)) { 1016 error_report("WHPX: Failed to clear pending debug exceptions," 1017 "hr=%08lx", hr); 1018 return hr; 1019 } 1020 } 1021 1022 } 1023 1024 return S_OK; 1025 } 1026 1027 /* Tries to find a breakpoint at the specified address. */ 1028 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address) 1029 { 1030 struct whpx_state *whpx = &whpx_global; 1031 int i; 1032 1033 if (whpx->breakpoints.breakpoints) { 1034 for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) { 1035 if (address == whpx->breakpoints.breakpoints->data[i].address) { 1036 return &whpx->breakpoints.breakpoints->data[i]; 1037 } 1038 } 1039 } 1040 1041 return NULL; 1042 } 1043 1044 /* 1045 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for 1046 * debugging user-mode applications. Since the WHPX API does not offer 1047 * an easy way to pass the intercepted exception back to the guest, we 1048 * resort to using INT1 instead, and let the guest always handle INT3. 1049 */ 1050 static const uint8_t whpx_breakpoint_instruction = 0xF1; 1051 1052 /* 1053 * The WHPX QEMU backend implements breakpoints by writing the INT1 1054 * instruction into memory (ignoring the DRx registers). This raises a few 1055 * issues that need to be carefully handled: 1056 * 1057 * 1. Although unlikely, other parts of QEMU may set multiple breakpoints 1058 * at the same location, and later remove them in arbitrary order. 1059 * This should not cause memory corruption, and should only remove the 1060 * physical breakpoint instruction when the last QEMU breakpoint is gone. 1061 * 1062 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid 1063 * physical location. Hence, physically adding/removing a breakpoint can 1064 * theoretically fail at any time. We need to keep track of it. 1065 * 1066 * The function below rebuilds a list of low-level breakpoints (one per 1067 * address, tracking the original instruction and any errors) from the list of 1068 * high-level breakpoints (set via cpu_breakpoint_insert()). 1069 * 1070 * In order to optimize performance, this function stores the list of 1071 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the 1072 * low-level ones, so that it won't be re-invoked until these breakpoints 1073 * change. 1074 * 1075 * Note that this function decides which breakpoints should be inserted into, 1076 * memory, but doesn't actually do it. The memory accessing is done in 1077 * whpx_apply_breakpoints(). 1078 */ 1079 static void whpx_translate_cpu_breakpoints( 1080 struct whpx_breakpoints *breakpoints, 1081 CPUState *cpu, 1082 int cpu_breakpoint_count) 1083 { 1084 CPUBreakpoint *bp; 1085 int cpu_bp_index = 0; 1086 1087 breakpoints->original_addresses = 1088 g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count); 1089 1090 breakpoints->original_address_count = cpu_breakpoint_count; 1091 1092 int max_breakpoints = cpu_breakpoint_count + 1093 (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0); 1094 1095 struct whpx_breakpoint_collection *new_breakpoints = 1096 (struct whpx_breakpoint_collection *)g_malloc0( 1097 sizeof(struct whpx_breakpoint_collection) + 1098 max_breakpoints * sizeof(struct whpx_breakpoint)); 1099 1100 new_breakpoints->allocated = max_breakpoints; 1101 new_breakpoints->used = 0; 1102 1103 /* 1104 * 1. Preserve all old breakpoints that could not be automatically 1105 * cleared when the CPU got stopped. 1106 */ 1107 if (breakpoints->breakpoints) { 1108 int i; 1109 for (i = 0; i < breakpoints->breakpoints->used; i++) { 1110 if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) { 1111 new_breakpoints->data[new_breakpoints->used++] = 1112 breakpoints->breakpoints->data[i]; 1113 } 1114 } 1115 } 1116 1117 /* 2. Map all CPU breakpoints to WHPX breakpoints */ 1118 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1119 int i; 1120 bool found = false; 1121 1122 /* This will be used to detect changed CPU breakpoints later. */ 1123 breakpoints->original_addresses[cpu_bp_index++] = bp->pc; 1124 1125 for (i = 0; i < new_breakpoints->used; i++) { 1126 /* 1127 * WARNING: This loop has O(N^2) complexity, where N is the 1128 * number of breakpoints. It should not be a bottleneck in 1129 * real-world scenarios, since it only needs to run once after 1130 * the breakpoints have been modified. 1131 * If this ever becomes a concern, it can be optimized by storing 1132 * high-level breakpoint objects in a tree or hash map. 1133 */ 1134 1135 if (new_breakpoints->data[i].address == bp->pc) { 1136 /* There was already a breakpoint at this address. */ 1137 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) { 1138 new_breakpoints->data[i].state = WHPX_BP_SET; 1139 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) { 1140 new_breakpoints->data[i].state = WHPX_BP_SET_PENDING; 1141 } 1142 1143 found = true; 1144 break; 1145 } 1146 } 1147 1148 if (!found && new_breakpoints->used < new_breakpoints->allocated) { 1149 /* No WHPX breakpoint at this address. Create one. */ 1150 new_breakpoints->data[new_breakpoints->used].address = bp->pc; 1151 new_breakpoints->data[new_breakpoints->used].state = 1152 WHPX_BP_SET_PENDING; 1153 new_breakpoints->used++; 1154 } 1155 } 1156 1157 if (breakpoints->breakpoints) { 1158 /* 1159 * Free the previous breakpoint list. This can be optimized by keeping 1160 * it as shadow buffer for the next computation instead of freeing 1161 * it immediately. 1162 */ 1163 g_free(breakpoints->breakpoints); 1164 } 1165 1166 breakpoints->breakpoints = new_breakpoints; 1167 } 1168 1169 /* 1170 * Physically inserts/removes the breakpoints by reading and writing the 1171 * physical memory, keeping a track of the failed attempts. 1172 * 1173 * Passing resuming=true will try to set all previously unset breakpoints. 1174 * Passing resuming=false will remove all inserted ones. 1175 */ 1176 static void whpx_apply_breakpoints( 1177 struct whpx_breakpoint_collection *breakpoints, 1178 CPUState *cpu, 1179 bool resuming) 1180 { 1181 int i, rc; 1182 if (!breakpoints) { 1183 return; 1184 } 1185 1186 for (i = 0; i < breakpoints->used; i++) { 1187 /* Decide what to do right now based on the last known state. */ 1188 WhpxBreakpointState state = breakpoints->data[i].state; 1189 switch (state) { 1190 case WHPX_BP_CLEARED: 1191 if (resuming) { 1192 state = WHPX_BP_SET_PENDING; 1193 } 1194 break; 1195 case WHPX_BP_SET_PENDING: 1196 if (!resuming) { 1197 state = WHPX_BP_CLEARED; 1198 } 1199 break; 1200 case WHPX_BP_SET: 1201 if (!resuming) { 1202 state = WHPX_BP_CLEAR_PENDING; 1203 } 1204 break; 1205 case WHPX_BP_CLEAR_PENDING: 1206 if (resuming) { 1207 state = WHPX_BP_SET; 1208 } 1209 break; 1210 } 1211 1212 if (state == WHPX_BP_SET_PENDING) { 1213 /* Remember the original instruction. */ 1214 rc = cpu_memory_rw_debug(cpu, 1215 breakpoints->data[i].address, 1216 &breakpoints->data[i].original_instruction, 1217 1, 1218 false); 1219 1220 if (!rc) { 1221 /* Write the breakpoint instruction. */ 1222 rc = cpu_memory_rw_debug(cpu, 1223 breakpoints->data[i].address, 1224 (void *)&whpx_breakpoint_instruction, 1225 1, 1226 true); 1227 } 1228 1229 if (!rc) { 1230 state = WHPX_BP_SET; 1231 } 1232 1233 } 1234 1235 if (state == WHPX_BP_CLEAR_PENDING) { 1236 /* Restore the original instruction. */ 1237 rc = cpu_memory_rw_debug(cpu, 1238 breakpoints->data[i].address, 1239 &breakpoints->data[i].original_instruction, 1240 1, 1241 true); 1242 1243 if (!rc) { 1244 state = WHPX_BP_CLEARED; 1245 } 1246 } 1247 1248 breakpoints->data[i].state = state; 1249 } 1250 } 1251 1252 /* 1253 * This function is called when the a VCPU is about to start and no other 1254 * VCPUs have been started so far. Since the VCPU start order could be 1255 * arbitrary, it doesn't have to be VCPU#0. 1256 * 1257 * It is used to commit the breakpoints into memory, and configure WHPX 1258 * to intercept debug exceptions. 1259 * 1260 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or 1261 * more VCPUs are already running, so this is the best place to do it. 1262 */ 1263 static int whpx_first_vcpu_starting(CPUState *cpu) 1264 { 1265 struct whpx_state *whpx = &whpx_global; 1266 HRESULT hr; 1267 1268 g_assert(qemu_mutex_iothread_locked()); 1269 1270 if (!QTAILQ_EMPTY(&cpu->breakpoints) || 1271 (whpx->breakpoints.breakpoints && 1272 whpx->breakpoints.breakpoints->used)) { 1273 CPUBreakpoint *bp; 1274 int i = 0; 1275 bool update_pending = false; 1276 1277 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1278 if (i >= whpx->breakpoints.original_address_count || 1279 bp->pc != whpx->breakpoints.original_addresses[i]) { 1280 update_pending = true; 1281 } 1282 1283 i++; 1284 } 1285 1286 if (i != whpx->breakpoints.original_address_count) { 1287 update_pending = true; 1288 } 1289 1290 if (update_pending) { 1291 /* 1292 * The CPU breakpoints have changed since the last call to 1293 * whpx_translate_cpu_breakpoints(). WHPX breakpoints must 1294 * now be recomputed. 1295 */ 1296 whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i); 1297 } 1298 1299 /* Actually insert the breakpoints into the memory. */ 1300 whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true); 1301 } 1302 1303 uint64_t exception_mask; 1304 if (whpx->step_pending || 1305 (whpx->breakpoints.breakpoints && 1306 whpx->breakpoints.breakpoints->used)) { 1307 /* 1308 * We are either attempting to single-step one or more CPUs, or 1309 * have one or more breakpoints enabled. Both require intercepting 1310 * the WHvX64ExceptionTypeBreakpointTrap exception. 1311 */ 1312 1313 exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault; 1314 } else { 1315 /* Let the guest handle all exceptions. */ 1316 exception_mask = 0; 1317 } 1318 1319 hr = whpx_set_exception_exit_bitmap(exception_mask); 1320 if (!SUCCEEDED(hr)) { 1321 error_report("WHPX: Failed to update exception exit mask," 1322 "hr=%08lx.", hr); 1323 return 1; 1324 } 1325 1326 return 0; 1327 } 1328 1329 /* 1330 * This function is called when the last VCPU has finished running. 1331 * It is used to remove any previously set breakpoints from memory. 1332 */ 1333 static int whpx_last_vcpu_stopping(CPUState *cpu) 1334 { 1335 whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false); 1336 return 0; 1337 } 1338 1339 /* Returns the address of the next instruction that is about to be executed. */ 1340 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid) 1341 { 1342 if (cpu->vcpu_dirty) { 1343 /* The CPU registers have been modified by other parts of QEMU. */ 1344 CPUArchState *env = (CPUArchState *)(cpu->env_ptr); 1345 return env->eip; 1346 } else if (exit_context_valid) { 1347 /* 1348 * The CPU registers have not been modified by neither other parts 1349 * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters(). 1350 * This is the most common case. 1351 */ 1352 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1353 return vcpu->exit_ctx.VpContext.Rip; 1354 } else { 1355 /* 1356 * The CPU registers have been modified by a call to 1357 * WHvSetVirtualProcessorRegisters() and must be re-queried from 1358 * the target. 1359 */ 1360 WHV_REGISTER_VALUE reg_value; 1361 WHV_REGISTER_NAME reg_name = WHvX64RegisterRip; 1362 HRESULT hr; 1363 struct whpx_state *whpx = &whpx_global; 1364 1365 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1366 whpx->partition, 1367 cpu->cpu_index, 1368 ®_name, 1369 1, 1370 ®_value); 1371 1372 if (FAILED(hr)) { 1373 error_report("WHPX: Failed to get PC, hr=%08lx", hr); 1374 return 0; 1375 } 1376 1377 return reg_value.Reg64; 1378 } 1379 } 1380 1381 static int whpx_handle_halt(CPUState *cpu) 1382 { 1383 CPUX86State *env = cpu->env_ptr; 1384 int ret = 0; 1385 1386 qemu_mutex_lock_iothread(); 1387 if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1388 (env->eflags & IF_MASK)) && 1389 !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1390 cpu->exception_index = EXCP_HLT; 1391 cpu->halted = true; 1392 ret = 1; 1393 } 1394 qemu_mutex_unlock_iothread(); 1395 1396 return ret; 1397 } 1398 1399 static void whpx_vcpu_pre_run(CPUState *cpu) 1400 { 1401 HRESULT hr; 1402 struct whpx_state *whpx = &whpx_global; 1403 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1404 CPUX86State *env = cpu->env_ptr; 1405 X86CPU *x86_cpu = X86_CPU(cpu); 1406 int irq; 1407 uint8_t tpr; 1408 WHV_X64_PENDING_INTERRUPTION_REGISTER new_int; 1409 UINT32 reg_count = 0; 1410 WHV_REGISTER_VALUE reg_values[3]; 1411 WHV_REGISTER_NAME reg_names[3]; 1412 1413 memset(&new_int, 0, sizeof(new_int)); 1414 memset(reg_values, 0, sizeof(reg_values)); 1415 1416 qemu_mutex_lock_iothread(); 1417 1418 /* Inject NMI */ 1419 if (!vcpu->interruption_pending && 1420 cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { 1421 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { 1422 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 1423 vcpu->interruptable = false; 1424 new_int.InterruptionType = WHvX64PendingNmi; 1425 new_int.InterruptionPending = 1; 1426 new_int.InterruptionVector = 2; 1427 } 1428 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 1429 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 1430 } 1431 } 1432 1433 /* 1434 * Force the VCPU out of its inner loop to process any INIT requests or 1435 * commit pending TPR access. 1436 */ 1437 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { 1438 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1439 !(env->hflags & HF_SMM_MASK)) { 1440 cpu->exit_request = 1; 1441 } 1442 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1443 cpu->exit_request = 1; 1444 } 1445 } 1446 1447 /* Get pending hard interruption or replay one that was overwritten */ 1448 if (!whpx_apic_in_platform()) { 1449 if (!vcpu->interruption_pending && 1450 vcpu->interruptable && (env->eflags & IF_MASK)) { 1451 assert(!new_int.InterruptionPending); 1452 if (cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1453 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1454 irq = cpu_get_pic_interrupt(env); 1455 if (irq >= 0) { 1456 new_int.InterruptionType = WHvX64PendingInterrupt; 1457 new_int.InterruptionPending = 1; 1458 new_int.InterruptionVector = irq; 1459 } 1460 } 1461 } 1462 1463 /* Setup interrupt state if new one was prepared */ 1464 if (new_int.InterruptionPending) { 1465 reg_values[reg_count].PendingInterruption = new_int; 1466 reg_names[reg_count] = WHvRegisterPendingInterruption; 1467 reg_count += 1; 1468 } 1469 } else if (vcpu->ready_for_pic_interrupt && 1470 (cpu->interrupt_request & CPU_INTERRUPT_HARD)) { 1471 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 1472 irq = cpu_get_pic_interrupt(env); 1473 if (irq >= 0) { 1474 reg_names[reg_count] = WHvRegisterPendingEvent; 1475 reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT) 1476 { 1477 .EventPending = 1, 1478 .EventType = WHvX64PendingEventExtInt, 1479 .Vector = irq, 1480 }; 1481 reg_count += 1; 1482 } 1483 } 1484 1485 /* Sync the TPR to the CR8 if was modified during the intercept */ 1486 tpr = cpu_get_apic_tpr(x86_cpu->apic_state); 1487 if (tpr != vcpu->tpr) { 1488 vcpu->tpr = tpr; 1489 reg_values[reg_count].Reg64 = tpr; 1490 cpu->exit_request = 1; 1491 reg_names[reg_count] = WHvX64RegisterCr8; 1492 reg_count += 1; 1493 } 1494 1495 /* Update the state of the interrupt delivery notification */ 1496 if (!vcpu->window_registered && 1497 cpu->interrupt_request & CPU_INTERRUPT_HARD) { 1498 reg_values[reg_count].DeliverabilityNotifications = 1499 (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) { 1500 .InterruptNotification = 1 1501 }; 1502 vcpu->window_registered = 1; 1503 reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications; 1504 reg_count += 1; 1505 } 1506 1507 qemu_mutex_unlock_iothread(); 1508 vcpu->ready_for_pic_interrupt = false; 1509 1510 if (reg_count) { 1511 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1512 whpx->partition, cpu->cpu_index, 1513 reg_names, reg_count, reg_values); 1514 if (FAILED(hr)) { 1515 error_report("WHPX: Failed to set interrupt state registers," 1516 " hr=%08lx", hr); 1517 } 1518 } 1519 1520 return; 1521 } 1522 1523 static void whpx_vcpu_post_run(CPUState *cpu) 1524 { 1525 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1526 CPUX86State *env = cpu->env_ptr; 1527 X86CPU *x86_cpu = X86_CPU(cpu); 1528 1529 env->eflags = vcpu->exit_ctx.VpContext.Rflags; 1530 1531 uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8; 1532 if (vcpu->tpr != tpr) { 1533 vcpu->tpr = tpr; 1534 qemu_mutex_lock_iothread(); 1535 cpu_set_apic_tpr(x86_cpu->apic_state, vcpu->tpr); 1536 qemu_mutex_unlock_iothread(); 1537 } 1538 1539 vcpu->interruption_pending = 1540 vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending; 1541 1542 vcpu->interruptable = 1543 !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow; 1544 1545 return; 1546 } 1547 1548 static void whpx_vcpu_process_async_events(CPUState *cpu) 1549 { 1550 CPUX86State *env = cpu->env_ptr; 1551 X86CPU *x86_cpu = X86_CPU(cpu); 1552 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1553 1554 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 1555 !(env->hflags & HF_SMM_MASK)) { 1556 whpx_cpu_synchronize_state(cpu); 1557 do_cpu_init(x86_cpu); 1558 vcpu->interruptable = true; 1559 } 1560 1561 if (cpu->interrupt_request & CPU_INTERRUPT_POLL) { 1562 cpu->interrupt_request &= ~CPU_INTERRUPT_POLL; 1563 apic_poll_irq(x86_cpu->apic_state); 1564 } 1565 1566 if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1567 (env->eflags & IF_MASK)) || 1568 (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1569 cpu->halted = false; 1570 } 1571 1572 if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) { 1573 whpx_cpu_synchronize_state(cpu); 1574 do_cpu_sipi(x86_cpu); 1575 } 1576 1577 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 1578 cpu->interrupt_request &= ~CPU_INTERRUPT_TPR; 1579 whpx_cpu_synchronize_state(cpu); 1580 apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip, 1581 env->tpr_access_type); 1582 } 1583 1584 return; 1585 } 1586 1587 static int whpx_vcpu_run(CPUState *cpu) 1588 { 1589 HRESULT hr; 1590 struct whpx_state *whpx = &whpx_global; 1591 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 1592 struct whpx_breakpoint *stepped_over_bp = NULL; 1593 WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE; 1594 int ret; 1595 1596 g_assert(qemu_mutex_iothread_locked()); 1597 1598 if (whpx->running_cpus++ == 0) { 1599 /* Insert breakpoints into memory, update exception exit bitmap. */ 1600 ret = whpx_first_vcpu_starting(cpu); 1601 if (ret != 0) { 1602 return ret; 1603 } 1604 } 1605 1606 if (whpx->breakpoints.breakpoints && 1607 whpx->breakpoints.breakpoints->used > 0) 1608 { 1609 uint64_t pc = whpx_vcpu_get_pc(cpu, true); 1610 stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc); 1611 if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) { 1612 stepped_over_bp = NULL; 1613 } 1614 1615 if (stepped_over_bp) { 1616 /* 1617 * We are trying to run the instruction overwritten by an active 1618 * breakpoint. We will temporarily disable the breakpoint, suspend 1619 * other CPUs, and step over the instruction. 1620 */ 1621 exclusive_step_mode = WHPX_STEP_EXCLUSIVE; 1622 } 1623 } 1624 1625 if (exclusive_step_mode == WHPX_STEP_NONE) { 1626 whpx_vcpu_process_async_events(cpu); 1627 if (cpu->halted && !whpx_apic_in_platform()) { 1628 cpu->exception_index = EXCP_HLT; 1629 qatomic_set(&cpu->exit_request, false); 1630 return 0; 1631 } 1632 } 1633 1634 qemu_mutex_unlock_iothread(); 1635 1636 if (exclusive_step_mode != WHPX_STEP_NONE) { 1637 start_exclusive(); 1638 g_assert(cpu == current_cpu); 1639 g_assert(!cpu->running); 1640 cpu->running = true; 1641 1642 hr = whpx_set_exception_exit_bitmap( 1643 1UL << WHvX64ExceptionTypeDebugTrapOrFault); 1644 if (!SUCCEEDED(hr)) { 1645 error_report("WHPX: Failed to update exception exit mask, " 1646 "hr=%08lx.", hr); 1647 return 1; 1648 } 1649 1650 if (stepped_over_bp) { 1651 /* Temporarily disable the triggered breakpoint. */ 1652 cpu_memory_rw_debug(cpu, 1653 stepped_over_bp->address, 1654 &stepped_over_bp->original_instruction, 1655 1, 1656 true); 1657 } 1658 } else { 1659 cpu_exec_start(cpu); 1660 } 1661 1662 do { 1663 if (cpu->vcpu_dirty) { 1664 whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE); 1665 cpu->vcpu_dirty = false; 1666 } 1667 1668 if (exclusive_step_mode == WHPX_STEP_NONE) { 1669 whpx_vcpu_pre_run(cpu); 1670 1671 if (qatomic_read(&cpu->exit_request)) { 1672 whpx_vcpu_kick(cpu); 1673 } 1674 } 1675 1676 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1677 whpx_vcpu_configure_single_stepping(cpu, true, NULL); 1678 } 1679 1680 hr = whp_dispatch.WHvRunVirtualProcessor( 1681 whpx->partition, cpu->cpu_index, 1682 &vcpu->exit_ctx, sizeof(vcpu->exit_ctx)); 1683 1684 if (FAILED(hr)) { 1685 error_report("WHPX: Failed to exec a virtual processor," 1686 " hr=%08lx", hr); 1687 ret = -1; 1688 break; 1689 } 1690 1691 if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { 1692 whpx_vcpu_configure_single_stepping(cpu, 1693 false, 1694 &vcpu->exit_ctx.VpContext.Rflags); 1695 } 1696 1697 whpx_vcpu_post_run(cpu); 1698 1699 switch (vcpu->exit_ctx.ExitReason) { 1700 case WHvRunVpExitReasonMemoryAccess: 1701 ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess); 1702 break; 1703 1704 case WHvRunVpExitReasonX64IoPortAccess: 1705 ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess); 1706 break; 1707 1708 case WHvRunVpExitReasonX64InterruptWindow: 1709 vcpu->ready_for_pic_interrupt = 1; 1710 vcpu->window_registered = 0; 1711 ret = 0; 1712 break; 1713 1714 case WHvRunVpExitReasonX64ApicEoi: 1715 assert(whpx_apic_in_platform()); 1716 ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector); 1717 break; 1718 1719 case WHvRunVpExitReasonX64Halt: 1720 /* 1721 * WARNING: as of build 19043.1526 (21H1), this exit reason is no 1722 * longer used. 1723 */ 1724 ret = whpx_handle_halt(cpu); 1725 break; 1726 1727 case WHvRunVpExitReasonX64ApicInitSipiTrap: { 1728 WHV_INTERRUPT_CONTROL ipi = {0}; 1729 uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr; 1730 uint32_t delivery_mode = 1731 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT; 1732 int dest_shorthand = 1733 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT; 1734 bool broadcast = false; 1735 bool include_self = false; 1736 uint32_t i; 1737 1738 /* We only registered for INIT and SIPI exits. */ 1739 if ((delivery_mode != APIC_DM_INIT) && 1740 (delivery_mode != APIC_DM_SIPI)) { 1741 error_report( 1742 "WHPX: Unexpected APIC exit that is not a INIT or SIPI"); 1743 break; 1744 } 1745 1746 if (delivery_mode == APIC_DM_INIT) { 1747 ipi.Type = WHvX64InterruptTypeInit; 1748 } else { 1749 ipi.Type = WHvX64InterruptTypeSipi; 1750 } 1751 1752 ipi.DestinationMode = 1753 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ? 1754 WHvX64InterruptDestinationModeLogical : 1755 WHvX64InterruptDestinationModePhysical; 1756 1757 ipi.TriggerMode = 1758 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ? 1759 WHvX64InterruptTriggerModeLevel : 1760 WHvX64InterruptTriggerModeEdge; 1761 1762 ipi.Vector = icr & APIC_VECTOR_MASK; 1763 switch (dest_shorthand) { 1764 /* no shorthand. Bits 56-63 contain the destination. */ 1765 case 0: 1766 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK; 1767 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1768 &ipi, sizeof(ipi)); 1769 if (FAILED(hr)) { 1770 error_report("WHPX: Failed to request interrupt hr=%08lx", 1771 hr); 1772 } 1773 1774 break; 1775 1776 /* self */ 1777 case 1: 1778 include_self = true; 1779 break; 1780 1781 /* broadcast, including self */ 1782 case 2: 1783 broadcast = true; 1784 include_self = true; 1785 break; 1786 1787 /* broadcast, excluding self */ 1788 case 3: 1789 broadcast = true; 1790 break; 1791 } 1792 1793 if (!broadcast && !include_self) { 1794 break; 1795 } 1796 1797 for (i = 0; i <= max_vcpu_index; i++) { 1798 if (i == cpu->cpu_index && !include_self) { 1799 continue; 1800 } 1801 1802 /* 1803 * Assuming that APIC Ids are identity mapped since 1804 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers 1805 * are not handled yet and the hypervisor doesn't allow the 1806 * guest to modify the APIC ID. 1807 */ 1808 ipi.Destination = i; 1809 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, 1810 &ipi, sizeof(ipi)); 1811 if (FAILED(hr)) { 1812 error_report( 1813 "WHPX: Failed to request SIPI for %d, hr=%08lx", 1814 i, hr); 1815 } 1816 } 1817 1818 break; 1819 } 1820 1821 case WHvRunVpExitReasonCanceled: 1822 if (exclusive_step_mode != WHPX_STEP_NONE) { 1823 /* 1824 * We are trying to step over a single instruction, and 1825 * likely got a request to stop from another thread. 1826 * Delay it until we are done stepping 1827 * over. 1828 */ 1829 ret = 0; 1830 } else { 1831 cpu->exception_index = EXCP_INTERRUPT; 1832 ret = 1; 1833 } 1834 break; 1835 case WHvRunVpExitReasonX64MsrAccess: { 1836 WHV_REGISTER_VALUE reg_values[3] = {0}; 1837 WHV_REGISTER_NAME reg_names[3]; 1838 UINT32 reg_count; 1839 1840 reg_names[0] = WHvX64RegisterRip; 1841 reg_names[1] = WHvX64RegisterRax; 1842 reg_names[2] = WHvX64RegisterRdx; 1843 1844 reg_values[0].Reg64 = 1845 vcpu->exit_ctx.VpContext.Rip + 1846 vcpu->exit_ctx.VpContext.InstructionLength; 1847 1848 /* 1849 * For all unsupported MSR access we: 1850 * ignore writes 1851 * return 0 on read. 1852 */ 1853 reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ? 1854 1 : 3; 1855 1856 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1857 whpx->partition, 1858 cpu->cpu_index, 1859 reg_names, reg_count, 1860 reg_values); 1861 1862 if (FAILED(hr)) { 1863 error_report("WHPX: Failed to set MsrAccess state " 1864 " registers, hr=%08lx", hr); 1865 } 1866 ret = 0; 1867 break; 1868 } 1869 case WHvRunVpExitReasonX64Cpuid: { 1870 WHV_REGISTER_VALUE reg_values[5]; 1871 WHV_REGISTER_NAME reg_names[5]; 1872 UINT32 reg_count = 5; 1873 UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0; 1874 X86CPU *x86_cpu = X86_CPU(cpu); 1875 CPUX86State *env = &x86_cpu->env; 1876 1877 memset(reg_values, 0, sizeof(reg_values)); 1878 1879 rip = vcpu->exit_ctx.VpContext.Rip + 1880 vcpu->exit_ctx.VpContext.InstructionLength; 1881 cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax; 1882 1883 /* 1884 * Ideally, these should be supplied to the hypervisor during VCPU 1885 * initialization and it should be able to satisfy this request. 1886 * But, currently, WHPX doesn't support setting CPUID values in the 1887 * hypervisor once the partition has been setup, which is too late 1888 * since VCPUs are realized later. For now, use the values from 1889 * QEMU to satisfy these requests, until WHPX adds support for 1890 * being able to set these values in the hypervisor at runtime. 1891 */ 1892 cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx, 1893 (UINT32 *)&rcx, (UINT32 *)&rdx); 1894 switch (cpuid_fn) { 1895 case 0x40000000: 1896 /* Expose the vmware cpu frequency cpuid leaf */ 1897 rax = 0x40000010; 1898 rbx = rcx = rdx = 0; 1899 break; 1900 1901 case 0x40000010: 1902 rax = env->tsc_khz; 1903 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */ 1904 rcx = rdx = 0; 1905 break; 1906 1907 case 0x80000001: 1908 /* Remove any support of OSVW */ 1909 rcx &= ~CPUID_EXT3_OSVW; 1910 break; 1911 } 1912 1913 reg_names[0] = WHvX64RegisterRip; 1914 reg_names[1] = WHvX64RegisterRax; 1915 reg_names[2] = WHvX64RegisterRcx; 1916 reg_names[3] = WHvX64RegisterRdx; 1917 reg_names[4] = WHvX64RegisterRbx; 1918 1919 reg_values[0].Reg64 = rip; 1920 reg_values[1].Reg64 = rax; 1921 reg_values[2].Reg64 = rcx; 1922 reg_values[3].Reg64 = rdx; 1923 reg_values[4].Reg64 = rbx; 1924 1925 hr = whp_dispatch.WHvSetVirtualProcessorRegisters( 1926 whpx->partition, cpu->cpu_index, 1927 reg_names, 1928 reg_count, 1929 reg_values); 1930 1931 if (FAILED(hr)) { 1932 error_report("WHPX: Failed to set CpuidAccess state registers," 1933 " hr=%08lx", hr); 1934 } 1935 ret = 0; 1936 break; 1937 } 1938 case WHvRunVpExitReasonException: 1939 whpx_get_registers(cpu); 1940 1941 if ((vcpu->exit_ctx.VpException.ExceptionType == 1942 WHvX64ExceptionTypeDebugTrapOrFault) && 1943 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) && 1944 (vcpu->exit_ctx.VpException.InstructionBytes[0] == 1945 whpx_breakpoint_instruction)) { 1946 /* Stopped at a software breakpoint. */ 1947 cpu->exception_index = EXCP_DEBUG; 1948 } else if ((vcpu->exit_ctx.VpException.ExceptionType == 1949 WHvX64ExceptionTypeDebugTrapOrFault) && 1950 !cpu->singlestep_enabled) { 1951 /* 1952 * Just finished stepping over a breakpoint, but the 1953 * gdb does not expect us to do single-stepping. 1954 * Don't do anything special. 1955 */ 1956 cpu->exception_index = EXCP_INTERRUPT; 1957 } else { 1958 /* Another exception or debug event. Report it to GDB. */ 1959 cpu->exception_index = EXCP_DEBUG; 1960 } 1961 1962 ret = 1; 1963 break; 1964 case WHvRunVpExitReasonNone: 1965 case WHvRunVpExitReasonUnrecoverableException: 1966 case WHvRunVpExitReasonInvalidVpRegisterValue: 1967 case WHvRunVpExitReasonUnsupportedFeature: 1968 default: 1969 error_report("WHPX: Unexpected VP exit code %d", 1970 vcpu->exit_ctx.ExitReason); 1971 whpx_get_registers(cpu); 1972 qemu_mutex_lock_iothread(); 1973 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 1974 qemu_mutex_unlock_iothread(); 1975 break; 1976 } 1977 1978 } while (!ret); 1979 1980 if (stepped_over_bp) { 1981 /* Restore the breakpoint we stepped over */ 1982 cpu_memory_rw_debug(cpu, 1983 stepped_over_bp->address, 1984 (void *)&whpx_breakpoint_instruction, 1985 1, 1986 true); 1987 } 1988 1989 if (exclusive_step_mode != WHPX_STEP_NONE) { 1990 g_assert(cpu_in_exclusive_context(cpu)); 1991 cpu->running = false; 1992 end_exclusive(); 1993 1994 exclusive_step_mode = WHPX_STEP_NONE; 1995 } else { 1996 cpu_exec_end(cpu); 1997 } 1998 1999 qemu_mutex_lock_iothread(); 2000 current_cpu = cpu; 2001 2002 if (--whpx->running_cpus == 0) { 2003 whpx_last_vcpu_stopping(cpu); 2004 } 2005 2006 qatomic_set(&cpu->exit_request, false); 2007 2008 return ret < 0; 2009 } 2010 2011 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 2012 { 2013 if (!cpu->vcpu_dirty) { 2014 whpx_get_registers(cpu); 2015 cpu->vcpu_dirty = true; 2016 } 2017 } 2018 2019 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu, 2020 run_on_cpu_data arg) 2021 { 2022 whpx_set_registers(cpu, WHPX_SET_RESET_STATE); 2023 cpu->vcpu_dirty = false; 2024 } 2025 2026 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu, 2027 run_on_cpu_data arg) 2028 { 2029 whpx_set_registers(cpu, WHPX_SET_FULL_STATE); 2030 cpu->vcpu_dirty = false; 2031 } 2032 2033 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu, 2034 run_on_cpu_data arg) 2035 { 2036 cpu->vcpu_dirty = true; 2037 } 2038 2039 /* 2040 * CPU support. 2041 */ 2042 2043 void whpx_cpu_synchronize_state(CPUState *cpu) 2044 { 2045 if (!cpu->vcpu_dirty) { 2046 run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL); 2047 } 2048 } 2049 2050 void whpx_cpu_synchronize_post_reset(CPUState *cpu) 2051 { 2052 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2053 } 2054 2055 void whpx_cpu_synchronize_post_init(CPUState *cpu) 2056 { 2057 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2058 } 2059 2060 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu) 2061 { 2062 run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2063 } 2064 2065 void whpx_cpu_synchronize_pre_resume(bool step_pending) 2066 { 2067 whpx_global.step_pending = step_pending; 2068 } 2069 2070 /* 2071 * Vcpu support. 2072 */ 2073 2074 static Error *whpx_migration_blocker; 2075 2076 static void whpx_cpu_update_state(void *opaque, bool running, RunState state) 2077 { 2078 CPUX86State *env = opaque; 2079 2080 if (running) { 2081 env->tsc_valid = false; 2082 } 2083 } 2084 2085 int whpx_init_vcpu(CPUState *cpu) 2086 { 2087 HRESULT hr; 2088 struct whpx_state *whpx = &whpx_global; 2089 struct whpx_vcpu *vcpu = NULL; 2090 Error *local_error = NULL; 2091 CPUX86State *env = cpu->env_ptr; 2092 X86CPU *x86_cpu = X86_CPU(cpu); 2093 UINT64 freq = 0; 2094 int ret; 2095 2096 /* Add migration blockers for all unsupported features of the 2097 * Windows Hypervisor Platform 2098 */ 2099 if (whpx_migration_blocker == NULL) { 2100 error_setg(&whpx_migration_blocker, 2101 "State blocked due to non-migratable CPUID feature support," 2102 "dirty memory tracking support, and XSAVE/XRSTOR support"); 2103 2104 if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) { 2105 error_report_err(local_error); 2106 error_free(whpx_migration_blocker); 2107 ret = -EINVAL; 2108 goto error; 2109 } 2110 } 2111 2112 vcpu = g_new0(struct whpx_vcpu, 1); 2113 2114 if (!vcpu) { 2115 error_report("WHPX: Failed to allocte VCPU context."); 2116 ret = -ENOMEM; 2117 goto error; 2118 } 2119 2120 hr = whp_dispatch.WHvEmulatorCreateEmulator( 2121 &whpx_emu_callbacks, 2122 &vcpu->emulator); 2123 if (FAILED(hr)) { 2124 error_report("WHPX: Failed to setup instruction completion support," 2125 " hr=%08lx", hr); 2126 ret = -EINVAL; 2127 goto error; 2128 } 2129 2130 hr = whp_dispatch.WHvCreateVirtualProcessor( 2131 whpx->partition, cpu->cpu_index, 0); 2132 if (FAILED(hr)) { 2133 error_report("WHPX: Failed to create a virtual processor," 2134 " hr=%08lx", hr); 2135 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2136 ret = -EINVAL; 2137 goto error; 2138 } 2139 2140 /* 2141 * vcpu's TSC frequency is either specified by user, or use the value 2142 * provided by Hyper-V if the former is not present. In the latter case, we 2143 * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC 2144 * frequency can be migrated later via this field. 2145 */ 2146 if (!env->tsc_khz) { 2147 hr = whp_dispatch.WHvGetCapability( 2148 WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq), 2149 NULL); 2150 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2151 if (FAILED(hr)) { 2152 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr); 2153 } else { 2154 env->tsc_khz = freq / 1000; /* Hz to KHz */ 2155 } 2156 } 2157 } 2158 2159 env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY; 2160 hr = whp_dispatch.WHvGetCapability( 2161 WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL); 2162 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2163 if (FAILED(hr)) { 2164 printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr); 2165 } else { 2166 env->apic_bus_freq = freq; 2167 } 2168 } 2169 2170 /* 2171 * If the vmware cpuid frequency leaf option is set, and we have a valid 2172 * tsc value, trap the corresponding cpuid's. 2173 */ 2174 if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) { 2175 UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010}; 2176 2177 hr = whp_dispatch.WHvSetPartitionProperty( 2178 whpx->partition, 2179 WHvPartitionPropertyCodeCpuidExitList, 2180 cpuidExitList, 2181 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2182 2183 if (FAILED(hr)) { 2184 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2185 hr); 2186 ret = -EINVAL; 2187 goto error; 2188 } 2189 } 2190 2191 vcpu->interruptable = true; 2192 cpu->vcpu_dirty = true; 2193 cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu; 2194 max_vcpu_index = max(max_vcpu_index, cpu->cpu_index); 2195 qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr); 2196 2197 return 0; 2198 2199 error: 2200 g_free(vcpu); 2201 2202 return ret; 2203 } 2204 2205 int whpx_vcpu_exec(CPUState *cpu) 2206 { 2207 int ret; 2208 int fatal; 2209 2210 for (;;) { 2211 if (cpu->exception_index >= EXCP_INTERRUPT) { 2212 ret = cpu->exception_index; 2213 cpu->exception_index = -1; 2214 break; 2215 } 2216 2217 fatal = whpx_vcpu_run(cpu); 2218 2219 if (fatal) { 2220 error_report("WHPX: Failed to exec a virtual processor"); 2221 abort(); 2222 } 2223 } 2224 2225 return ret; 2226 } 2227 2228 void whpx_destroy_vcpu(CPUState *cpu) 2229 { 2230 struct whpx_state *whpx = &whpx_global; 2231 struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); 2232 2233 whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index); 2234 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2235 g_free(cpu->hax_vcpu); 2236 return; 2237 } 2238 2239 void whpx_vcpu_kick(CPUState *cpu) 2240 { 2241 struct whpx_state *whpx = &whpx_global; 2242 whp_dispatch.WHvCancelRunVirtualProcessor( 2243 whpx->partition, cpu->cpu_index, 0); 2244 } 2245 2246 /* 2247 * Memory support. 2248 */ 2249 2250 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size, 2251 void *host_va, int add, int rom, 2252 const char *name) 2253 { 2254 struct whpx_state *whpx = &whpx_global; 2255 HRESULT hr; 2256 2257 /* 2258 if (add) { 2259 printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n", 2260 (void*)start_pa, (void*)size, host_va, 2261 (rom ? "ROM" : "RAM"), name); 2262 } else { 2263 printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n", 2264 (void*)start_pa, (void*)size, host_va, name); 2265 } 2266 */ 2267 2268 if (add) { 2269 hr = whp_dispatch.WHvMapGpaRange(whpx->partition, 2270 host_va, 2271 start_pa, 2272 size, 2273 (WHvMapGpaRangeFlagRead | 2274 WHvMapGpaRangeFlagExecute | 2275 (rom ? 0 : WHvMapGpaRangeFlagWrite))); 2276 } else { 2277 hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition, 2278 start_pa, 2279 size); 2280 } 2281 2282 if (FAILED(hr)) { 2283 error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes," 2284 " Host:%p, hr=%08lx", 2285 (add ? "MAP" : "UNMAP"), name, 2286 (void *)(uintptr_t)start_pa, (void *)size, host_va, hr); 2287 } 2288 } 2289 2290 static void whpx_process_section(MemoryRegionSection *section, int add) 2291 { 2292 MemoryRegion *mr = section->mr; 2293 hwaddr start_pa = section->offset_within_address_space; 2294 ram_addr_t size = int128_get64(section->size); 2295 unsigned int delta; 2296 uint64_t host_va; 2297 2298 if (!memory_region_is_ram(mr)) { 2299 return; 2300 } 2301 2302 delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask()); 2303 delta &= ~qemu_real_host_page_mask(); 2304 if (delta > size) { 2305 return; 2306 } 2307 start_pa += delta; 2308 size -= delta; 2309 size &= qemu_real_host_page_mask(); 2310 if (!size || (start_pa & ~qemu_real_host_page_mask())) { 2311 return; 2312 } 2313 2314 host_va = (uintptr_t)memory_region_get_ram_ptr(mr) 2315 + section->offset_within_region + delta; 2316 2317 whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add, 2318 memory_region_is_rom(mr), mr->name); 2319 } 2320 2321 static void whpx_region_add(MemoryListener *listener, 2322 MemoryRegionSection *section) 2323 { 2324 memory_region_ref(section->mr); 2325 whpx_process_section(section, 1); 2326 } 2327 2328 static void whpx_region_del(MemoryListener *listener, 2329 MemoryRegionSection *section) 2330 { 2331 whpx_process_section(section, 0); 2332 memory_region_unref(section->mr); 2333 } 2334 2335 static void whpx_transaction_begin(MemoryListener *listener) 2336 { 2337 } 2338 2339 static void whpx_transaction_commit(MemoryListener *listener) 2340 { 2341 } 2342 2343 static void whpx_log_sync(MemoryListener *listener, 2344 MemoryRegionSection *section) 2345 { 2346 MemoryRegion *mr = section->mr; 2347 2348 if (!memory_region_is_ram(mr)) { 2349 return; 2350 } 2351 2352 memory_region_set_dirty(mr, 0, int128_get64(section->size)); 2353 } 2354 2355 static MemoryListener whpx_memory_listener = { 2356 .name = "whpx", 2357 .begin = whpx_transaction_begin, 2358 .commit = whpx_transaction_commit, 2359 .region_add = whpx_region_add, 2360 .region_del = whpx_region_del, 2361 .log_sync = whpx_log_sync, 2362 .priority = 10, 2363 }; 2364 2365 static void whpx_memory_init(void) 2366 { 2367 memory_listener_register(&whpx_memory_listener, &address_space_memory); 2368 } 2369 2370 /* 2371 * Load the functions from the given library, using the given handle. If a 2372 * handle is provided, it is used, otherwise the library is opened. The 2373 * handle will be updated on return with the opened one. 2374 */ 2375 static bool load_whp_dispatch_fns(HMODULE *handle, 2376 WHPFunctionList function_list) 2377 { 2378 HMODULE hLib = *handle; 2379 2380 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll" 2381 #define WINHV_EMULATION_DLL "WinHvEmulation.dll" 2382 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \ 2383 whp_dispatch.function_name = \ 2384 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2385 2386 #define WHP_LOAD_FIELD(return_type, function_name, signature) \ 2387 whp_dispatch.function_name = \ 2388 (function_name ## _t)GetProcAddress(hLib, #function_name); \ 2389 if (!whp_dispatch.function_name) { \ 2390 error_report("Could not load function %s", #function_name); \ 2391 goto error; \ 2392 } \ 2393 2394 #define WHP_LOAD_LIB(lib_name, handle_lib) \ 2395 if (!handle_lib) { \ 2396 handle_lib = LoadLibrary(lib_name); \ 2397 if (!handle_lib) { \ 2398 error_report("Could not load library %s.", lib_name); \ 2399 goto error; \ 2400 } \ 2401 } \ 2402 2403 switch (function_list) { 2404 case WINHV_PLATFORM_FNS_DEFAULT: 2405 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2406 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD) 2407 break; 2408 2409 case WINHV_EMULATION_FNS_DEFAULT: 2410 WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib) 2411 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD) 2412 break; 2413 2414 case WINHV_PLATFORM_FNS_SUPPLEMENTAL: 2415 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) 2416 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL) 2417 break; 2418 } 2419 2420 *handle = hLib; 2421 return true; 2422 2423 error: 2424 if (hLib) { 2425 FreeLibrary(hLib); 2426 } 2427 2428 return false; 2429 } 2430 2431 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v, 2432 const char *name, void *opaque, 2433 Error **errp) 2434 { 2435 struct whpx_state *whpx = &whpx_global; 2436 OnOffSplit mode; 2437 2438 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 2439 return; 2440 } 2441 2442 switch (mode) { 2443 case ON_OFF_SPLIT_ON: 2444 whpx->kernel_irqchip_allowed = true; 2445 whpx->kernel_irqchip_required = true; 2446 break; 2447 2448 case ON_OFF_SPLIT_OFF: 2449 whpx->kernel_irqchip_allowed = false; 2450 whpx->kernel_irqchip_required = false; 2451 break; 2452 2453 case ON_OFF_SPLIT_SPLIT: 2454 error_setg(errp, "WHPX: split irqchip currently not supported"); 2455 error_append_hint(errp, 2456 "Try without kernel-irqchip or with kernel-irqchip=on|off"); 2457 break; 2458 2459 default: 2460 /* 2461 * The value was checked in visit_type_OnOffSplit() above. If 2462 * we get here, then something is wrong in QEMU. 2463 */ 2464 abort(); 2465 } 2466 } 2467 2468 /* 2469 * Partition support 2470 */ 2471 2472 static int whpx_accel_init(MachineState *ms) 2473 { 2474 struct whpx_state *whpx; 2475 int ret; 2476 HRESULT hr; 2477 WHV_CAPABILITY whpx_cap; 2478 UINT32 whpx_cap_size; 2479 WHV_PARTITION_PROPERTY prop; 2480 UINT32 cpuidExitList[] = {1, 0x80000001}; 2481 WHV_CAPABILITY_FEATURES features = {0}; 2482 2483 whpx = &whpx_global; 2484 2485 if (!init_whp_dispatch()) { 2486 ret = -ENOSYS; 2487 goto error; 2488 } 2489 2490 whpx->mem_quota = ms->ram_size; 2491 2492 hr = whp_dispatch.WHvGetCapability( 2493 WHvCapabilityCodeHypervisorPresent, &whpx_cap, 2494 sizeof(whpx_cap), &whpx_cap_size); 2495 if (FAILED(hr) || !whpx_cap.HypervisorPresent) { 2496 error_report("WHPX: No accelerator found, hr=%08lx", hr); 2497 ret = -ENOSPC; 2498 goto error; 2499 } 2500 2501 hr = whp_dispatch.WHvGetCapability( 2502 WHvCapabilityCodeFeatures, &features, sizeof(features), NULL); 2503 if (FAILED(hr)) { 2504 error_report("WHPX: Failed to query capabilities, hr=%08lx", hr); 2505 ret = -EINVAL; 2506 goto error; 2507 } 2508 2509 hr = whp_dispatch.WHvCreatePartition(&whpx->partition); 2510 if (FAILED(hr)) { 2511 error_report("WHPX: Failed to create partition, hr=%08lx", hr); 2512 ret = -EINVAL; 2513 goto error; 2514 } 2515 2516 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2517 prop.ProcessorCount = ms->smp.cpus; 2518 hr = whp_dispatch.WHvSetPartitionProperty( 2519 whpx->partition, 2520 WHvPartitionPropertyCodeProcessorCount, 2521 &prop, 2522 sizeof(WHV_PARTITION_PROPERTY)); 2523 2524 if (FAILED(hr)) { 2525 error_report("WHPX: Failed to set partition core count to %d," 2526 " hr=%08lx", ms->smp.cores, hr); 2527 ret = -EINVAL; 2528 goto error; 2529 } 2530 2531 /* 2532 * Error out if WHP doesn't support apic emulation and user is requiring 2533 * it. 2534 */ 2535 if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation || 2536 !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) { 2537 error_report("WHPX: kernel irqchip requested, but unavailable. " 2538 "Try without kernel-irqchip or with kernel-irqchip=off"); 2539 ret = -EINVAL; 2540 goto error; 2541 } 2542 2543 if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation && 2544 whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) { 2545 WHV_X64_LOCAL_APIC_EMULATION_MODE mode = 2546 WHvX64LocalApicEmulationModeXApic; 2547 printf("WHPX: setting APIC emulation mode in the hypervisor\n"); 2548 hr = whp_dispatch.WHvSetPartitionProperty( 2549 whpx->partition, 2550 WHvPartitionPropertyCodeLocalApicEmulationMode, 2551 &mode, 2552 sizeof(mode)); 2553 if (FAILED(hr)) { 2554 error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr); 2555 if (whpx->kernel_irqchip_required) { 2556 error_report("WHPX: kernel irqchip requested, but unavailable"); 2557 ret = -EINVAL; 2558 goto error; 2559 } 2560 } else { 2561 whpx->apic_in_platform = true; 2562 } 2563 } 2564 2565 /* Register for MSR and CPUID exits */ 2566 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); 2567 prop.ExtendedVmExits.X64MsrExit = 1; 2568 prop.ExtendedVmExits.X64CpuidExit = 1; 2569 prop.ExtendedVmExits.ExceptionExit = 1; 2570 if (whpx_apic_in_platform()) { 2571 prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1; 2572 } 2573 2574 hr = whp_dispatch.WHvSetPartitionProperty( 2575 whpx->partition, 2576 WHvPartitionPropertyCodeExtendedVmExits, 2577 &prop, 2578 sizeof(WHV_PARTITION_PROPERTY)); 2579 if (FAILED(hr)) { 2580 error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr); 2581 ret = -EINVAL; 2582 goto error; 2583 } 2584 2585 hr = whp_dispatch.WHvSetPartitionProperty( 2586 whpx->partition, 2587 WHvPartitionPropertyCodeCpuidExitList, 2588 cpuidExitList, 2589 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2590 2591 if (FAILED(hr)) { 2592 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2593 hr); 2594 ret = -EINVAL; 2595 goto error; 2596 } 2597 2598 /* 2599 * We do not want to intercept any exceptions from the guest, 2600 * until we actually start debugging with gdb. 2601 */ 2602 whpx->exception_exit_bitmap = -1; 2603 hr = whpx_set_exception_exit_bitmap(0); 2604 2605 if (FAILED(hr)) { 2606 error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr); 2607 ret = -EINVAL; 2608 goto error; 2609 } 2610 2611 hr = whp_dispatch.WHvSetupPartition(whpx->partition); 2612 if (FAILED(hr)) { 2613 error_report("WHPX: Failed to setup partition, hr=%08lx", hr); 2614 ret = -EINVAL; 2615 goto error; 2616 } 2617 2618 whpx_memory_init(); 2619 2620 printf("Windows Hypervisor Platform accelerator is operational\n"); 2621 return 0; 2622 2623 error: 2624 2625 if (NULL != whpx->partition) { 2626 whp_dispatch.WHvDeletePartition(whpx->partition); 2627 whpx->partition = NULL; 2628 } 2629 2630 return ret; 2631 } 2632 2633 int whpx_enabled(void) 2634 { 2635 return whpx_allowed; 2636 } 2637 2638 bool whpx_apic_in_platform(void) { 2639 return whpx_global.apic_in_platform; 2640 } 2641 2642 static void whpx_accel_class_init(ObjectClass *oc, void *data) 2643 { 2644 AccelClass *ac = ACCEL_CLASS(oc); 2645 ac->name = "WHPX"; 2646 ac->init_machine = whpx_accel_init; 2647 ac->allowed = &whpx_allowed; 2648 2649 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 2650 NULL, whpx_set_kernel_irqchip, 2651 NULL, NULL); 2652 object_class_property_set_description(oc, "kernel-irqchip", 2653 "Configure WHPX in-kernel irqchip"); 2654 } 2655 2656 static void whpx_accel_instance_init(Object *obj) 2657 { 2658 struct whpx_state *whpx = &whpx_global; 2659 2660 memset(whpx, 0, sizeof(struct whpx_state)); 2661 /* Turn on kernel-irqchip, by default */ 2662 whpx->kernel_irqchip_allowed = true; 2663 } 2664 2665 static const TypeInfo whpx_accel_type = { 2666 .name = ACCEL_CLASS_NAME("whpx"), 2667 .parent = TYPE_ACCEL, 2668 .instance_init = whpx_accel_instance_init, 2669 .class_init = whpx_accel_class_init, 2670 }; 2671 2672 static void whpx_type_init(void) 2673 { 2674 type_register_static(&whpx_accel_type); 2675 } 2676 2677 bool init_whp_dispatch(void) 2678 { 2679 if (whp_dispatch_initialized) { 2680 return true; 2681 } 2682 2683 if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) { 2684 goto error; 2685 } 2686 2687 if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) { 2688 goto error; 2689 } 2690 2691 assert(load_whp_dispatch_fns(&hWinHvPlatform, 2692 WINHV_PLATFORM_FNS_SUPPLEMENTAL)); 2693 whp_dispatch_initialized = true; 2694 2695 return true; 2696 error: 2697 if (hWinHvPlatform) { 2698 FreeLibrary(hWinHvPlatform); 2699 } 2700 2701 if (hWinHvEmulation) { 2702 FreeLibrary(hWinHvEmulation); 2703 } 2704 2705 return false; 2706 } 2707 2708 type_init(whpx_type_init); 2709