1 /* 2 * Copyright (c) 2018-2019 Maxime Villard, All rights reserved. 3 * 4 * NetBSD Virtual Machine Monitor (NVMM) accelerator for QEMU. 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2 or later. 7 * See the COPYING file in the top-level directory. 8 */ 9 10 #include "qemu/osdep.h" 11 #include "cpu.h" 12 #include "exec/address-spaces.h" 13 #include "exec/ioport.h" 14 #include "qemu/accel.h" 15 #include "sysemu/nvmm.h" 16 #include "sysemu/cpus.h" 17 #include "sysemu/runstate.h" 18 #include "qemu/main-loop.h" 19 #include "qemu/error-report.h" 20 #include "qapi/error.h" 21 #include "qemu/queue.h" 22 #include "migration/blocker.h" 23 #include "strings.h" 24 25 #include "nvmm-accel-ops.h" 26 27 #include <nvmm.h> 28 29 struct AccelCPUState { 30 struct nvmm_vcpu vcpu; 31 uint8_t tpr; 32 bool stop; 33 bool dirty; 34 35 /* Window-exiting for INTs/NMIs. */ 36 bool int_window_exit; 37 bool nmi_window_exit; 38 39 /* The guest is in an interrupt shadow (POP SS, etc). */ 40 bool int_shadow; 41 }; 42 43 struct qemu_machine { 44 struct nvmm_capability cap; 45 struct nvmm_machine mach; 46 }; 47 48 /* -------------------------------------------------------------------------- */ 49 50 static bool nvmm_allowed; 51 static struct qemu_machine qemu_mach; 52 53 static struct nvmm_machine * 54 get_nvmm_mach(void) 55 { 56 return &qemu_mach.mach; 57 } 58 59 /* -------------------------------------------------------------------------- */ 60 61 static void 62 nvmm_set_segment(struct nvmm_x64_state_seg *nseg, const SegmentCache *qseg) 63 { 64 uint32_t attrib = qseg->flags; 65 66 nseg->selector = qseg->selector; 67 nseg->limit = qseg->limit; 68 nseg->base = qseg->base; 69 nseg->attrib.type = __SHIFTOUT(attrib, DESC_TYPE_MASK); 70 nseg->attrib.s = __SHIFTOUT(attrib, DESC_S_MASK); 71 nseg->attrib.dpl = __SHIFTOUT(attrib, DESC_DPL_MASK); 72 nseg->attrib.p = __SHIFTOUT(attrib, DESC_P_MASK); 73 nseg->attrib.avl = __SHIFTOUT(attrib, DESC_AVL_MASK); 74 nseg->attrib.l = __SHIFTOUT(attrib, DESC_L_MASK); 75 nseg->attrib.def = __SHIFTOUT(attrib, DESC_B_MASK); 76 nseg->attrib.g = __SHIFTOUT(attrib, DESC_G_MASK); 77 } 78 79 static void 80 nvmm_set_registers(CPUState *cpu) 81 { 82 CPUX86State *env = cpu_env(cpu); 83 struct nvmm_machine *mach = get_nvmm_mach(); 84 AccelCPUState *qcpu = cpu->accel; 85 struct nvmm_vcpu *vcpu = &qcpu->vcpu; 86 struct nvmm_x64_state *state = vcpu->state; 87 uint64_t bitmap; 88 size_t i; 89 int ret; 90 91 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 92 93 /* GPRs. */ 94 state->gprs[NVMM_X64_GPR_RAX] = env->regs[R_EAX]; 95 state->gprs[NVMM_X64_GPR_RCX] = env->regs[R_ECX]; 96 state->gprs[NVMM_X64_GPR_RDX] = env->regs[R_EDX]; 97 state->gprs[NVMM_X64_GPR_RBX] = env->regs[R_EBX]; 98 state->gprs[NVMM_X64_GPR_RSP] = env->regs[R_ESP]; 99 state->gprs[NVMM_X64_GPR_RBP] = env->regs[R_EBP]; 100 state->gprs[NVMM_X64_GPR_RSI] = env->regs[R_ESI]; 101 state->gprs[NVMM_X64_GPR_RDI] = env->regs[R_EDI]; 102 #ifdef TARGET_X86_64 103 state->gprs[NVMM_X64_GPR_R8] = env->regs[R_R8]; 104 state->gprs[NVMM_X64_GPR_R9] = env->regs[R_R9]; 105 state->gprs[NVMM_X64_GPR_R10] = env->regs[R_R10]; 106 state->gprs[NVMM_X64_GPR_R11] = env->regs[R_R11]; 107 state->gprs[NVMM_X64_GPR_R12] = env->regs[R_R12]; 108 state->gprs[NVMM_X64_GPR_R13] = env->regs[R_R13]; 109 state->gprs[NVMM_X64_GPR_R14] = env->regs[R_R14]; 110 state->gprs[NVMM_X64_GPR_R15] = env->regs[R_R15]; 111 #endif 112 113 /* RIP and RFLAGS. */ 114 state->gprs[NVMM_X64_GPR_RIP] = env->eip; 115 state->gprs[NVMM_X64_GPR_RFLAGS] = env->eflags; 116 117 /* Segments. */ 118 nvmm_set_segment(&state->segs[NVMM_X64_SEG_CS], &env->segs[R_CS]); 119 nvmm_set_segment(&state->segs[NVMM_X64_SEG_DS], &env->segs[R_DS]); 120 nvmm_set_segment(&state->segs[NVMM_X64_SEG_ES], &env->segs[R_ES]); 121 nvmm_set_segment(&state->segs[NVMM_X64_SEG_FS], &env->segs[R_FS]); 122 nvmm_set_segment(&state->segs[NVMM_X64_SEG_GS], &env->segs[R_GS]); 123 nvmm_set_segment(&state->segs[NVMM_X64_SEG_SS], &env->segs[R_SS]); 124 125 /* Special segments. */ 126 nvmm_set_segment(&state->segs[NVMM_X64_SEG_GDT], &env->gdt); 127 nvmm_set_segment(&state->segs[NVMM_X64_SEG_LDT], &env->ldt); 128 nvmm_set_segment(&state->segs[NVMM_X64_SEG_TR], &env->tr); 129 nvmm_set_segment(&state->segs[NVMM_X64_SEG_IDT], &env->idt); 130 131 /* Control registers. */ 132 state->crs[NVMM_X64_CR_CR0] = env->cr[0]; 133 state->crs[NVMM_X64_CR_CR2] = env->cr[2]; 134 state->crs[NVMM_X64_CR_CR3] = env->cr[3]; 135 state->crs[NVMM_X64_CR_CR4] = env->cr[4]; 136 state->crs[NVMM_X64_CR_CR8] = qcpu->tpr; 137 state->crs[NVMM_X64_CR_XCR0] = env->xcr0; 138 139 /* Debug registers. */ 140 state->drs[NVMM_X64_DR_DR0] = env->dr[0]; 141 state->drs[NVMM_X64_DR_DR1] = env->dr[1]; 142 state->drs[NVMM_X64_DR_DR2] = env->dr[2]; 143 state->drs[NVMM_X64_DR_DR3] = env->dr[3]; 144 state->drs[NVMM_X64_DR_DR6] = env->dr[6]; 145 state->drs[NVMM_X64_DR_DR7] = env->dr[7]; 146 147 /* FPU. */ 148 state->fpu.fx_cw = env->fpuc; 149 state->fpu.fx_sw = (env->fpus & ~0x3800) | ((env->fpstt & 0x7) << 11); 150 state->fpu.fx_tw = 0; 151 for (i = 0; i < 8; i++) { 152 state->fpu.fx_tw |= (!env->fptags[i]) << i; 153 } 154 state->fpu.fx_opcode = env->fpop; 155 state->fpu.fx_ip.fa_64 = env->fpip; 156 state->fpu.fx_dp.fa_64 = env->fpdp; 157 state->fpu.fx_mxcsr = env->mxcsr; 158 state->fpu.fx_mxcsr_mask = 0x0000FFFF; 159 assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs)); 160 memcpy(state->fpu.fx_87_ac, env->fpregs, sizeof(env->fpregs)); 161 for (i = 0; i < CPU_NB_REGS; i++) { 162 memcpy(&state->fpu.fx_xmm[i].xmm_bytes[0], 163 &env->xmm_regs[i].ZMM_Q(0), 8); 164 memcpy(&state->fpu.fx_xmm[i].xmm_bytes[8], 165 &env->xmm_regs[i].ZMM_Q(1), 8); 166 } 167 168 /* MSRs. */ 169 state->msrs[NVMM_X64_MSR_EFER] = env->efer; 170 state->msrs[NVMM_X64_MSR_STAR] = env->star; 171 #ifdef TARGET_X86_64 172 state->msrs[NVMM_X64_MSR_LSTAR] = env->lstar; 173 state->msrs[NVMM_X64_MSR_CSTAR] = env->cstar; 174 state->msrs[NVMM_X64_MSR_SFMASK] = env->fmask; 175 state->msrs[NVMM_X64_MSR_KERNELGSBASE] = env->kernelgsbase; 176 #endif 177 state->msrs[NVMM_X64_MSR_SYSENTER_CS] = env->sysenter_cs; 178 state->msrs[NVMM_X64_MSR_SYSENTER_ESP] = env->sysenter_esp; 179 state->msrs[NVMM_X64_MSR_SYSENTER_EIP] = env->sysenter_eip; 180 state->msrs[NVMM_X64_MSR_PAT] = env->pat; 181 state->msrs[NVMM_X64_MSR_TSC] = env->tsc; 182 183 bitmap = 184 NVMM_X64_STATE_SEGS | 185 NVMM_X64_STATE_GPRS | 186 NVMM_X64_STATE_CRS | 187 NVMM_X64_STATE_DRS | 188 NVMM_X64_STATE_MSRS | 189 NVMM_X64_STATE_FPU; 190 191 ret = nvmm_vcpu_setstate(mach, vcpu, bitmap); 192 if (ret == -1) { 193 error_report("NVMM: Failed to set virtual processor context," 194 " error=%d", errno); 195 } 196 } 197 198 static void 199 nvmm_get_segment(SegmentCache *qseg, const struct nvmm_x64_state_seg *nseg) 200 { 201 qseg->selector = nseg->selector; 202 qseg->limit = nseg->limit; 203 qseg->base = nseg->base; 204 205 qseg->flags = 206 __SHIFTIN((uint32_t)nseg->attrib.type, DESC_TYPE_MASK) | 207 __SHIFTIN((uint32_t)nseg->attrib.s, DESC_S_MASK) | 208 __SHIFTIN((uint32_t)nseg->attrib.dpl, DESC_DPL_MASK) | 209 __SHIFTIN((uint32_t)nseg->attrib.p, DESC_P_MASK) | 210 __SHIFTIN((uint32_t)nseg->attrib.avl, DESC_AVL_MASK) | 211 __SHIFTIN((uint32_t)nseg->attrib.l, DESC_L_MASK) | 212 __SHIFTIN((uint32_t)nseg->attrib.def, DESC_B_MASK) | 213 __SHIFTIN((uint32_t)nseg->attrib.g, DESC_G_MASK); 214 } 215 216 static void 217 nvmm_get_registers(CPUState *cpu) 218 { 219 CPUX86State *env = cpu_env(cpu); 220 struct nvmm_machine *mach = get_nvmm_mach(); 221 AccelCPUState *qcpu = cpu->accel; 222 struct nvmm_vcpu *vcpu = &qcpu->vcpu; 223 X86CPU *x86_cpu = X86_CPU(cpu); 224 struct nvmm_x64_state *state = vcpu->state; 225 uint64_t bitmap, tpr; 226 size_t i; 227 int ret; 228 229 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 230 231 bitmap = 232 NVMM_X64_STATE_SEGS | 233 NVMM_X64_STATE_GPRS | 234 NVMM_X64_STATE_CRS | 235 NVMM_X64_STATE_DRS | 236 NVMM_X64_STATE_MSRS | 237 NVMM_X64_STATE_FPU; 238 239 ret = nvmm_vcpu_getstate(mach, vcpu, bitmap); 240 if (ret == -1) { 241 error_report("NVMM: Failed to get virtual processor context," 242 " error=%d", errno); 243 } 244 245 /* GPRs. */ 246 env->regs[R_EAX] = state->gprs[NVMM_X64_GPR_RAX]; 247 env->regs[R_ECX] = state->gprs[NVMM_X64_GPR_RCX]; 248 env->regs[R_EDX] = state->gprs[NVMM_X64_GPR_RDX]; 249 env->regs[R_EBX] = state->gprs[NVMM_X64_GPR_RBX]; 250 env->regs[R_ESP] = state->gprs[NVMM_X64_GPR_RSP]; 251 env->regs[R_EBP] = state->gprs[NVMM_X64_GPR_RBP]; 252 env->regs[R_ESI] = state->gprs[NVMM_X64_GPR_RSI]; 253 env->regs[R_EDI] = state->gprs[NVMM_X64_GPR_RDI]; 254 #ifdef TARGET_X86_64 255 env->regs[R_R8] = state->gprs[NVMM_X64_GPR_R8]; 256 env->regs[R_R9] = state->gprs[NVMM_X64_GPR_R9]; 257 env->regs[R_R10] = state->gprs[NVMM_X64_GPR_R10]; 258 env->regs[R_R11] = state->gprs[NVMM_X64_GPR_R11]; 259 env->regs[R_R12] = state->gprs[NVMM_X64_GPR_R12]; 260 env->regs[R_R13] = state->gprs[NVMM_X64_GPR_R13]; 261 env->regs[R_R14] = state->gprs[NVMM_X64_GPR_R14]; 262 env->regs[R_R15] = state->gprs[NVMM_X64_GPR_R15]; 263 #endif 264 265 /* RIP and RFLAGS. */ 266 env->eip = state->gprs[NVMM_X64_GPR_RIP]; 267 env->eflags = state->gprs[NVMM_X64_GPR_RFLAGS]; 268 269 /* Segments. */ 270 nvmm_get_segment(&env->segs[R_ES], &state->segs[NVMM_X64_SEG_ES]); 271 nvmm_get_segment(&env->segs[R_CS], &state->segs[NVMM_X64_SEG_CS]); 272 nvmm_get_segment(&env->segs[R_SS], &state->segs[NVMM_X64_SEG_SS]); 273 nvmm_get_segment(&env->segs[R_DS], &state->segs[NVMM_X64_SEG_DS]); 274 nvmm_get_segment(&env->segs[R_FS], &state->segs[NVMM_X64_SEG_FS]); 275 nvmm_get_segment(&env->segs[R_GS], &state->segs[NVMM_X64_SEG_GS]); 276 277 /* Special segments. */ 278 nvmm_get_segment(&env->gdt, &state->segs[NVMM_X64_SEG_GDT]); 279 nvmm_get_segment(&env->ldt, &state->segs[NVMM_X64_SEG_LDT]); 280 nvmm_get_segment(&env->tr, &state->segs[NVMM_X64_SEG_TR]); 281 nvmm_get_segment(&env->idt, &state->segs[NVMM_X64_SEG_IDT]); 282 283 /* Control registers. */ 284 env->cr[0] = state->crs[NVMM_X64_CR_CR0]; 285 env->cr[2] = state->crs[NVMM_X64_CR_CR2]; 286 env->cr[3] = state->crs[NVMM_X64_CR_CR3]; 287 env->cr[4] = state->crs[NVMM_X64_CR_CR4]; 288 tpr = state->crs[NVMM_X64_CR_CR8]; 289 if (tpr != qcpu->tpr) { 290 qcpu->tpr = tpr; 291 cpu_set_apic_tpr(x86_cpu->apic_state, tpr); 292 } 293 env->xcr0 = state->crs[NVMM_X64_CR_XCR0]; 294 295 /* Debug registers. */ 296 env->dr[0] = state->drs[NVMM_X64_DR_DR0]; 297 env->dr[1] = state->drs[NVMM_X64_DR_DR1]; 298 env->dr[2] = state->drs[NVMM_X64_DR_DR2]; 299 env->dr[3] = state->drs[NVMM_X64_DR_DR3]; 300 env->dr[6] = state->drs[NVMM_X64_DR_DR6]; 301 env->dr[7] = state->drs[NVMM_X64_DR_DR7]; 302 303 /* FPU. */ 304 env->fpuc = state->fpu.fx_cw; 305 env->fpstt = (state->fpu.fx_sw >> 11) & 0x7; 306 env->fpus = state->fpu.fx_sw & ~0x3800; 307 for (i = 0; i < 8; i++) { 308 env->fptags[i] = !((state->fpu.fx_tw >> i) & 1); 309 } 310 env->fpop = state->fpu.fx_opcode; 311 env->fpip = state->fpu.fx_ip.fa_64; 312 env->fpdp = state->fpu.fx_dp.fa_64; 313 env->mxcsr = state->fpu.fx_mxcsr; 314 assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs)); 315 memcpy(env->fpregs, state->fpu.fx_87_ac, sizeof(env->fpregs)); 316 for (i = 0; i < CPU_NB_REGS; i++) { 317 memcpy(&env->xmm_regs[i].ZMM_Q(0), 318 &state->fpu.fx_xmm[i].xmm_bytes[0], 8); 319 memcpy(&env->xmm_regs[i].ZMM_Q(1), 320 &state->fpu.fx_xmm[i].xmm_bytes[8], 8); 321 } 322 323 /* MSRs. */ 324 env->efer = state->msrs[NVMM_X64_MSR_EFER]; 325 env->star = state->msrs[NVMM_X64_MSR_STAR]; 326 #ifdef TARGET_X86_64 327 env->lstar = state->msrs[NVMM_X64_MSR_LSTAR]; 328 env->cstar = state->msrs[NVMM_X64_MSR_CSTAR]; 329 env->fmask = state->msrs[NVMM_X64_MSR_SFMASK]; 330 env->kernelgsbase = state->msrs[NVMM_X64_MSR_KERNELGSBASE]; 331 #endif 332 env->sysenter_cs = state->msrs[NVMM_X64_MSR_SYSENTER_CS]; 333 env->sysenter_esp = state->msrs[NVMM_X64_MSR_SYSENTER_ESP]; 334 env->sysenter_eip = state->msrs[NVMM_X64_MSR_SYSENTER_EIP]; 335 env->pat = state->msrs[NVMM_X64_MSR_PAT]; 336 env->tsc = state->msrs[NVMM_X64_MSR_TSC]; 337 338 x86_update_hflags(env); 339 } 340 341 static bool 342 nvmm_can_take_int(CPUState *cpu) 343 { 344 AccelCPUState *qcpu = cpu->accel; 345 struct nvmm_vcpu *vcpu = &qcpu->vcpu; 346 struct nvmm_machine *mach = get_nvmm_mach(); 347 348 if (qcpu->int_window_exit) { 349 return false; 350 } 351 352 if (qcpu->int_shadow || !(cpu_env(cpu)->eflags & IF_MASK)) { 353 struct nvmm_x64_state *state = vcpu->state; 354 355 /* Exit on interrupt window. */ 356 nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_INTR); 357 state->intr.int_window_exiting = 1; 358 nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_INTR); 359 360 return false; 361 } 362 363 return true; 364 } 365 366 static bool 367 nvmm_can_take_nmi(CPUState *cpu) 368 { 369 AccelCPUState *qcpu = cpu->accel; 370 371 /* 372 * Contrary to INTs, NMIs always schedule an exit when they are 373 * completed. Therefore, if window-exiting is enabled, it means 374 * NMIs are blocked. 375 */ 376 if (qcpu->nmi_window_exit) { 377 return false; 378 } 379 380 return true; 381 } 382 383 /* 384 * Called before the VCPU is run. We inject events generated by the I/O 385 * thread, and synchronize the guest TPR. 386 */ 387 static void 388 nvmm_vcpu_pre_run(CPUState *cpu) 389 { 390 CPUX86State *env = cpu_env(cpu); 391 struct nvmm_machine *mach = get_nvmm_mach(); 392 AccelCPUState *qcpu = cpu->accel; 393 struct nvmm_vcpu *vcpu = &qcpu->vcpu; 394 X86CPU *x86_cpu = X86_CPU(cpu); 395 struct nvmm_x64_state *state = vcpu->state; 396 struct nvmm_vcpu_event *event = vcpu->event; 397 bool has_event = false; 398 bool sync_tpr = false; 399 uint8_t tpr; 400 int ret; 401 402 bql_lock(); 403 404 tpr = cpu_get_apic_tpr(x86_cpu->apic_state); 405 if (tpr != qcpu->tpr) { 406 qcpu->tpr = tpr; 407 sync_tpr = true; 408 } 409 410 /* 411 * Force the VCPU out of its inner loop to process any INIT requests 412 * or commit pending TPR access. 413 */ 414 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { 415 cpu->exit_request = 1; 416 } 417 418 if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 419 if (nvmm_can_take_nmi(cpu)) { 420 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 421 event->type = NVMM_VCPU_EVENT_INTR; 422 event->vector = 2; 423 has_event = true; 424 } 425 } 426 427 if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_HARD)) { 428 if (nvmm_can_take_int(cpu)) { 429 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 430 event->type = NVMM_VCPU_EVENT_INTR; 431 event->vector = cpu_get_pic_interrupt(env); 432 has_event = true; 433 } 434 } 435 436 /* Don't want SMIs. */ 437 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 438 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 439 } 440 441 if (sync_tpr) { 442 ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_CRS); 443 if (ret == -1) { 444 error_report("NVMM: Failed to get CPU state," 445 " error=%d", errno); 446 } 447 448 state->crs[NVMM_X64_CR_CR8] = qcpu->tpr; 449 450 ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_CRS); 451 if (ret == -1) { 452 error_report("NVMM: Failed to set CPU state," 453 " error=%d", errno); 454 } 455 } 456 457 if (has_event) { 458 ret = nvmm_vcpu_inject(mach, vcpu); 459 if (ret == -1) { 460 error_report("NVMM: Failed to inject event," 461 " error=%d", errno); 462 } 463 } 464 465 bql_unlock(); 466 } 467 468 /* 469 * Called after the VCPU ran. We synchronize the host view of the TPR and 470 * RFLAGS. 471 */ 472 static void 473 nvmm_vcpu_post_run(CPUState *cpu, struct nvmm_vcpu_exit *exit) 474 { 475 AccelCPUState *qcpu = cpu->accel; 476 X86CPU *x86_cpu = X86_CPU(cpu); 477 CPUX86State *env = &x86_cpu->env; 478 uint64_t tpr; 479 480 env->eflags = exit->exitstate.rflags; 481 qcpu->int_shadow = exit->exitstate.int_shadow; 482 qcpu->int_window_exit = exit->exitstate.int_window_exiting; 483 qcpu->nmi_window_exit = exit->exitstate.nmi_window_exiting; 484 485 tpr = exit->exitstate.cr8; 486 if (qcpu->tpr != tpr) { 487 qcpu->tpr = tpr; 488 bql_lock(); 489 cpu_set_apic_tpr(x86_cpu->apic_state, qcpu->tpr); 490 bql_unlock(); 491 } 492 } 493 494 /* -------------------------------------------------------------------------- */ 495 496 static void 497 nvmm_io_callback(struct nvmm_io *io) 498 { 499 MemTxAttrs attrs = { 0 }; 500 int ret; 501 502 ret = address_space_rw(&address_space_io, io->port, attrs, io->data, 503 io->size, !io->in); 504 if (ret != MEMTX_OK) { 505 error_report("NVMM: I/O Transaction Failed " 506 "[%s, port=%u, size=%zu]", (io->in ? "in" : "out"), 507 io->port, io->size); 508 } 509 510 /* Needed, otherwise infinite loop. */ 511 current_cpu->accel->dirty = false; 512 } 513 514 static void 515 nvmm_mem_callback(struct nvmm_mem *mem) 516 { 517 cpu_physical_memory_rw(mem->gpa, mem->data, mem->size, mem->write); 518 519 /* Needed, otherwise infinite loop. */ 520 current_cpu->accel->dirty = false; 521 } 522 523 static struct nvmm_assist_callbacks nvmm_callbacks = { 524 .io = nvmm_io_callback, 525 .mem = nvmm_mem_callback 526 }; 527 528 /* -------------------------------------------------------------------------- */ 529 530 static int 531 nvmm_handle_mem(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu) 532 { 533 int ret; 534 535 ret = nvmm_assist_mem(mach, vcpu); 536 if (ret == -1) { 537 error_report("NVMM: Mem Assist Failed [gpa=%p]", 538 (void *)vcpu->exit->u.mem.gpa); 539 } 540 541 return ret; 542 } 543 544 static int 545 nvmm_handle_io(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu) 546 { 547 int ret; 548 549 ret = nvmm_assist_io(mach, vcpu); 550 if (ret == -1) { 551 error_report("NVMM: I/O Assist Failed [port=%d]", 552 (int)vcpu->exit->u.io.port); 553 } 554 555 return ret; 556 } 557 558 static int 559 nvmm_handle_rdmsr(struct nvmm_machine *mach, CPUState *cpu, 560 struct nvmm_vcpu_exit *exit) 561 { 562 AccelCPUState *qcpu = cpu->accel; 563 struct nvmm_vcpu *vcpu = &qcpu->vcpu; 564 X86CPU *x86_cpu = X86_CPU(cpu); 565 struct nvmm_x64_state *state = vcpu->state; 566 uint64_t val; 567 int ret; 568 569 switch (exit->u.rdmsr.msr) { 570 case MSR_IA32_APICBASE: 571 val = cpu_get_apic_base(x86_cpu->apic_state); 572 break; 573 case MSR_MTRRcap: 574 case MSR_MTRRdefType: 575 case MSR_MCG_CAP: 576 case MSR_MCG_STATUS: 577 val = 0; 578 break; 579 default: /* More MSRs to add? */ 580 val = 0; 581 error_report("NVMM: Unexpected RDMSR 0x%x, ignored", 582 exit->u.rdmsr.msr); 583 break; 584 } 585 586 ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS); 587 if (ret == -1) { 588 return -1; 589 } 590 591 state->gprs[NVMM_X64_GPR_RAX] = (val & 0xFFFFFFFF); 592 state->gprs[NVMM_X64_GPR_RDX] = (val >> 32); 593 state->gprs[NVMM_X64_GPR_RIP] = exit->u.rdmsr.npc; 594 595 ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS); 596 if (ret == -1) { 597 return -1; 598 } 599 600 return 0; 601 } 602 603 static int 604 nvmm_handle_wrmsr(struct nvmm_machine *mach, CPUState *cpu, 605 struct nvmm_vcpu_exit *exit) 606 { 607 AccelCPUState *qcpu = cpu->accel; 608 struct nvmm_vcpu *vcpu = &qcpu->vcpu; 609 X86CPU *x86_cpu = X86_CPU(cpu); 610 struct nvmm_x64_state *state = vcpu->state; 611 uint64_t val; 612 int ret; 613 614 val = exit->u.wrmsr.val; 615 616 switch (exit->u.wrmsr.msr) { 617 case MSR_IA32_APICBASE: 618 cpu_set_apic_base(x86_cpu->apic_state, val); 619 break; 620 case MSR_MTRRdefType: 621 case MSR_MCG_STATUS: 622 break; 623 default: /* More MSRs to add? */ 624 error_report("NVMM: Unexpected WRMSR 0x%x [val=0x%lx], ignored", 625 exit->u.wrmsr.msr, val); 626 break; 627 } 628 629 ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS); 630 if (ret == -1) { 631 return -1; 632 } 633 634 state->gprs[NVMM_X64_GPR_RIP] = exit->u.wrmsr.npc; 635 636 ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS); 637 if (ret == -1) { 638 return -1; 639 } 640 641 return 0; 642 } 643 644 static int 645 nvmm_handle_halted(struct nvmm_machine *mach, CPUState *cpu, 646 struct nvmm_vcpu_exit *exit) 647 { 648 int ret = 0; 649 650 bql_lock(); 651 652 if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 653 (cpu_env(cpu)->eflags & IF_MASK)) && 654 !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 655 cpu->exception_index = EXCP_HLT; 656 cpu->halted = true; 657 ret = 1; 658 } 659 660 bql_unlock(); 661 662 return ret; 663 } 664 665 static int 666 nvmm_inject_ud(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu) 667 { 668 struct nvmm_vcpu_event *event = vcpu->event; 669 670 event->type = NVMM_VCPU_EVENT_EXCP; 671 event->vector = 6; 672 event->u.excp.error = 0; 673 674 return nvmm_vcpu_inject(mach, vcpu); 675 } 676 677 static int 678 nvmm_vcpu_loop(CPUState *cpu) 679 { 680 struct nvmm_machine *mach = get_nvmm_mach(); 681 AccelCPUState *qcpu = cpu->accel; 682 struct nvmm_vcpu *vcpu = &qcpu->vcpu; 683 X86CPU *x86_cpu = X86_CPU(cpu); 684 CPUX86State *env = &x86_cpu->env; 685 struct nvmm_vcpu_exit *exit = vcpu->exit; 686 int ret; 687 688 /* 689 * Some asynchronous events must be handled outside of the inner 690 * VCPU loop. They are handled here. 691 */ 692 if (cpu->interrupt_request & CPU_INTERRUPT_INIT) { 693 nvmm_cpu_synchronize_state(cpu); 694 do_cpu_init(x86_cpu); 695 /* set int/nmi windows back to the reset state */ 696 } 697 if (cpu->interrupt_request & CPU_INTERRUPT_POLL) { 698 cpu->interrupt_request &= ~CPU_INTERRUPT_POLL; 699 apic_poll_irq(x86_cpu->apic_state); 700 } 701 if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 702 (env->eflags & IF_MASK)) || 703 (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 704 cpu->halted = false; 705 } 706 if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) { 707 nvmm_cpu_synchronize_state(cpu); 708 do_cpu_sipi(x86_cpu); 709 } 710 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 711 cpu->interrupt_request &= ~CPU_INTERRUPT_TPR; 712 nvmm_cpu_synchronize_state(cpu); 713 apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip, 714 env->tpr_access_type); 715 } 716 717 if (cpu->halted) { 718 cpu->exception_index = EXCP_HLT; 719 qatomic_set(&cpu->exit_request, false); 720 return 0; 721 } 722 723 bql_unlock(); 724 cpu_exec_start(cpu); 725 726 /* 727 * Inner VCPU loop. 728 */ 729 do { 730 if (cpu->accel->dirty) { 731 nvmm_set_registers(cpu); 732 cpu->accel->dirty = false; 733 } 734 735 if (qcpu->stop) { 736 cpu->exception_index = EXCP_INTERRUPT; 737 qcpu->stop = false; 738 ret = 1; 739 break; 740 } 741 742 nvmm_vcpu_pre_run(cpu); 743 744 if (qatomic_read(&cpu->exit_request)) { 745 #if NVMM_USER_VERSION >= 2 746 nvmm_vcpu_stop(vcpu); 747 #else 748 qemu_cpu_kick_self(); 749 #endif 750 } 751 752 /* Read exit_request before the kernel reads the immediate exit flag */ 753 smp_rmb(); 754 ret = nvmm_vcpu_run(mach, vcpu); 755 if (ret == -1) { 756 error_report("NVMM: Failed to exec a virtual processor," 757 " error=%d", errno); 758 break; 759 } 760 761 nvmm_vcpu_post_run(cpu, exit); 762 763 switch (exit->reason) { 764 case NVMM_VCPU_EXIT_NONE: 765 break; 766 #if NVMM_USER_VERSION >= 2 767 case NVMM_VCPU_EXIT_STOPPED: 768 /* 769 * The kernel cleared the immediate exit flag; cpu->exit_request 770 * must be cleared after 771 */ 772 smp_wmb(); 773 qcpu->stop = true; 774 break; 775 #endif 776 case NVMM_VCPU_EXIT_MEMORY: 777 ret = nvmm_handle_mem(mach, vcpu); 778 break; 779 case NVMM_VCPU_EXIT_IO: 780 ret = nvmm_handle_io(mach, vcpu); 781 break; 782 case NVMM_VCPU_EXIT_INT_READY: 783 case NVMM_VCPU_EXIT_NMI_READY: 784 case NVMM_VCPU_EXIT_TPR_CHANGED: 785 break; 786 case NVMM_VCPU_EXIT_HALTED: 787 ret = nvmm_handle_halted(mach, cpu, exit); 788 break; 789 case NVMM_VCPU_EXIT_SHUTDOWN: 790 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 791 cpu->exception_index = EXCP_INTERRUPT; 792 ret = 1; 793 break; 794 case NVMM_VCPU_EXIT_RDMSR: 795 ret = nvmm_handle_rdmsr(mach, cpu, exit); 796 break; 797 case NVMM_VCPU_EXIT_WRMSR: 798 ret = nvmm_handle_wrmsr(mach, cpu, exit); 799 break; 800 case NVMM_VCPU_EXIT_MONITOR: 801 case NVMM_VCPU_EXIT_MWAIT: 802 ret = nvmm_inject_ud(mach, vcpu); 803 break; 804 default: 805 error_report("NVMM: Unexpected VM exit code 0x%lx [hw=0x%lx]", 806 exit->reason, exit->u.inv.hwcode); 807 nvmm_get_registers(cpu); 808 bql_lock(); 809 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 810 bql_unlock(); 811 ret = -1; 812 break; 813 } 814 } while (ret == 0); 815 816 cpu_exec_end(cpu); 817 bql_lock(); 818 819 qatomic_set(&cpu->exit_request, false); 820 821 return ret < 0; 822 } 823 824 /* -------------------------------------------------------------------------- */ 825 826 static void 827 do_nvmm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 828 { 829 nvmm_get_registers(cpu); 830 cpu->accel->dirty = true; 831 } 832 833 static void 834 do_nvmm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) 835 { 836 nvmm_set_registers(cpu); 837 cpu->accel->dirty = false; 838 } 839 840 static void 841 do_nvmm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) 842 { 843 nvmm_set_registers(cpu); 844 cpu->accel->dirty = false; 845 } 846 847 static void 848 do_nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) 849 { 850 cpu->accel->dirty = true; 851 } 852 853 void nvmm_cpu_synchronize_state(CPUState *cpu) 854 { 855 if (!cpu->accel->dirty) { 856 run_on_cpu(cpu, do_nvmm_cpu_synchronize_state, RUN_ON_CPU_NULL); 857 } 858 } 859 860 void nvmm_cpu_synchronize_post_reset(CPUState *cpu) 861 { 862 run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 863 } 864 865 void nvmm_cpu_synchronize_post_init(CPUState *cpu) 866 { 867 run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 868 } 869 870 void nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu) 871 { 872 run_on_cpu(cpu, do_nvmm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 873 } 874 875 /* -------------------------------------------------------------------------- */ 876 877 static Error *nvmm_migration_blocker; 878 879 /* 880 * The nvmm_vcpu_stop() mechanism breaks races between entering the VMM 881 * and another thread signaling the vCPU thread to exit. 882 */ 883 884 static void 885 nvmm_ipi_signal(int sigcpu) 886 { 887 if (current_cpu) { 888 AccelCPUState *qcpu = current_cpu->accel; 889 #if NVMM_USER_VERSION >= 2 890 struct nvmm_vcpu *vcpu = &qcpu->vcpu; 891 nvmm_vcpu_stop(vcpu); 892 #else 893 qcpu->stop = true; 894 #endif 895 } 896 } 897 898 static void 899 nvmm_init_cpu_signals(void) 900 { 901 struct sigaction sigact; 902 sigset_t set; 903 904 /* Install the IPI handler. */ 905 memset(&sigact, 0, sizeof(sigact)); 906 sigact.sa_handler = nvmm_ipi_signal; 907 sigaction(SIG_IPI, &sigact, NULL); 908 909 /* Allow IPIs on the current thread. */ 910 sigprocmask(SIG_BLOCK, NULL, &set); 911 sigdelset(&set, SIG_IPI); 912 pthread_sigmask(SIG_SETMASK, &set, NULL); 913 } 914 915 int 916 nvmm_init_vcpu(CPUState *cpu) 917 { 918 struct nvmm_machine *mach = get_nvmm_mach(); 919 struct nvmm_vcpu_conf_cpuid cpuid; 920 struct nvmm_vcpu_conf_tpr tpr; 921 Error *local_error = NULL; 922 AccelCPUState *qcpu; 923 int ret, err; 924 925 nvmm_init_cpu_signals(); 926 927 if (nvmm_migration_blocker == NULL) { 928 error_setg(&nvmm_migration_blocker, 929 "NVMM: Migration not supported"); 930 931 if (migrate_add_blocker(&nvmm_migration_blocker, &local_error) < 0) { 932 error_report_err(local_error); 933 return -EINVAL; 934 } 935 } 936 937 qcpu = g_new0(AccelCPUState, 1); 938 939 ret = nvmm_vcpu_create(mach, cpu->cpu_index, &qcpu->vcpu); 940 if (ret == -1) { 941 err = errno; 942 error_report("NVMM: Failed to create a virtual processor," 943 " error=%d", err); 944 g_free(qcpu); 945 return -err; 946 } 947 948 memset(&cpuid, 0, sizeof(cpuid)); 949 cpuid.mask = 1; 950 cpuid.leaf = 0x00000001; 951 cpuid.u.mask.set.edx = CPUID_MCE | CPUID_MCA | CPUID_MTRR; 952 ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CPUID, 953 &cpuid); 954 if (ret == -1) { 955 err = errno; 956 error_report("NVMM: Failed to configure a virtual processor," 957 " error=%d", err); 958 g_free(qcpu); 959 return -err; 960 } 961 962 ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CALLBACKS, 963 &nvmm_callbacks); 964 if (ret == -1) { 965 err = errno; 966 error_report("NVMM: Failed to configure a virtual processor," 967 " error=%d", err); 968 g_free(qcpu); 969 return -err; 970 } 971 972 if (qemu_mach.cap.arch.vcpu_conf_support & NVMM_CAP_ARCH_VCPU_CONF_TPR) { 973 memset(&tpr, 0, sizeof(tpr)); 974 tpr.exit_changed = 1; 975 ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_TPR, &tpr); 976 if (ret == -1) { 977 err = errno; 978 error_report("NVMM: Failed to configure a virtual processor," 979 " error=%d", err); 980 g_free(qcpu); 981 return -err; 982 } 983 } 984 985 cpu->accel->dirty = true; 986 cpu->accel = qcpu; 987 988 return 0; 989 } 990 991 int 992 nvmm_vcpu_exec(CPUState *cpu) 993 { 994 int ret, fatal; 995 996 while (1) { 997 if (cpu->exception_index >= EXCP_INTERRUPT) { 998 ret = cpu->exception_index; 999 cpu->exception_index = -1; 1000 break; 1001 } 1002 1003 fatal = nvmm_vcpu_loop(cpu); 1004 1005 if (fatal) { 1006 error_report("NVMM: Failed to execute a VCPU."); 1007 abort(); 1008 } 1009 } 1010 1011 return ret; 1012 } 1013 1014 void 1015 nvmm_destroy_vcpu(CPUState *cpu) 1016 { 1017 struct nvmm_machine *mach = get_nvmm_mach(); 1018 AccelCPUState *qcpu = cpu->accel; 1019 1020 nvmm_vcpu_destroy(mach, &qcpu->vcpu); 1021 g_free(cpu->accel); 1022 } 1023 1024 /* -------------------------------------------------------------------------- */ 1025 1026 static void 1027 nvmm_update_mapping(hwaddr start_pa, ram_addr_t size, uintptr_t hva, 1028 bool add, bool rom, const char *name) 1029 { 1030 struct nvmm_machine *mach = get_nvmm_mach(); 1031 int ret, prot; 1032 1033 if (add) { 1034 prot = PROT_READ | PROT_EXEC; 1035 if (!rom) { 1036 prot |= PROT_WRITE; 1037 } 1038 ret = nvmm_gpa_map(mach, hva, start_pa, size, prot); 1039 } else { 1040 ret = nvmm_gpa_unmap(mach, hva, start_pa, size); 1041 } 1042 1043 if (ret == -1) { 1044 error_report("NVMM: Failed to %s GPA range '%s' PA:%p, " 1045 "Size:%p bytes, HostVA:%p, error=%d", 1046 (add ? "map" : "unmap"), name, (void *)(uintptr_t)start_pa, 1047 (void *)size, (void *)hva, errno); 1048 } 1049 } 1050 1051 static void 1052 nvmm_process_section(MemoryRegionSection *section, int add) 1053 { 1054 MemoryRegion *mr = section->mr; 1055 hwaddr start_pa = section->offset_within_address_space; 1056 ram_addr_t size = int128_get64(section->size); 1057 unsigned int delta; 1058 uintptr_t hva; 1059 1060 if (!memory_region_is_ram(mr)) { 1061 return; 1062 } 1063 1064 /* Adjust start_pa and size so that they are page-aligned. */ 1065 delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask()); 1066 delta &= ~qemu_real_host_page_mask(); 1067 if (delta > size) { 1068 return; 1069 } 1070 start_pa += delta; 1071 size -= delta; 1072 size &= qemu_real_host_page_mask(); 1073 if (!size || (start_pa & ~qemu_real_host_page_mask())) { 1074 return; 1075 } 1076 1077 hva = (uintptr_t)memory_region_get_ram_ptr(mr) + 1078 section->offset_within_region + delta; 1079 1080 nvmm_update_mapping(start_pa, size, hva, add, 1081 memory_region_is_rom(mr), mr->name); 1082 } 1083 1084 static void 1085 nvmm_region_add(MemoryListener *listener, MemoryRegionSection *section) 1086 { 1087 memory_region_ref(section->mr); 1088 nvmm_process_section(section, 1); 1089 } 1090 1091 static void 1092 nvmm_region_del(MemoryListener *listener, MemoryRegionSection *section) 1093 { 1094 nvmm_process_section(section, 0); 1095 memory_region_unref(section->mr); 1096 } 1097 1098 static void 1099 nvmm_transaction_begin(MemoryListener *listener) 1100 { 1101 /* nothing */ 1102 } 1103 1104 static void 1105 nvmm_transaction_commit(MemoryListener *listener) 1106 { 1107 /* nothing */ 1108 } 1109 1110 static void 1111 nvmm_log_sync(MemoryListener *listener, MemoryRegionSection *section) 1112 { 1113 MemoryRegion *mr = section->mr; 1114 1115 if (!memory_region_is_ram(mr)) { 1116 return; 1117 } 1118 1119 memory_region_set_dirty(mr, 0, int128_get64(section->size)); 1120 } 1121 1122 static MemoryListener nvmm_memory_listener = { 1123 .name = "nvmm", 1124 .begin = nvmm_transaction_begin, 1125 .commit = nvmm_transaction_commit, 1126 .region_add = nvmm_region_add, 1127 .region_del = nvmm_region_del, 1128 .log_sync = nvmm_log_sync, 1129 .priority = MEMORY_LISTENER_PRIORITY_ACCEL, 1130 }; 1131 1132 static void 1133 nvmm_ram_block_added(RAMBlockNotifier *n, void *host, size_t size, 1134 size_t max_size) 1135 { 1136 struct nvmm_machine *mach = get_nvmm_mach(); 1137 uintptr_t hva = (uintptr_t)host; 1138 int ret; 1139 1140 ret = nvmm_hva_map(mach, hva, max_size); 1141 1142 if (ret == -1) { 1143 error_report("NVMM: Failed to map HVA, HostVA:%p " 1144 "Size:%p bytes, error=%d", 1145 (void *)hva, (void *)size, errno); 1146 } 1147 } 1148 1149 static struct RAMBlockNotifier nvmm_ram_notifier = { 1150 .ram_block_added = nvmm_ram_block_added 1151 }; 1152 1153 /* -------------------------------------------------------------------------- */ 1154 1155 static int 1156 nvmm_accel_init(MachineState *ms) 1157 { 1158 int ret, err; 1159 1160 ret = nvmm_init(); 1161 if (ret == -1) { 1162 err = errno; 1163 error_report("NVMM: Initialization failed, error=%d", errno); 1164 return -err; 1165 } 1166 1167 ret = nvmm_capability(&qemu_mach.cap); 1168 if (ret == -1) { 1169 err = errno; 1170 error_report("NVMM: Unable to fetch capability, error=%d", errno); 1171 return -err; 1172 } 1173 if (qemu_mach.cap.version < NVMM_KERN_VERSION) { 1174 error_report("NVMM: Unsupported version %u", qemu_mach.cap.version); 1175 return -EPROGMISMATCH; 1176 } 1177 if (qemu_mach.cap.state_size != sizeof(struct nvmm_x64_state)) { 1178 error_report("NVMM: Wrong state size %u", qemu_mach.cap.state_size); 1179 return -EPROGMISMATCH; 1180 } 1181 1182 ret = nvmm_machine_create(&qemu_mach.mach); 1183 if (ret == -1) { 1184 err = errno; 1185 error_report("NVMM: Machine creation failed, error=%d", errno); 1186 return -err; 1187 } 1188 1189 memory_listener_register(&nvmm_memory_listener, &address_space_memory); 1190 ram_block_notifier_add(&nvmm_ram_notifier); 1191 1192 printf("NetBSD Virtual Machine Monitor accelerator is operational\n"); 1193 return 0; 1194 } 1195 1196 int 1197 nvmm_enabled(void) 1198 { 1199 return nvmm_allowed; 1200 } 1201 1202 static void 1203 nvmm_accel_class_init(ObjectClass *oc, void *data) 1204 { 1205 AccelClass *ac = ACCEL_CLASS(oc); 1206 ac->name = "NVMM"; 1207 ac->init_machine = nvmm_accel_init; 1208 ac->allowed = &nvmm_allowed; 1209 } 1210 1211 static const TypeInfo nvmm_accel_type = { 1212 .name = ACCEL_CLASS_NAME("nvmm"), 1213 .parent = TYPE_ACCEL, 1214 .class_init = nvmm_accel_class_init, 1215 }; 1216 1217 static void 1218 nvmm_type_init(void) 1219 { 1220 type_register_static(&nvmm_accel_type); 1221 } 1222 1223 type_init(nvmm_type_init); 1224