/*
 * Copyright (c) 2018-2019 Maxime Villard, All rights reserved.
 *
 * NetBSD Virtual Machine Monitor (NVMM) accelerator for QEMU.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/address-spaces.h"
#include "exec/ioport.h"
#include "qemu/accel.h"
#include "sysemu/nvmm.h"
#include "sysemu/cpus.h"
#include "sysemu/runstate.h"
#include "qemu/main-loop.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qemu/queue.h"
#include "migration/blocker.h"
#include "strings.h"

#include "nvmm-accel-ops.h"

#include <nvmm.h>

struct AccelCPUState {
    struct nvmm_vcpu vcpu;
    uint8_t tpr;
    bool stop;

    /* Window-exiting for INTs/NMIs. */
    bool int_window_exit;
    bool nmi_window_exit;

    /* The guest is in an interrupt shadow (POP SS, etc). */
    bool int_shadow;
};

struct qemu_machine {
    struct nvmm_capability cap;
    struct nvmm_machine mach;
};

/* -------------------------------------------------------------------------- */

static bool nvmm_allowed;
static struct qemu_machine qemu_mach;

static struct nvmm_machine *
get_nvmm_mach(void)
{
    return &qemu_mach.mach;
}

/* -------------------------------------------------------------------------- */

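/*
 * __SHIFTOUT()/__SHIFTIN() below are NetBSD bit-field helpers: they extract
 * or insert a value according to a contiguous mask, shifting by the position
 * of the mask's lowest set bit.
 */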
static void
nvmm_set_segment(struct nvmm_x64_state_seg *nseg, const SegmentCache *qseg)
{
    uint32_t attrib = qseg->flags;

    nseg->selector = qseg->selector;
    nseg->limit = qseg->limit;
    nseg->base = qseg->base;
    nseg->attrib.type = __SHIFTOUT(attrib, DESC_TYPE_MASK);
    nseg->attrib.s = __SHIFTOUT(attrib, DESC_S_MASK);
    nseg->attrib.dpl = __SHIFTOUT(attrib, DESC_DPL_MASK);
    nseg->attrib.p = __SHIFTOUT(attrib, DESC_P_MASK);
    nseg->attrib.avl = __SHIFTOUT(attrib, DESC_AVL_MASK);
    nseg->attrib.l = __SHIFTOUT(attrib, DESC_L_MASK);
    nseg->attrib.def = __SHIFTOUT(attrib, DESC_B_MASK);
    nseg->attrib.g = __SHIFTOUT(attrib, DESC_G_MASK);
}

static void
nvmm_set_registers(CPUState *cpu)
{
    CPUX86State *env = cpu_env(cpu);
    struct nvmm_machine *mach = get_nvmm_mach();
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    struct nvmm_x64_state *state = vcpu->state;
    uint64_t bitmap;
    size_t i;
    int ret;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /* GPRs. */
    state->gprs[NVMM_X64_GPR_RAX] = env->regs[R_EAX];
    state->gprs[NVMM_X64_GPR_RCX] = env->regs[R_ECX];
    state->gprs[NVMM_X64_GPR_RDX] = env->regs[R_EDX];
    state->gprs[NVMM_X64_GPR_RBX] = env->regs[R_EBX];
    state->gprs[NVMM_X64_GPR_RSP] = env->regs[R_ESP];
    state->gprs[NVMM_X64_GPR_RBP] = env->regs[R_EBP];
    state->gprs[NVMM_X64_GPR_RSI] = env->regs[R_ESI];
    state->gprs[NVMM_X64_GPR_RDI] = env->regs[R_EDI];
#ifdef TARGET_X86_64
    state->gprs[NVMM_X64_GPR_R8] = env->regs[R_R8];
    state->gprs[NVMM_X64_GPR_R9] = env->regs[R_R9];
    state->gprs[NVMM_X64_GPR_R10] = env->regs[R_R10];
    state->gprs[NVMM_X64_GPR_R11] = env->regs[R_R11];
    state->gprs[NVMM_X64_GPR_R12] = env->regs[R_R12];
    state->gprs[NVMM_X64_GPR_R13] = env->regs[R_R13];
    state->gprs[NVMM_X64_GPR_R14] = env->regs[R_R14];
    state->gprs[NVMM_X64_GPR_R15] = env->regs[R_R15];
#endif

    /* RIP and RFLAGS. */
    state->gprs[NVMM_X64_GPR_RIP] = env->eip;
    state->gprs[NVMM_X64_GPR_RFLAGS] = env->eflags;

    /* Segments. */
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_CS], &env->segs[R_CS]);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_DS], &env->segs[R_DS]);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_ES], &env->segs[R_ES]);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_FS], &env->segs[R_FS]);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_GS], &env->segs[R_GS]);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_SS], &env->segs[R_SS]);

    /* Special segments. */
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_GDT], &env->gdt);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_LDT], &env->ldt);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_TR], &env->tr);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_IDT], &env->idt);

    /* Control registers. */
    state->crs[NVMM_X64_CR_CR0] = env->cr[0];
    state->crs[NVMM_X64_CR_CR2] = env->cr[2];
    state->crs[NVMM_X64_CR_CR3] = env->cr[3];
    state->crs[NVMM_X64_CR_CR4] = env->cr[4];
    state->crs[NVMM_X64_CR_CR8] = qcpu->tpr;
    state->crs[NVMM_X64_CR_XCR0] = env->xcr0;

    /* Debug registers. */
    state->drs[NVMM_X64_DR_DR0] = env->dr[0];
    state->drs[NVMM_X64_DR_DR1] = env->dr[1];
    state->drs[NVMM_X64_DR_DR2] = env->dr[2];
    state->drs[NVMM_X64_DR_DR3] = env->dr[3];
    state->drs[NVMM_X64_DR_DR6] = env->dr[6];
    state->drs[NVMM_X64_DR_DR7] = env->dr[7];

    /*
     * FPU.
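     * QEMU keeps the x87 TOP pointer in env->fpstt and one byte per register
     * in env->fptags (1 = empty), whereas the FXSAVE layout wants TOP in
     * bits 11:13 of the status word and an abridged tag word where a set bit
     * means "valid", hence the shifting and inversion below.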
     */
    state->fpu.fx_cw = env->fpuc;
    state->fpu.fx_sw = (env->fpus & ~0x3800) | ((env->fpstt & 0x7) << 11);
    state->fpu.fx_tw = 0;
    for (i = 0; i < 8; i++) {
        state->fpu.fx_tw |= (!env->fptags[i]) << i;
    }
    state->fpu.fx_opcode = env->fpop;
    state->fpu.fx_ip.fa_64 = env->fpip;
    state->fpu.fx_dp.fa_64 = env->fpdp;
    state->fpu.fx_mxcsr = env->mxcsr;
    state->fpu.fx_mxcsr_mask = 0x0000FFFF;
    assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs));
    memcpy(state->fpu.fx_87_ac, env->fpregs, sizeof(env->fpregs));
    for (i = 0; i < CPU_NB_REGS; i++) {
        memcpy(&state->fpu.fx_xmm[i].xmm_bytes[0],
            &env->xmm_regs[i].ZMM_Q(0), 8);
        memcpy(&state->fpu.fx_xmm[i].xmm_bytes[8],
            &env->xmm_regs[i].ZMM_Q(1), 8);
    }

    /* MSRs. */
    state->msrs[NVMM_X64_MSR_EFER] = env->efer;
    state->msrs[NVMM_X64_MSR_STAR] = env->star;
#ifdef TARGET_X86_64
    state->msrs[NVMM_X64_MSR_LSTAR] = env->lstar;
    state->msrs[NVMM_X64_MSR_CSTAR] = env->cstar;
    state->msrs[NVMM_X64_MSR_SFMASK] = env->fmask;
    state->msrs[NVMM_X64_MSR_KERNELGSBASE] = env->kernelgsbase;
#endif
    state->msrs[NVMM_X64_MSR_SYSENTER_CS] = env->sysenter_cs;
    state->msrs[NVMM_X64_MSR_SYSENTER_ESP] = env->sysenter_esp;
    state->msrs[NVMM_X64_MSR_SYSENTER_EIP] = env->sysenter_eip;
    state->msrs[NVMM_X64_MSR_PAT] = env->pat;
    state->msrs[NVMM_X64_MSR_TSC] = env->tsc;

    bitmap =
        NVMM_X64_STATE_SEGS |
        NVMM_X64_STATE_GPRS |
        NVMM_X64_STATE_CRS |
        NVMM_X64_STATE_DRS |
        NVMM_X64_STATE_MSRS |
        NVMM_X64_STATE_FPU;

    ret = nvmm_vcpu_setstate(mach, vcpu, bitmap);
    if (ret == -1) {
        error_report("NVMM: Failed to set virtual processor context,"
            " error=%d", errno);
    }
}

static void
nvmm_get_segment(SegmentCache *qseg, const struct nvmm_x64_state_seg *nseg)
{
    qseg->selector = nseg->selector;
    qseg->limit = nseg->limit;
    qseg->base = nseg->base;

    qseg->flags =
        __SHIFTIN((uint32_t)nseg->attrib.type, DESC_TYPE_MASK) |
        __SHIFTIN((uint32_t)nseg->attrib.s, DESC_S_MASK) |
        __SHIFTIN((uint32_t)nseg->attrib.dpl, DESC_DPL_MASK) |
        __SHIFTIN((uint32_t)nseg->attrib.p, DESC_P_MASK) |
        __SHIFTIN((uint32_t)nseg->attrib.avl, DESC_AVL_MASK) |
        __SHIFTIN((uint32_t)nseg->attrib.l, DESC_L_MASK) |
        __SHIFTIN((uint32_t)nseg->attrib.def, DESC_B_MASK) |
        __SHIFTIN((uint32_t)nseg->attrib.g, DESC_G_MASK);
}

static void
nvmm_get_registers(CPUState *cpu)
{
    CPUX86State *env = cpu_env(cpu);
    struct nvmm_machine *mach = get_nvmm_mach();
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct nvmm_x64_state *state = vcpu->state;
    uint64_t bitmap, tpr;
    size_t i;
    int ret;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    bitmap =
        NVMM_X64_STATE_SEGS |
        NVMM_X64_STATE_GPRS |
        NVMM_X64_STATE_CRS |
        NVMM_X64_STATE_DRS |
        NVMM_X64_STATE_MSRS |
        NVMM_X64_STATE_FPU;

    ret = nvmm_vcpu_getstate(mach, vcpu, bitmap);
    if (ret == -1) {
        error_report("NVMM: Failed to get virtual processor context,"
            " error=%d", errno);
    }

    /* GPRs. */
    env->regs[R_EAX] = state->gprs[NVMM_X64_GPR_RAX];
    env->regs[R_ECX] = state->gprs[NVMM_X64_GPR_RCX];
    env->regs[R_EDX] = state->gprs[NVMM_X64_GPR_RDX];
    env->regs[R_EBX] = state->gprs[NVMM_X64_GPR_RBX];
    env->regs[R_ESP] = state->gprs[NVMM_X64_GPR_RSP];
    env->regs[R_EBP] = state->gprs[NVMM_X64_GPR_RBP];
    env->regs[R_ESI] = state->gprs[NVMM_X64_GPR_RSI];
    env->regs[R_EDI] = state->gprs[NVMM_X64_GPR_RDI];
#ifdef TARGET_X86_64
    env->regs[R_R8] = state->gprs[NVMM_X64_GPR_R8];
    env->regs[R_R9] = state->gprs[NVMM_X64_GPR_R9];
    env->regs[R_R10] = state->gprs[NVMM_X64_GPR_R10];
    env->regs[R_R11] = state->gprs[NVMM_X64_GPR_R11];
    env->regs[R_R12] = state->gprs[NVMM_X64_GPR_R12];
    env->regs[R_R13] = state->gprs[NVMM_X64_GPR_R13];
    env->regs[R_R14] = state->gprs[NVMM_X64_GPR_R14];
    env->regs[R_R15] = state->gprs[NVMM_X64_GPR_R15];
#endif

    /* RIP and RFLAGS. */
    env->eip = state->gprs[NVMM_X64_GPR_RIP];
    env->eflags = state->gprs[NVMM_X64_GPR_RFLAGS];

    /* Segments. */
    nvmm_get_segment(&env->segs[R_ES], &state->segs[NVMM_X64_SEG_ES]);
    nvmm_get_segment(&env->segs[R_CS], &state->segs[NVMM_X64_SEG_CS]);
    nvmm_get_segment(&env->segs[R_SS], &state->segs[NVMM_X64_SEG_SS]);
    nvmm_get_segment(&env->segs[R_DS], &state->segs[NVMM_X64_SEG_DS]);
    nvmm_get_segment(&env->segs[R_FS], &state->segs[NVMM_X64_SEG_FS]);
    nvmm_get_segment(&env->segs[R_GS], &state->segs[NVMM_X64_SEG_GS]);

    /* Special segments. */
    nvmm_get_segment(&env->gdt, &state->segs[NVMM_X64_SEG_GDT]);
    nvmm_get_segment(&env->ldt, &state->segs[NVMM_X64_SEG_LDT]);
    nvmm_get_segment(&env->tr, &state->segs[NVMM_X64_SEG_TR]);
    nvmm_get_segment(&env->idt, &state->segs[NVMM_X64_SEG_IDT]);

    /* Control registers. */
    env->cr[0] = state->crs[NVMM_X64_CR_CR0];
    env->cr[2] = state->crs[NVMM_X64_CR_CR2];
    env->cr[3] = state->crs[NVMM_X64_CR_CR3];
    env->cr[4] = state->crs[NVMM_X64_CR_CR4];
    tpr = state->crs[NVMM_X64_CR_CR8];
    if (tpr != qcpu->tpr) {
        qcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, tpr);
    }
    env->xcr0 = state->crs[NVMM_X64_CR_XCR0];

    /* Debug registers. */
    env->dr[0] = state->drs[NVMM_X64_DR_DR0];
    env->dr[1] = state->drs[NVMM_X64_DR_DR1];
    env->dr[2] = state->drs[NVMM_X64_DR_DR2];
    env->dr[3] = state->drs[NVMM_X64_DR_DR3];
    env->dr[6] = state->drs[NVMM_X64_DR_DR6];
    env->dr[7] = state->drs[NVMM_X64_DR_DR7];

    /* FPU. */
    env->fpuc = state->fpu.fx_cw;
    env->fpstt = (state->fpu.fx_sw >> 11) & 0x7;
    env->fpus = state->fpu.fx_sw & ~0x3800;
    for (i = 0; i < 8; i++) {
        env->fptags[i] = !((state->fpu.fx_tw >> i) & 1);
    }
    env->fpop = state->fpu.fx_opcode;
    env->fpip = state->fpu.fx_ip.fa_64;
    env->fpdp = state->fpu.fx_dp.fa_64;
    env->mxcsr = state->fpu.fx_mxcsr;
    assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs));
    memcpy(env->fpregs, state->fpu.fx_87_ac, sizeof(env->fpregs));
    for (i = 0; i < CPU_NB_REGS; i++) {
        memcpy(&env->xmm_regs[i].ZMM_Q(0),
            &state->fpu.fx_xmm[i].xmm_bytes[0], 8);
        memcpy(&env->xmm_regs[i].ZMM_Q(1),
            &state->fpu.fx_xmm[i].xmm_bytes[8], 8);
    }

    /* MSRs. */
    env->efer = state->msrs[NVMM_X64_MSR_EFER];
    env->star = state->msrs[NVMM_X64_MSR_STAR];
#ifdef TARGET_X86_64
    env->lstar = state->msrs[NVMM_X64_MSR_LSTAR];
    env->cstar = state->msrs[NVMM_X64_MSR_CSTAR];
    env->fmask = state->msrs[NVMM_X64_MSR_SFMASK];
    env->kernelgsbase = state->msrs[NVMM_X64_MSR_KERNELGSBASE];
#endif
    env->sysenter_cs = state->msrs[NVMM_X64_MSR_SYSENTER_CS];
    env->sysenter_esp = state->msrs[NVMM_X64_MSR_SYSENTER_ESP];
    env->sysenter_eip = state->msrs[NVMM_X64_MSR_SYSENTER_EIP];
    env->pat = state->msrs[NVMM_X64_MSR_PAT];
    env->tsc = state->msrs[NVMM_X64_MSR_TSC];

    x86_update_hflags(env);
}

static bool
nvmm_can_take_int(CPUState *cpu)
{
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    struct nvmm_machine *mach = get_nvmm_mach();

    if (qcpu->int_window_exit) {
        return false;
    }

    if (qcpu->int_shadow || !(cpu_env(cpu)->eflags & IF_MASK)) {
        struct nvmm_x64_state *state = vcpu->state;

        /*
         * Exit on interrupt window.
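         * The event cannot be injected yet (interrupt shadow, or IF is
         * clear), so ask NVMM to exit as soon as the window opens again;
         * injection is then retried from nvmm_vcpu_pre_run().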
         */
        nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_INTR);
        state->intr.int_window_exiting = 1;
        nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_INTR);

        return false;
    }

    return true;
}

static bool
nvmm_can_take_nmi(CPUState *cpu)
{
    AccelCPUState *qcpu = cpu->accel;

    /*
     * Contrary to INTs, NMIs always schedule an exit when they are
     * completed. Therefore, if window-exiting is enabled, it means
     * NMIs are blocked.
     */
    if (qcpu->nmi_window_exit) {
        return false;
    }

    return true;
}

/*
 * Called before the VCPU is run. We inject events generated by the I/O
 * thread, and synchronize the guest TPR.
 */
static void
nvmm_vcpu_pre_run(CPUState *cpu)
{
    CPUX86State *env = cpu_env(cpu);
    struct nvmm_machine *mach = get_nvmm_mach();
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct nvmm_x64_state *state = vcpu->state;
    struct nvmm_vcpu_event *event = vcpu->event;
    bool has_event = false;
    bool sync_tpr = false;
    uint8_t tpr;
    int ret;

    bql_lock();

    tpr = cpu_get_apic_tpr(x86_cpu->apic_state);
    if (tpr != qcpu->tpr) {
        qcpu->tpr = tpr;
        sync_tpr = true;
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests
     * or commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        cpu->exit_request = 1;
    }

    if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        if (nvmm_can_take_nmi(cpu)) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            event->type = NVMM_VCPU_EVENT_INTR;
            event->vector = 2;
            has_event = true;
        }
    }

    if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        if (nvmm_can_take_int(cpu)) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
            event->type = NVMM_VCPU_EVENT_INTR;
            event->vector = cpu_get_pic_interrupt(env);
            has_event = true;
        }
    }

    /* Don't want SMIs. */
    if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
    }

    if (sync_tpr) {
        ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_CRS);
        if (ret == -1) {
            error_report("NVMM: Failed to get CPU state,"
                " error=%d", errno);
        }

        state->crs[NVMM_X64_CR_CR8] = qcpu->tpr;

        ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_CRS);
        if (ret == -1) {
            error_report("NVMM: Failed to set CPU state,"
                " error=%d", errno);
        }
    }

    if (has_event) {
        ret = nvmm_vcpu_inject(mach, vcpu);
        if (ret == -1) {
            error_report("NVMM: Failed to inject event,"
                " error=%d", errno);
        }
    }

    bql_unlock();
}

/*
 * Called after the VCPU ran. We synchronize the host view of the TPR and
 * RFLAGS.
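 * These are taken from the "exitstate" snapshot NVMM provides with each
 * exit, so no extra nvmm_vcpu_getstate() round-trip is needed here.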
 */
static void
nvmm_vcpu_post_run(CPUState *cpu, struct nvmm_vcpu_exit *exit)
{
    AccelCPUState *qcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    uint64_t tpr;

    env->eflags = exit->exitstate.rflags;
    qcpu->int_shadow = exit->exitstate.int_shadow;
    qcpu->int_window_exit = exit->exitstate.int_window_exiting;
    qcpu->nmi_window_exit = exit->exitstate.nmi_window_exiting;

    tpr = exit->exitstate.cr8;
    if (qcpu->tpr != tpr) {
        qcpu->tpr = tpr;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, qcpu->tpr);
        bql_unlock();
    }
}

/* -------------------------------------------------------------------------- */

static void
nvmm_io_callback(struct nvmm_io *io)
{
    MemTxAttrs attrs = { 0 };
    int ret;

    ret = address_space_rw(&address_space_io, io->port, attrs, io->data,
        io->size, !io->in);
    if (ret != MEMTX_OK) {
        error_report("NVMM: I/O Transaction Failed "
            "[%s, port=%u, size=%zu]", (io->in ? "in" : "out"),
            io->port, io->size);
    }

    /* Needed, otherwise infinite loop. */
    current_cpu->vcpu_dirty = false;
}

static void
nvmm_mem_callback(struct nvmm_mem *mem)
{
    cpu_physical_memory_rw(mem->gpa, mem->data, mem->size, mem->write);

    /* Needed, otherwise infinite loop. */
    current_cpu->vcpu_dirty = false;
}

static struct nvmm_assist_callbacks nvmm_callbacks = {
    .io = nvmm_io_callback,
    .mem = nvmm_mem_callback
};

/* -------------------------------------------------------------------------- */

static int
nvmm_handle_mem(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
{
    int ret;

    ret = nvmm_assist_mem(mach, vcpu);
    if (ret == -1) {
        error_report("NVMM: Mem Assist Failed [gpa=%p]",
            (void *)vcpu->exit->u.mem.gpa);
    }

    return ret;
}

static int
nvmm_handle_io(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
{
    int ret;

    ret = nvmm_assist_io(mach, vcpu);
    if (ret == -1) {
        error_report("NVMM: I/O Assist Failed [port=%d]",
            (int)vcpu->exit->u.io.port);
    }

    return ret;
}

static int
nvmm_handle_rdmsr(struct nvmm_machine *mach, CPUState *cpu,
    struct nvmm_vcpu_exit *exit)
{
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct nvmm_x64_state *state = vcpu->state;
    uint64_t val;
    int ret;

    switch (exit->u.rdmsr.msr) {
    case MSR_IA32_APICBASE:
        val = cpu_get_apic_base(x86_cpu->apic_state);
        break;
    case MSR_MTRRcap:
    case MSR_MTRRdefType:
    case MSR_MCG_CAP:
    case MSR_MCG_STATUS:
        val = 0;
        break;
    default: /* More MSRs to add? */
        val = 0;
        error_report("NVMM: Unexpected RDMSR 0x%x, ignored",
            exit->u.rdmsr.msr);
        break;
    }

    ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS);
    if (ret == -1) {
        return -1;
    }

    state->gprs[NVMM_X64_GPR_RAX] = (val & 0xFFFFFFFF);
    state->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
    state->gprs[NVMM_X64_GPR_RIP] = exit->u.rdmsr.npc;

    ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS);
    if (ret == -1) {
        return -1;
    }

    return 0;
}

static int
nvmm_handle_wrmsr(struct nvmm_machine *mach, CPUState *cpu,
    struct nvmm_vcpu_exit *exit)
{
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct nvmm_x64_state *state = vcpu->state;
    uint64_t val;
    int ret;

    val = exit->u.wrmsr.val;

    switch (exit->u.wrmsr.msr) {
    case MSR_IA32_APICBASE:
        cpu_set_apic_base(x86_cpu->apic_state, val);
        break;
    case MSR_MTRRdefType:
    case MSR_MCG_STATUS:
        break;
    default: /* More MSRs to add? */
        error_report("NVMM: Unexpected WRMSR 0x%x [val=0x%lx], ignored",
            exit->u.wrmsr.msr, val);
        break;
    }

    ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS);
    if (ret == -1) {
        return -1;
    }

    state->gprs[NVMM_X64_GPR_RIP] = exit->u.wrmsr.npc;

    ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS);
    if (ret == -1) {
        return -1;
    }

    return 0;
}

static int
nvmm_handle_halted(struct nvmm_machine *mach, CPUState *cpu,
    struct nvmm_vcpu_exit *exit)
{
    int ret = 0;

    bql_lock();

    if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
          (cpu_env(cpu)->eflags & IF_MASK)) &&
        !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }

    bql_unlock();

    return ret;
}

static int
nvmm_inject_ud(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
{
    struct nvmm_vcpu_event *event = vcpu->event;

    event->type = NVMM_VCPU_EVENT_EXCP;
    event->vector = 6;
    event->u.excp.error = 0;

    return nvmm_vcpu_inject(mach, vcpu);
}

static int
nvmm_vcpu_loop(CPUState *cpu)
{
    struct nvmm_machine *mach = get_nvmm_mach();
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct nvmm_vcpu_exit *exit = vcpu->exit;
    int ret;

    /*
     * Some asynchronous events must be handled outside of the inner
     * VCPU loop. They are handled here.
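     * This code still runs with the BQL held; the lock is only dropped
     * below, right before entering the inner loop.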
     */
    if (cpu->interrupt_request & CPU_INTERRUPT_INIT) {
        nvmm_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        /* set int/nmi windows back to the reset state */
    }
    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }
    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        nvmm_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }
    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        nvmm_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
            env->tpr_access_type);
    }

    if (cpu->halted) {
        cpu->exception_index = EXCP_HLT;
        qatomic_set(&cpu->exit_request, false);
        return 0;
    }

    bql_unlock();
    cpu_exec_start(cpu);

    /*
     * Inner VCPU loop.
     */
    do {
        if (cpu->vcpu_dirty) {
            nvmm_set_registers(cpu);
            cpu->vcpu_dirty = false;
        }

        if (qcpu->stop) {
            cpu->exception_index = EXCP_INTERRUPT;
            qcpu->stop = false;
            ret = 1;
            break;
        }

        nvmm_vcpu_pre_run(cpu);

        if (qatomic_read(&cpu->exit_request)) {
#if NVMM_USER_VERSION >= 2
            nvmm_vcpu_stop(vcpu);
#else
            qemu_cpu_kick_self();
#endif
        }

        /* Read exit_request before the kernel reads the immediate exit flag */
        smp_rmb();
        ret = nvmm_vcpu_run(mach, vcpu);
        if (ret == -1) {
            error_report("NVMM: Failed to exec a virtual processor,"
                " error=%d", errno);
            break;
        }

        nvmm_vcpu_post_run(cpu, exit);

        switch (exit->reason) {
        case NVMM_VCPU_EXIT_NONE:
            break;
#if NVMM_USER_VERSION >= 2
        case NVMM_VCPU_EXIT_STOPPED:
            /*
             * The kernel cleared the immediate exit flag; cpu->exit_request
             * must be cleared after
             */
            smp_wmb();
            qcpu->stop = true;
            break;
#endif
        case NVMM_VCPU_EXIT_MEMORY:
            ret = nvmm_handle_mem(mach, vcpu);
            break;
        case NVMM_VCPU_EXIT_IO:
            ret = nvmm_handle_io(mach, vcpu);
            break;
        case NVMM_VCPU_EXIT_INT_READY:
        case NVMM_VCPU_EXIT_NMI_READY:
        case NVMM_VCPU_EXIT_TPR_CHANGED:
            break;
        case NVMM_VCPU_EXIT_HALTED:
            ret = nvmm_handle_halted(mach, cpu, exit);
            break;
        case NVMM_VCPU_EXIT_SHUTDOWN:
            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
            cpu->exception_index = EXCP_INTERRUPT;
            ret = 1;
            break;
        case NVMM_VCPU_EXIT_RDMSR:
            ret = nvmm_handle_rdmsr(mach, cpu, exit);
            break;
        case NVMM_VCPU_EXIT_WRMSR:
            ret = nvmm_handle_wrmsr(mach, cpu, exit);
            break;
        case NVMM_VCPU_EXIT_MONITOR:
        case NVMM_VCPU_EXIT_MWAIT:
            ret = nvmm_inject_ud(mach, vcpu);
            break;
        default:
            error_report("NVMM: Unexpected VM exit code 0x%lx [hw=0x%lx]",
                exit->reason, exit->u.inv.hwcode);
            nvmm_get_registers(cpu);
            bql_lock();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            bql_unlock();
            ret = -1;
            break;
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    bql_lock();

    qatomic_set(&cpu->exit_request, false);

    return ret < 0;
}

/* -------------------------------------------------------------------------- */

static void
do_nvmm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    nvmm_get_registers(cpu);
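    /*
     * Mark the vCPU dirty so that any changes QEMU now makes to env are
     * written back by nvmm_set_registers() before the next run.
     */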
    cpu->vcpu_dirty = true;
}

static void
do_nvmm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
{
    nvmm_set_registers(cpu);
    cpu->vcpu_dirty = false;
}

static void
do_nvmm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
{
    nvmm_set_registers(cpu);
    cpu->vcpu_dirty = false;
}

static void
do_nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}

void nvmm_cpu_synchronize_state(CPUState *cpu)
{
    if (!cpu->vcpu_dirty) {
        run_on_cpu(cpu, do_nvmm_cpu_synchronize_state, RUN_ON_CPU_NULL);
    }
}

void nvmm_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}

void nvmm_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}

void nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_nvmm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}

/* -------------------------------------------------------------------------- */

static Error *nvmm_migration_blocker;

/*
 * The nvmm_vcpu_stop() mechanism breaks races between entering the VMM
 * and another thread signaling the vCPU thread to exit.
 */

static void
nvmm_ipi_signal(int sigcpu)
{
    if (current_cpu) {
        AccelCPUState *qcpu = current_cpu->accel;
#if NVMM_USER_VERSION >= 2
        struct nvmm_vcpu *vcpu = &qcpu->vcpu;
        nvmm_vcpu_stop(vcpu);
#else
        qcpu->stop = true;
#endif
    }
}

static void
nvmm_init_cpu_signals(void)
{
    struct sigaction sigact;
    sigset_t set;

    /* Install the IPI handler. */
    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = nvmm_ipi_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    /*
     * Allow IPIs on the current thread.
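     * SIG_IPI must stay unblocked here so that a kick from another thread
     * can interrupt this vCPU thread while it is running the guest.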
     */
    sigprocmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
}

int
nvmm_init_vcpu(CPUState *cpu)
{
    struct nvmm_machine *mach = get_nvmm_mach();
    struct nvmm_vcpu_conf_cpuid cpuid;
    struct nvmm_vcpu_conf_tpr tpr;
    Error *local_error = NULL;
    AccelCPUState *qcpu;
    int ret, err;

    nvmm_init_cpu_signals();

    if (nvmm_migration_blocker == NULL) {
        error_setg(&nvmm_migration_blocker,
            "NVMM: Migration not supported");

        if (migrate_add_blocker(&nvmm_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            return -EINVAL;
        }
    }

    qcpu = g_new0(AccelCPUState, 1);

    ret = nvmm_vcpu_create(mach, cpu->cpu_index, &qcpu->vcpu);
    if (ret == -1) {
        err = errno;
        error_report("NVMM: Failed to create a virtual processor,"
            " error=%d", err);
        g_free(qcpu);
        return -err;
    }

    memset(&cpuid, 0, sizeof(cpuid));
    cpuid.mask = 1;
    cpuid.leaf = 0x00000001;
    cpuid.u.mask.set.edx = CPUID_MCE | CPUID_MCA | CPUID_MTRR;
    ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CPUID,
        &cpuid);
    if (ret == -1) {
        err = errno;
        error_report("NVMM: Failed to configure a virtual processor,"
            " error=%d", err);
        g_free(qcpu);
        return -err;
    }

    ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CALLBACKS,
        &nvmm_callbacks);
    if (ret == -1) {
        err = errno;
        error_report("NVMM: Failed to configure a virtual processor,"
            " error=%d", err);
        g_free(qcpu);
        return -err;
    }

    if (qemu_mach.cap.arch.vcpu_conf_support & NVMM_CAP_ARCH_VCPU_CONF_TPR) {
        memset(&tpr, 0, sizeof(tpr));
        tpr.exit_changed = 1;
        ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_TPR, &tpr);
        if (ret == -1) {
            err = errno;
            error_report("NVMM: Failed to configure a virtual processor,"
                " error=%d", err);
            g_free(qcpu);
            return -err;
        }
    }

    cpu->vcpu_dirty = true;
    cpu->accel = qcpu;

    return 0;
}

int
nvmm_vcpu_exec(CPUState *cpu)
{
    int ret, fatal;

    while (1) {
        if (cpu->exception_index >= EXCP_INTERRUPT) {
            ret = cpu->exception_index;
            cpu->exception_index = -1;
            break;
        }

        fatal = nvmm_vcpu_loop(cpu);

        if (fatal) {
            error_report("NVMM: Failed to execute a VCPU.");
            abort();
        }
    }

    return ret;
}

void
nvmm_destroy_vcpu(CPUState *cpu)
{
    struct nvmm_machine *mach = get_nvmm_mach();
    AccelCPUState *qcpu = cpu->accel;

    nvmm_vcpu_destroy(mach, &qcpu->vcpu);
    g_free(cpu->accel);
}

/* -------------------------------------------------------------------------- */

static void
nvmm_update_mapping(hwaddr start_pa, ram_addr_t size, uintptr_t hva,
    bool add, bool rom, const char *name)
{
    struct nvmm_machine *mach = get_nvmm_mach();
    int ret, prot;

    if (add) {
        prot = PROT_READ | PROT_EXEC;
        if (!rom) {
            prot |= PROT_WRITE;
        }
        ret = nvmm_gpa_map(mach, hva, start_pa, size, prot);
    } else {
        ret = nvmm_gpa_unmap(mach, hva, start_pa, size);
    }

    if (ret == -1) {
"map" : "unmap"), name, (void *)(uintptr_t)start_pa, 1046 (void *)size, (void *)hva, errno); 1047 } 1048 } 1049 1050 static void 1051 nvmm_process_section(MemoryRegionSection *section, int add) 1052 { 1053 MemoryRegion *mr = section->mr; 1054 hwaddr start_pa = section->offset_within_address_space; 1055 ram_addr_t size = int128_get64(section->size); 1056 unsigned int delta; 1057 uintptr_t hva; 1058 1059 if (!memory_region_is_ram(mr)) { 1060 return; 1061 } 1062 1063 /* Adjust start_pa and size so that they are page-aligned. */ 1064 delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask()); 1065 delta &= ~qemu_real_host_page_mask(); 1066 if (delta > size) { 1067 return; 1068 } 1069 start_pa += delta; 1070 size -= delta; 1071 size &= qemu_real_host_page_mask(); 1072 if (!size || (start_pa & ~qemu_real_host_page_mask())) { 1073 return; 1074 } 1075 1076 hva = (uintptr_t)memory_region_get_ram_ptr(mr) + 1077 section->offset_within_region + delta; 1078 1079 nvmm_update_mapping(start_pa, size, hva, add, 1080 memory_region_is_rom(mr), mr->name); 1081 } 1082 1083 static void 1084 nvmm_region_add(MemoryListener *listener, MemoryRegionSection *section) 1085 { 1086 memory_region_ref(section->mr); 1087 nvmm_process_section(section, 1); 1088 } 1089 1090 static void 1091 nvmm_region_del(MemoryListener *listener, MemoryRegionSection *section) 1092 { 1093 nvmm_process_section(section, 0); 1094 memory_region_unref(section->mr); 1095 } 1096 1097 static void 1098 nvmm_transaction_begin(MemoryListener *listener) 1099 { 1100 /* nothing */ 1101 } 1102 1103 static void 1104 nvmm_transaction_commit(MemoryListener *listener) 1105 { 1106 /* nothing */ 1107 } 1108 1109 static void 1110 nvmm_log_sync(MemoryListener *listener, MemoryRegionSection *section) 1111 { 1112 MemoryRegion *mr = section->mr; 1113 1114 if (!memory_region_is_ram(mr)) { 1115 return; 1116 } 1117 1118 memory_region_set_dirty(mr, 0, int128_get64(section->size)); 1119 } 1120 1121 static MemoryListener nvmm_memory_listener = { 1122 .name = "nvmm", 1123 .begin = nvmm_transaction_begin, 1124 .commit = nvmm_transaction_commit, 1125 .region_add = nvmm_region_add, 1126 .region_del = nvmm_region_del, 1127 .log_sync = nvmm_log_sync, 1128 .priority = MEMORY_LISTENER_PRIORITY_ACCEL, 1129 }; 1130 1131 static void 1132 nvmm_ram_block_added(RAMBlockNotifier *n, void *host, size_t size, 1133 size_t max_size) 1134 { 1135 struct nvmm_machine *mach = get_nvmm_mach(); 1136 uintptr_t hva = (uintptr_t)host; 1137 int ret; 1138 1139 ret = nvmm_hva_map(mach, hva, max_size); 1140 1141 if (ret == -1) { 1142 error_report("NVMM: Failed to map HVA, HostVA:%p " 1143 "Size:%p bytes, error=%d", 1144 (void *)hva, (void *)size, errno); 1145 } 1146 } 1147 1148 static struct RAMBlockNotifier nvmm_ram_notifier = { 1149 .ram_block_added = nvmm_ram_block_added 1150 }; 1151 1152 /* -------------------------------------------------------------------------- */ 1153 1154 static int 1155 nvmm_accel_init(MachineState *ms) 1156 { 1157 int ret, err; 1158 1159 ret = nvmm_init(); 1160 if (ret == -1) { 1161 err = errno; 1162 error_report("NVMM: Initialization failed, error=%d", errno); 1163 return -err; 1164 } 1165 1166 ret = nvmm_capability(&qemu_mach.cap); 1167 if (ret == -1) { 1168 err = errno; 1169 error_report("NVMM: Unable to fetch capability, error=%d", errno); 1170 return -err; 1171 } 1172 if (qemu_mach.cap.version < NVMM_KERN_VERSION) { 1173 error_report("NVMM: Unsupported version %u", qemu_mach.cap.version); 1174 return -EPROGMISMATCH; 1175 } 1176 if 
    if (qemu_mach.cap.state_size != sizeof(struct nvmm_x64_state)) {
        error_report("NVMM: Wrong state size %u", qemu_mach.cap.state_size);
        return -EPROGMISMATCH;
    }

    ret = nvmm_machine_create(&qemu_mach.mach);
    if (ret == -1) {
        err = errno;
        error_report("NVMM: Machine creation failed, error=%d", errno);
        return -err;
    }

    memory_listener_register(&nvmm_memory_listener, &address_space_memory);
    ram_block_notifier_add(&nvmm_ram_notifier);

    printf("NetBSD Virtual Machine Monitor accelerator is operational\n");
    return 0;
}

int
nvmm_enabled(void)
{
    return nvmm_allowed;
}

static void
nvmm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "NVMM";
    ac->init_machine = nvmm_accel_init;
    ac->allowed = &nvmm_allowed;
}

static const TypeInfo nvmm_accel_type = {
    .name = ACCEL_CLASS_NAME("nvmm"),
    .parent = TYPE_ACCEL,
    .class_init = nvmm_accel_class_init,
};

static void
nvmm_type_init(void)
{
    type_register_static(&nvmm_accel_type);
}

type_init(nvmm_type_init);