xref: /openbmc/qemu/target/i386/nvmm/nvmm-all.c (revision 6c10778826a873b9012d95e63298a6f879debcaa)
1 /*
2  * Copyright (c) 2018-2019 Maxime Villard, All rights reserved.
3  *
4  * NetBSD Virtual Machine Monitor (NVMM) accelerator for QEMU.
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  */
9 
10 #include "qemu/osdep.h"
11 #include "cpu.h"
12 #include "system/address-spaces.h"
13 #include "system/ioport.h"
14 #include "qemu/accel.h"
15 #include "system/nvmm.h"
16 #include "system/cpus.h"
17 #include "system/runstate.h"
18 #include "qemu/main-loop.h"
19 #include "qemu/error-report.h"
20 #include "qapi/error.h"
21 #include "qemu/queue.h"
22 #include "migration/blocker.h"
23 #include "strings.h"
24 
25 #include "nvmm-accel-ops.h"
26 
27 #include <nvmm.h>
28 
/*
 * Per-VCPU accelerator state. An instance is allocated in nvmm_init_vcpu()
 * and attached to the CPUState through cpu->accel.
 */
struct AccelCPUState {
    /* libnvmm handle of the virtual CPU (state, event and exit areas). */
    struct nvmm_vcpu vcpu;
    /* Last TPR value seen; kept in sync with the guest CR8 and the APIC. */
    uint8_t tpr;
    /* When set, the inner run loop bails out with EXCP_INTERRUPT. */
    bool stop;

    /* Window-exiting for INTs/NMIs. */
    bool int_window_exit;
    bool nmi_window_exit;

    /* The guest is in an interrupt shadow (POP SS, etc). */
    bool int_shadow;
};
41 
/*
 * NVMM machine handle bundled with the capability data that is consulted
 * when configuring VCPUs (see the TPR check in nvmm_init_vcpu()).
 */
struct qemu_machine {
    struct nvmm_capability cap;
    struct nvmm_machine mach;
};
46 
47 /* -------------------------------------------------------------------------- */
48 
/* NOTE(review): presumably set when the NVMM accelerator is selected. */
bool nvmm_allowed;
/* The machine instance shared by every function in this file. */
static struct qemu_machine qemu_mach;
51 
52 static struct nvmm_machine *
53 get_nvmm_mach(void)
54 {
55     return &qemu_mach.mach;
56 }
57 
58 /* -------------------------------------------------------------------------- */
59 
60 static void
61 nvmm_set_segment(struct nvmm_x64_state_seg *nseg, const SegmentCache *qseg)
62 {
63     uint32_t attrib = qseg->flags;
64 
65     nseg->selector = qseg->selector;
66     nseg->limit = qseg->limit;
67     nseg->base = qseg->base;
68     nseg->attrib.type = __SHIFTOUT(attrib, DESC_TYPE_MASK);
69     nseg->attrib.s = __SHIFTOUT(attrib, DESC_S_MASK);
70     nseg->attrib.dpl = __SHIFTOUT(attrib, DESC_DPL_MASK);
71     nseg->attrib.p = __SHIFTOUT(attrib, DESC_P_MASK);
72     nseg->attrib.avl = __SHIFTOUT(attrib, DESC_AVL_MASK);
73     nseg->attrib.l = __SHIFTOUT(attrib, DESC_L_MASK);
74     nseg->attrib.def = __SHIFTOUT(attrib, DESC_B_MASK);
75     nseg->attrib.g = __SHIFTOUT(attrib, DESC_G_MASK);
76 }
77 
78 static void
79 nvmm_set_registers(CPUState *cpu)
80 {
81     CPUX86State *env = cpu_env(cpu);
82     struct nvmm_machine *mach = get_nvmm_mach();
83     AccelCPUState *qcpu = cpu->accel;
84     struct nvmm_vcpu *vcpu = &qcpu->vcpu;
85     struct nvmm_x64_state *state = vcpu->state;
86     uint64_t bitmap;
87     size_t i;
88     int ret;
89 
90     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
91 
92     /* GPRs. */
93     state->gprs[NVMM_X64_GPR_RAX] = env->regs[R_EAX];
94     state->gprs[NVMM_X64_GPR_RCX] = env->regs[R_ECX];
95     state->gprs[NVMM_X64_GPR_RDX] = env->regs[R_EDX];
96     state->gprs[NVMM_X64_GPR_RBX] = env->regs[R_EBX];
97     state->gprs[NVMM_X64_GPR_RSP] = env->regs[R_ESP];
98     state->gprs[NVMM_X64_GPR_RBP] = env->regs[R_EBP];
99     state->gprs[NVMM_X64_GPR_RSI] = env->regs[R_ESI];
100     state->gprs[NVMM_X64_GPR_RDI] = env->regs[R_EDI];
101 #ifdef TARGET_X86_64
102     state->gprs[NVMM_X64_GPR_R8]  = env->regs[R_R8];
103     state->gprs[NVMM_X64_GPR_R9]  = env->regs[R_R9];
104     state->gprs[NVMM_X64_GPR_R10] = env->regs[R_R10];
105     state->gprs[NVMM_X64_GPR_R11] = env->regs[R_R11];
106     state->gprs[NVMM_X64_GPR_R12] = env->regs[R_R12];
107     state->gprs[NVMM_X64_GPR_R13] = env->regs[R_R13];
108     state->gprs[NVMM_X64_GPR_R14] = env->regs[R_R14];
109     state->gprs[NVMM_X64_GPR_R15] = env->regs[R_R15];
110 #endif
111 
112     /* RIP and RFLAGS. */
113     state->gprs[NVMM_X64_GPR_RIP] = env->eip;
114     state->gprs[NVMM_X64_GPR_RFLAGS] = env->eflags;
115 
116     /* Segments. */
117     nvmm_set_segment(&state->segs[NVMM_X64_SEG_CS], &env->segs[R_CS]);
118     nvmm_set_segment(&state->segs[NVMM_X64_SEG_DS], &env->segs[R_DS]);
119     nvmm_set_segment(&state->segs[NVMM_X64_SEG_ES], &env->segs[R_ES]);
120     nvmm_set_segment(&state->segs[NVMM_X64_SEG_FS], &env->segs[R_FS]);
121     nvmm_set_segment(&state->segs[NVMM_X64_SEG_GS], &env->segs[R_GS]);
122     nvmm_set_segment(&state->segs[NVMM_X64_SEG_SS], &env->segs[R_SS]);
123 
124     /* Special segments. */
125     nvmm_set_segment(&state->segs[NVMM_X64_SEG_GDT], &env->gdt);
126     nvmm_set_segment(&state->segs[NVMM_X64_SEG_LDT], &env->ldt);
127     nvmm_set_segment(&state->segs[NVMM_X64_SEG_TR], &env->tr);
128     nvmm_set_segment(&state->segs[NVMM_X64_SEG_IDT], &env->idt);
129 
130     /* Control registers. */
131     state->crs[NVMM_X64_CR_CR0] = env->cr[0];
132     state->crs[NVMM_X64_CR_CR2] = env->cr[2];
133     state->crs[NVMM_X64_CR_CR3] = env->cr[3];
134     state->crs[NVMM_X64_CR_CR4] = env->cr[4];
135     state->crs[NVMM_X64_CR_CR8] = qcpu->tpr;
136     state->crs[NVMM_X64_CR_XCR0] = env->xcr0;
137 
138     /* Debug registers. */
139     state->drs[NVMM_X64_DR_DR0] = env->dr[0];
140     state->drs[NVMM_X64_DR_DR1] = env->dr[1];
141     state->drs[NVMM_X64_DR_DR2] = env->dr[2];
142     state->drs[NVMM_X64_DR_DR3] = env->dr[3];
143     state->drs[NVMM_X64_DR_DR6] = env->dr[6];
144     state->drs[NVMM_X64_DR_DR7] = env->dr[7];
145 
146     /* FPU. */
147     state->fpu.fx_cw = env->fpuc;
148     state->fpu.fx_sw = (env->fpus & ~0x3800) | ((env->fpstt & 0x7) << 11);
149     state->fpu.fx_tw = 0;
150     for (i = 0; i < 8; i++) {
151         state->fpu.fx_tw |= (!env->fptags[i]) << i;
152     }
153     state->fpu.fx_opcode = env->fpop;
154     state->fpu.fx_ip.fa_64 = env->fpip;
155     state->fpu.fx_dp.fa_64 = env->fpdp;
156     state->fpu.fx_mxcsr = env->mxcsr;
157     state->fpu.fx_mxcsr_mask = 0x0000FFFF;
158     assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs));
159     memcpy(state->fpu.fx_87_ac, env->fpregs, sizeof(env->fpregs));
160     for (i = 0; i < CPU_NB_REGS; i++) {
161         memcpy(&state->fpu.fx_xmm[i].xmm_bytes[0],
162             &env->xmm_regs[i].ZMM_Q(0), 8);
163         memcpy(&state->fpu.fx_xmm[i].xmm_bytes[8],
164             &env->xmm_regs[i].ZMM_Q(1), 8);
165     }
166 
167     /* MSRs. */
168     state->msrs[NVMM_X64_MSR_EFER] = env->efer;
169     state->msrs[NVMM_X64_MSR_STAR] = env->star;
170 #ifdef TARGET_X86_64
171     state->msrs[NVMM_X64_MSR_LSTAR] = env->lstar;
172     state->msrs[NVMM_X64_MSR_CSTAR] = env->cstar;
173     state->msrs[NVMM_X64_MSR_SFMASK] = env->fmask;
174     state->msrs[NVMM_X64_MSR_KERNELGSBASE] = env->kernelgsbase;
175 #endif
176     state->msrs[NVMM_X64_MSR_SYSENTER_CS]  = env->sysenter_cs;
177     state->msrs[NVMM_X64_MSR_SYSENTER_ESP] = env->sysenter_esp;
178     state->msrs[NVMM_X64_MSR_SYSENTER_EIP] = env->sysenter_eip;
179     state->msrs[NVMM_X64_MSR_PAT] = env->pat;
180     state->msrs[NVMM_X64_MSR_TSC] = env->tsc;
181 
182     bitmap =
183         NVMM_X64_STATE_SEGS |
184         NVMM_X64_STATE_GPRS |
185         NVMM_X64_STATE_CRS  |
186         NVMM_X64_STATE_DRS  |
187         NVMM_X64_STATE_MSRS |
188         NVMM_X64_STATE_FPU;
189 
190     ret = nvmm_vcpu_setstate(mach, vcpu, bitmap);
191     if (ret == -1) {
192         error_report("NVMM: Failed to set virtual processor context,"
193             " error=%d", errno);
194     }
195 }
196 
197 static void
198 nvmm_get_segment(SegmentCache *qseg, const struct nvmm_x64_state_seg *nseg)
199 {
200     qseg->selector = nseg->selector;
201     qseg->limit = nseg->limit;
202     qseg->base = nseg->base;
203 
204     qseg->flags =
205         __SHIFTIN((uint32_t)nseg->attrib.type, DESC_TYPE_MASK) |
206         __SHIFTIN((uint32_t)nseg->attrib.s, DESC_S_MASK) |
207         __SHIFTIN((uint32_t)nseg->attrib.dpl, DESC_DPL_MASK) |
208         __SHIFTIN((uint32_t)nseg->attrib.p, DESC_P_MASK) |
209         __SHIFTIN((uint32_t)nseg->attrib.avl, DESC_AVL_MASK) |
210         __SHIFTIN((uint32_t)nseg->attrib.l, DESC_L_MASK) |
211         __SHIFTIN((uint32_t)nseg->attrib.def, DESC_B_MASK) |
212         __SHIFTIN((uint32_t)nseg->attrib.g, DESC_G_MASK);
213 }
214 
215 static void
216 nvmm_get_registers(CPUState *cpu)
217 {
218     CPUX86State *env = cpu_env(cpu);
219     struct nvmm_machine *mach = get_nvmm_mach();
220     AccelCPUState *qcpu = cpu->accel;
221     struct nvmm_vcpu *vcpu = &qcpu->vcpu;
222     X86CPU *x86_cpu = X86_CPU(cpu);
223     struct nvmm_x64_state *state = vcpu->state;
224     uint64_t bitmap, tpr;
225     size_t i;
226     int ret;
227 
228     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
229 
230     bitmap =
231         NVMM_X64_STATE_SEGS |
232         NVMM_X64_STATE_GPRS |
233         NVMM_X64_STATE_CRS  |
234         NVMM_X64_STATE_DRS  |
235         NVMM_X64_STATE_MSRS |
236         NVMM_X64_STATE_FPU;
237 
238     ret = nvmm_vcpu_getstate(mach, vcpu, bitmap);
239     if (ret == -1) {
240         error_report("NVMM: Failed to get virtual processor context,"
241             " error=%d", errno);
242     }
243 
244     /* GPRs. */
245     env->regs[R_EAX] = state->gprs[NVMM_X64_GPR_RAX];
246     env->regs[R_ECX] = state->gprs[NVMM_X64_GPR_RCX];
247     env->regs[R_EDX] = state->gprs[NVMM_X64_GPR_RDX];
248     env->regs[R_EBX] = state->gprs[NVMM_X64_GPR_RBX];
249     env->regs[R_ESP] = state->gprs[NVMM_X64_GPR_RSP];
250     env->regs[R_EBP] = state->gprs[NVMM_X64_GPR_RBP];
251     env->regs[R_ESI] = state->gprs[NVMM_X64_GPR_RSI];
252     env->regs[R_EDI] = state->gprs[NVMM_X64_GPR_RDI];
253 #ifdef TARGET_X86_64
254     env->regs[R_R8]  = state->gprs[NVMM_X64_GPR_R8];
255     env->regs[R_R9]  = state->gprs[NVMM_X64_GPR_R9];
256     env->regs[R_R10] = state->gprs[NVMM_X64_GPR_R10];
257     env->regs[R_R11] = state->gprs[NVMM_X64_GPR_R11];
258     env->regs[R_R12] = state->gprs[NVMM_X64_GPR_R12];
259     env->regs[R_R13] = state->gprs[NVMM_X64_GPR_R13];
260     env->regs[R_R14] = state->gprs[NVMM_X64_GPR_R14];
261     env->regs[R_R15] = state->gprs[NVMM_X64_GPR_R15];
262 #endif
263 
264     /* RIP and RFLAGS. */
265     env->eip = state->gprs[NVMM_X64_GPR_RIP];
266     env->eflags = state->gprs[NVMM_X64_GPR_RFLAGS];
267 
268     /* Segments. */
269     nvmm_get_segment(&env->segs[R_ES], &state->segs[NVMM_X64_SEG_ES]);
270     nvmm_get_segment(&env->segs[R_CS], &state->segs[NVMM_X64_SEG_CS]);
271     nvmm_get_segment(&env->segs[R_SS], &state->segs[NVMM_X64_SEG_SS]);
272     nvmm_get_segment(&env->segs[R_DS], &state->segs[NVMM_X64_SEG_DS]);
273     nvmm_get_segment(&env->segs[R_FS], &state->segs[NVMM_X64_SEG_FS]);
274     nvmm_get_segment(&env->segs[R_GS], &state->segs[NVMM_X64_SEG_GS]);
275 
276     /* Special segments. */
277     nvmm_get_segment(&env->gdt, &state->segs[NVMM_X64_SEG_GDT]);
278     nvmm_get_segment(&env->ldt, &state->segs[NVMM_X64_SEG_LDT]);
279     nvmm_get_segment(&env->tr, &state->segs[NVMM_X64_SEG_TR]);
280     nvmm_get_segment(&env->idt, &state->segs[NVMM_X64_SEG_IDT]);
281 
282     /* Control registers. */
283     env->cr[0] = state->crs[NVMM_X64_CR_CR0];
284     env->cr[2] = state->crs[NVMM_X64_CR_CR2];
285     env->cr[3] = state->crs[NVMM_X64_CR_CR3];
286     env->cr[4] = state->crs[NVMM_X64_CR_CR4];
287     tpr = state->crs[NVMM_X64_CR_CR8];
288     if (tpr != qcpu->tpr) {
289         qcpu->tpr = tpr;
290         cpu_set_apic_tpr(x86_cpu->apic_state, tpr);
291     }
292     env->xcr0 = state->crs[NVMM_X64_CR_XCR0];
293 
294     /* Debug registers. */
295     env->dr[0] = state->drs[NVMM_X64_DR_DR0];
296     env->dr[1] = state->drs[NVMM_X64_DR_DR1];
297     env->dr[2] = state->drs[NVMM_X64_DR_DR2];
298     env->dr[3] = state->drs[NVMM_X64_DR_DR3];
299     env->dr[6] = state->drs[NVMM_X64_DR_DR6];
300     env->dr[7] = state->drs[NVMM_X64_DR_DR7];
301 
302     /* FPU. */
303     env->fpuc = state->fpu.fx_cw;
304     env->fpstt = (state->fpu.fx_sw >> 11) & 0x7;
305     env->fpus = state->fpu.fx_sw & ~0x3800;
306     for (i = 0; i < 8; i++) {
307         env->fptags[i] = !((state->fpu.fx_tw >> i) & 1);
308     }
309     env->fpop = state->fpu.fx_opcode;
310     env->fpip = state->fpu.fx_ip.fa_64;
311     env->fpdp = state->fpu.fx_dp.fa_64;
312     env->mxcsr = state->fpu.fx_mxcsr;
313     assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs));
314     memcpy(env->fpregs, state->fpu.fx_87_ac, sizeof(env->fpregs));
315     for (i = 0; i < CPU_NB_REGS; i++) {
316         memcpy(&env->xmm_regs[i].ZMM_Q(0),
317             &state->fpu.fx_xmm[i].xmm_bytes[0], 8);
318         memcpy(&env->xmm_regs[i].ZMM_Q(1),
319             &state->fpu.fx_xmm[i].xmm_bytes[8], 8);
320     }
321 
322     /* MSRs. */
323     env->efer = state->msrs[NVMM_X64_MSR_EFER];
324     env->star = state->msrs[NVMM_X64_MSR_STAR];
325 #ifdef TARGET_X86_64
326     env->lstar = state->msrs[NVMM_X64_MSR_LSTAR];
327     env->cstar = state->msrs[NVMM_X64_MSR_CSTAR];
328     env->fmask = state->msrs[NVMM_X64_MSR_SFMASK];
329     env->kernelgsbase = state->msrs[NVMM_X64_MSR_KERNELGSBASE];
330 #endif
331     env->sysenter_cs  = state->msrs[NVMM_X64_MSR_SYSENTER_CS];
332     env->sysenter_esp = state->msrs[NVMM_X64_MSR_SYSENTER_ESP];
333     env->sysenter_eip = state->msrs[NVMM_X64_MSR_SYSENTER_EIP];
334     env->pat = state->msrs[NVMM_X64_MSR_PAT];
335     env->tsc = state->msrs[NVMM_X64_MSR_TSC];
336 
337     x86_update_hflags(env);
338 }
339 
340 static bool
341 nvmm_can_take_int(CPUState *cpu)
342 {
343     AccelCPUState *qcpu = cpu->accel;
344     struct nvmm_vcpu *vcpu = &qcpu->vcpu;
345     struct nvmm_machine *mach = get_nvmm_mach();
346 
347     if (qcpu->int_window_exit) {
348         return false;
349     }
350 
351     if (qcpu->int_shadow || !(cpu_env(cpu)->eflags & IF_MASK)) {
352         struct nvmm_x64_state *state = vcpu->state;
353 
354         /* Exit on interrupt window. */
355         nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_INTR);
356         state->intr.int_window_exiting = 1;
357         nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_INTR);
358 
359         return false;
360     }
361 
362     return true;
363 }
364 
365 static bool
366 nvmm_can_take_nmi(CPUState *cpu)
367 {
368     AccelCPUState *qcpu = cpu->accel;
369 
370     /*
371      * Contrary to INTs, NMIs always schedule an exit when they are
372      * completed. Therefore, if window-exiting is enabled, it means
373      * NMIs are blocked.
374      */
375     if (qcpu->nmi_window_exit) {
376         return false;
377     }
378 
379     return true;
380 }
381 
382 /*
383  * Called before the VCPU is run. We inject events generated by the I/O
384  * thread, and synchronize the guest TPR.
385  */
386 static void
387 nvmm_vcpu_pre_run(CPUState *cpu)
388 {
389     CPUX86State *env = cpu_env(cpu);
390     struct nvmm_machine *mach = get_nvmm_mach();
391     AccelCPUState *qcpu = cpu->accel;
392     struct nvmm_vcpu *vcpu = &qcpu->vcpu;
393     X86CPU *x86_cpu = X86_CPU(cpu);
394     struct nvmm_x64_state *state = vcpu->state;
395     struct nvmm_vcpu_event *event = vcpu->event;
396     bool has_event = false;
397     bool sync_tpr = false;
398     uint8_t tpr;
399     int ret;
400 
401     bql_lock();
402 
403     tpr = cpu_get_apic_tpr(x86_cpu->apic_state);
404     if (tpr != qcpu->tpr) {
405         qcpu->tpr = tpr;
406         sync_tpr = true;
407     }
408 
409     /*
410      * Force the VCPU out of its inner loop to process any INIT requests
411      * or commit pending TPR access.
412      */
413     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
414         cpu->exit_request = 1;
415     }
416 
417     if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
418         if (nvmm_can_take_nmi(cpu)) {
419             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
420             event->type = NVMM_VCPU_EVENT_INTR;
421             event->vector = 2;
422             has_event = true;
423         }
424     }
425 
426     if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
427         if (nvmm_can_take_int(cpu)) {
428             cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
429             event->type = NVMM_VCPU_EVENT_INTR;
430             event->vector = cpu_get_pic_interrupt(env);
431             has_event = true;
432         }
433     }
434 
435     /* Don't want SMIs. */
436     if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
437         cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
438     }
439 
440     if (sync_tpr) {
441         ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_CRS);
442         if (ret == -1) {
443             error_report("NVMM: Failed to get CPU state,"
444                 " error=%d", errno);
445         }
446 
447         state->crs[NVMM_X64_CR_CR8] = qcpu->tpr;
448 
449         ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_CRS);
450         if (ret == -1) {
451             error_report("NVMM: Failed to set CPU state,"
452                 " error=%d", errno);
453         }
454     }
455 
456     if (has_event) {
457         ret = nvmm_vcpu_inject(mach, vcpu);
458         if (ret == -1) {
459             error_report("NVMM: Failed to inject event,"
460                 " error=%d", errno);
461         }
462     }
463 
464     bql_unlock();
465 }
466 
467 /*
468  * Called after the VCPU ran. We synchronize the host view of the TPR and
469  * RFLAGS.
470  */
471 static void
472 nvmm_vcpu_post_run(CPUState *cpu, struct nvmm_vcpu_exit *exit)
473 {
474     AccelCPUState *qcpu = cpu->accel;
475     X86CPU *x86_cpu = X86_CPU(cpu);
476     CPUX86State *env = &x86_cpu->env;
477     uint64_t tpr;
478 
479     env->eflags = exit->exitstate.rflags;
480     qcpu->int_shadow = exit->exitstate.int_shadow;
481     qcpu->int_window_exit = exit->exitstate.int_window_exiting;
482     qcpu->nmi_window_exit = exit->exitstate.nmi_window_exiting;
483 
484     tpr = exit->exitstate.cr8;
485     if (qcpu->tpr != tpr) {
486         qcpu->tpr = tpr;
487         bql_lock();
488         cpu_set_apic_tpr(x86_cpu->apic_state, qcpu->tpr);
489         bql_unlock();
490     }
491 }
492 
493 /* -------------------------------------------------------------------------- */
494 
495 static void
496 nvmm_io_callback(struct nvmm_io *io)
497 {
498     MemTxAttrs attrs = { 0 };
499     int ret;
500 
501     ret = address_space_rw(&address_space_io, io->port, attrs, io->data,
502         io->size, !io->in);
503     if (ret != MEMTX_OK) {
504         error_report("NVMM: I/O Transaction Failed "
505             "[%s, port=%u, size=%zu]", (io->in ? "in" : "out"),
506             io->port, io->size);
507     }
508 
509     /* Needed, otherwise infinite loop. */
510     current_cpu->vcpu_dirty = false;
511 }
512 
513 static void
514 nvmm_mem_callback(struct nvmm_mem *mem)
515 {
516     cpu_physical_memory_rw(mem->gpa, mem->data, mem->size, mem->write);
517 
518     /* Needed, otherwise infinite loop. */
519     current_cpu->vcpu_dirty = false;
520 }
521 
/*
 * Assist callbacks handed to each VCPU through NVMM_VCPU_CONF_CALLBACKS in
 * nvmm_init_vcpu().
 */
static struct nvmm_assist_callbacks nvmm_callbacks = {
    .io = nvmm_io_callback,
    .mem = nvmm_mem_callback
};
526 
527 /* -------------------------------------------------------------------------- */
528 
529 static int
530 nvmm_handle_mem(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
531 {
532     int ret;
533 
534     ret = nvmm_assist_mem(mach, vcpu);
535     if (ret == -1) {
536         error_report("NVMM: Mem Assist Failed [gpa=%p]",
537             (void *)vcpu->exit->u.mem.gpa);
538     }
539 
540     return ret;
541 }
542 
543 static int
544 nvmm_handle_io(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
545 {
546     int ret;
547 
548     ret = nvmm_assist_io(mach, vcpu);
549     if (ret == -1) {
550         error_report("NVMM: I/O Assist Failed [port=%d]",
551             (int)vcpu->exit->u.io.port);
552     }
553 
554     return ret;
555 }
556 
557 static int
558 nvmm_handle_rdmsr(struct nvmm_machine *mach, CPUState *cpu,
559     struct nvmm_vcpu_exit *exit)
560 {
561     AccelCPUState *qcpu = cpu->accel;
562     struct nvmm_vcpu *vcpu = &qcpu->vcpu;
563     X86CPU *x86_cpu = X86_CPU(cpu);
564     struct nvmm_x64_state *state = vcpu->state;
565     uint64_t val;
566     int ret;
567 
568     switch (exit->u.rdmsr.msr) {
569     case MSR_IA32_APICBASE:
570         val = cpu_get_apic_base(x86_cpu->apic_state);
571         break;
572     case MSR_MTRRcap:
573     case MSR_MTRRdefType:
574     case MSR_MCG_CAP:
575     case MSR_MCG_STATUS:
576         val = 0;
577         break;
578     default: /* More MSRs to add? */
579         val = 0;
580         error_report("NVMM: Unexpected RDMSR 0x%x, ignored",
581             exit->u.rdmsr.msr);
582         break;
583     }
584 
585     ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS);
586     if (ret == -1) {
587         return -1;
588     }
589 
590     state->gprs[NVMM_X64_GPR_RAX] = (val & 0xFFFFFFFF);
591     state->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
592     state->gprs[NVMM_X64_GPR_RIP] = exit->u.rdmsr.npc;
593 
594     ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS);
595     if (ret == -1) {
596         return -1;
597     }
598 
599     return 0;
600 }
601 
602 static int
603 nvmm_handle_wrmsr(struct nvmm_machine *mach, CPUState *cpu,
604     struct nvmm_vcpu_exit *exit)
605 {
606     AccelCPUState *qcpu = cpu->accel;
607     struct nvmm_vcpu *vcpu = &qcpu->vcpu;
608     X86CPU *x86_cpu = X86_CPU(cpu);
609     struct nvmm_x64_state *state = vcpu->state;
610     uint64_t val;
611     int ret;
612 
613     val = exit->u.wrmsr.val;
614 
615     switch (exit->u.wrmsr.msr) {
616     case MSR_IA32_APICBASE:
617         cpu_set_apic_base(x86_cpu->apic_state, val);
618         break;
619     case MSR_MTRRdefType:
620     case MSR_MCG_STATUS:
621         break;
622     default: /* More MSRs to add? */
623         error_report("NVMM: Unexpected WRMSR 0x%x [val=0x%lx], ignored",
624             exit->u.wrmsr.msr, val);
625         break;
626     }
627 
628     ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS);
629     if (ret == -1) {
630         return -1;
631     }
632 
633     state->gprs[NVMM_X64_GPR_RIP] = exit->u.wrmsr.npc;
634 
635     ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS);
636     if (ret == -1) {
637         return -1;
638     }
639 
640     return 0;
641 }
642 
643 static int
644 nvmm_handle_halted(struct nvmm_machine *mach, CPUState *cpu,
645     struct nvmm_vcpu_exit *exit)
646 {
647     int ret = 0;
648 
649     bql_lock();
650 
651     if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
652           (cpu_env(cpu)->eflags & IF_MASK)) &&
653         !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
654         cpu->exception_index = EXCP_HLT;
655         cpu->halted = true;
656         ret = 1;
657     }
658 
659     bql_unlock();
660 
661     return ret;
662 }
663 
664 static int
665 nvmm_inject_ud(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
666 {
667     struct nvmm_vcpu_event *event = vcpu->event;
668 
669     event->type = NVMM_VCPU_EVENT_EXCP;
670     event->vector = 6;
671     event->u.excp.error = 0;
672 
673     return nvmm_vcpu_inject(mach, vcpu);
674 }
675 
/*
 * Run the VCPU until one of the exit handlers asks to go back to the
 * caller.
 *
 * Pending asynchronous requests (INIT, POLL, SIPI, TPR access) are handled
 * first. If the CPU is halted afterwards, EXCP_HLT is set and the function
 * returns 0 without entering the guest. Otherwise the inner loop runs the
 * guest and dispatches on the exit reason until a handler returns non-zero.
 *
 * The function unlocks the BQL before the inner loop and re-locks it
 * afterwards, so it expects to be entered with the BQL held.
 *
 * Returns non-zero if the loop ended with a negative result (fatal for the
 * caller), 0 otherwise.
 */
static int
nvmm_vcpu_loop(CPUState *cpu)
{
    struct nvmm_machine *mach = get_nvmm_mach();
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct nvmm_vcpu_exit *exit = vcpu->exit;
    int ret;

    /*
     * Some asynchronous events must be handled outside of the inner
     * VCPU loop. They are handled here.
     */
    if (cpu->interrupt_request & CPU_INTERRUPT_INIT) {
        nvmm_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        /* set int/nmi windows back to the reset state */
    }
    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }
    /* A deliverable interrupt or an NMI wakes up a halted CPU. */
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }
    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        nvmm_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }
    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        nvmm_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
            env->tpr_access_type);
    }

    /* Still halted: report EXCP_HLT without entering the guest. */
    if (cpu->halted) {
        cpu->exception_index = EXCP_HLT;
        qatomic_set(&cpu->exit_request, false);
        return 0;
    }

    bql_unlock();
    cpu_exec_start(cpu);

    /*
     * Inner VCPU loop.
     */
    do {
        /* Push QEMU-side register changes before entering the guest. */
        if (cpu->vcpu_dirty) {
            nvmm_set_registers(cpu);
            cpu->vcpu_dirty = false;
        }

        /* A stop was requested: leave the loop with EXCP_INTERRUPT. */
        if (qcpu->stop) {
            cpu->exception_index = EXCP_INTERRUPT;
            qcpu->stop = false;
            ret = 1;
            break;
        }

        nvmm_vcpu_pre_run(cpu);

        if (qatomic_read(&cpu->exit_request)) {
#if NVMM_USER_VERSION >= 2
            nvmm_vcpu_stop(vcpu);
#else
            qemu_cpu_kick_self();
#endif
        }

        /* Read exit_request before the kernel reads the immediate exit flag */
        smp_rmb();
        ret = nvmm_vcpu_run(mach, vcpu);
        if (ret == -1) {
            error_report("NVMM: Failed to exec a virtual processor,"
                " error=%d", errno);
            break;
        }

        nvmm_vcpu_post_run(cpu, exit);

        /* Dispatch on the exit reason; a non-zero ret ends the loop. */
        switch (exit->reason) {
        case NVMM_VCPU_EXIT_NONE:
            break;
#if NVMM_USER_VERSION >= 2
        case NVMM_VCPU_EXIT_STOPPED:
            /*
             * The kernel cleared the immediate exit flag; cpu->exit_request
             * must be cleared after
             */
            smp_wmb();
            qcpu->stop = true;
            break;
#endif
        case NVMM_VCPU_EXIT_MEMORY:
            ret = nvmm_handle_mem(mach, vcpu);
            break;
        case NVMM_VCPU_EXIT_IO:
            ret = nvmm_handle_io(mach, vcpu);
            break;
        case NVMM_VCPU_EXIT_INT_READY:
        case NVMM_VCPU_EXIT_NMI_READY:
        case NVMM_VCPU_EXIT_TPR_CHANGED:
            /* Nothing to do here; the next pre-run/post-run pass handles it. */
            break;
        case NVMM_VCPU_EXIT_HALTED:
            ret = nvmm_handle_halted(mach, cpu, exit);
            break;
        case NVMM_VCPU_EXIT_SHUTDOWN:
            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
            cpu->exception_index = EXCP_INTERRUPT;
            ret = 1;
            break;
        case NVMM_VCPU_EXIT_RDMSR:
            ret = nvmm_handle_rdmsr(mach, cpu, exit);
            break;
        case NVMM_VCPU_EXIT_WRMSR:
            ret = nvmm_handle_wrmsr(mach, cpu, exit);
            break;
        case NVMM_VCPU_EXIT_MONITOR:
        case NVMM_VCPU_EXIT_MWAIT:
            /* MONITOR/MWAIT are answered with an injected exception. */
            ret = nvmm_inject_ud(mach, vcpu);
            break;
        default:
            error_report("NVMM: Unexpected VM exit code 0x%lx [hw=0x%lx]",
                exit->reason, exit->u.inv.hwcode);
            nvmm_get_registers(cpu);
            bql_lock();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            bql_unlock();
            ret = -1;
            break;
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    bql_lock();

    qatomic_set(&cpu->exit_request, false);

    /* Only negative results are fatal; positive ones just end the loop. */
    return ret < 0;
}
822 
823 /* -------------------------------------------------------------------------- */
824 
/*
 * run_on_cpu() worker: pull the VCPU state into QEMU and mark it dirty so
 * that it is written back before the next guest entry.
 */
static void
do_nvmm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    nvmm_get_registers(cpu);
    cpu->vcpu_dirty = true;
}
831 
/*
 * run_on_cpu() worker: push the QEMU register state to the VCPU and clear
 * the dirty flag.
 */
static void
do_nvmm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
{
    nvmm_set_registers(cpu);
    cpu->vcpu_dirty = false;
}
838 
/*
 * run_on_cpu() worker: push the QEMU register state to the VCPU and clear
 * the dirty flag (same steps as the post-reset worker).
 */
static void
do_nvmm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
{
    nvmm_set_registers(cpu);
    cpu->vcpu_dirty = false;
}
845 
/*
 * run_on_cpu() worker: only mark the state dirty, so that the QEMU copy is
 * written to the VCPU before the next guest entry.
 */
static void
do_nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
851 
852 void nvmm_cpu_synchronize_state(CPUState *cpu)
853 {
854     if (!cpu->vcpu_dirty) {
855         run_on_cpu(cpu, do_nvmm_cpu_synchronize_state, RUN_ON_CPU_NULL);
856     }
857 }
858 
/* Run the post-reset worker on the given CPU via run_on_cpu(). */
void nvmm_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
863 
/* Run the post-init worker on the given CPU via run_on_cpu(). */
void nvmm_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
868 
/* Run the pre-loadvm worker on the given CPU via run_on_cpu(). */
void nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_nvmm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
873 
874 /* -------------------------------------------------------------------------- */
875 
/* Migration blocker; created and registered once, in nvmm_init_vcpu(). */
static Error *nvmm_migration_blocker;
877 
878 /*
879  * The nvmm_vcpu_stop() mechanism breaks races between entering the VMM
880  * and another thread signaling the vCPU thread to exit.
881  */
882 
883 static void
884 nvmm_ipi_signal(int sigcpu)
885 {
886     if (current_cpu) {
887         AccelCPUState *qcpu = current_cpu->accel;
888 #if NVMM_USER_VERSION >= 2
889         struct nvmm_vcpu *vcpu = &qcpu->vcpu;
890         nvmm_vcpu_stop(vcpu);
891 #else
892         qcpu->stop = true;
893 #endif
894     }
895 }
896 
897 static void
898 nvmm_init_cpu_signals(void)
899 {
900     struct sigaction sigact;
901     sigset_t set;
902 
903     /* Install the IPI handler. */
904     memset(&sigact, 0, sizeof(sigact));
905     sigact.sa_handler = nvmm_ipi_signal;
906     sigaction(SIG_IPI, &sigact, NULL);
907 
908     /* Allow IPIs on the current thread. */
909     sigprocmask(SIG_BLOCK, NULL, &set);
910     sigdelset(&set, SIG_IPI);
911     pthread_sigmask(SIG_SETMASK, &set, NULL);
912 }
913 
914 int
915 nvmm_init_vcpu(CPUState *cpu)
916 {
917     struct nvmm_machine *mach = get_nvmm_mach();
918     struct nvmm_vcpu_conf_cpuid cpuid;
919     struct nvmm_vcpu_conf_tpr tpr;
920     Error *local_error = NULL;
921     AccelCPUState *qcpu;
922     int ret, err;
923 
924     nvmm_init_cpu_signals();
925 
926     if (nvmm_migration_blocker == NULL) {
927         error_setg(&nvmm_migration_blocker,
928             "NVMM: Migration not supported");
929 
930         if (migrate_add_blocker(&nvmm_migration_blocker, &local_error) < 0) {
931             error_report_err(local_error);
932             return -EINVAL;
933         }
934     }
935 
936     qcpu = g_new0(AccelCPUState, 1);
937 
938     ret = nvmm_vcpu_create(mach, cpu->cpu_index, &qcpu->vcpu);
939     if (ret == -1) {
940         err = errno;
941         error_report("NVMM: Failed to create a virtual processor,"
942             " error=%d", err);
943         g_free(qcpu);
944         return -err;
945     }
946 
947     memset(&cpuid, 0, sizeof(cpuid));
948     cpuid.mask = 1;
949     cpuid.leaf = 0x00000001;
950     cpuid.u.mask.set.edx = CPUID_MCE | CPUID_MCA | CPUID_MTRR;
951     ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CPUID,
952         &cpuid);
953     if (ret == -1) {
954         err = errno;
955         error_report("NVMM: Failed to configure a virtual processor,"
956             " error=%d", err);
957         g_free(qcpu);
958         return -err;
959     }
960 
961     ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CALLBACKS,
962         &nvmm_callbacks);
963     if (ret == -1) {
964         err = errno;
965         error_report("NVMM: Failed to configure a virtual processor,"
966             " error=%d", err);
967         g_free(qcpu);
968         return -err;
969     }
970 
971     if (qemu_mach.cap.arch.vcpu_conf_support & NVMM_CAP_ARCH_VCPU_CONF_TPR) {
972         memset(&tpr, 0, sizeof(tpr));
973         tpr.exit_changed = 1;
974         ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_TPR, &tpr);
975         if (ret == -1) {
976             err = errno;
977             error_report("NVMM: Failed to configure a virtual processor,"
978                 " error=%d", err);
979             g_free(qcpu);
980             return -err;
981         }
982     }
983 
984     qcpu->vcpu_dirty = true;
985     cpu->accel = qcpu;
986 
987     return 0;
988 }
989 
990 int
991 nvmm_vcpu_exec(CPUState *cpu)
992 {
993     int ret, fatal;
994 
995     while (1) {
996         if (cpu->exception_index >= EXCP_INTERRUPT) {
997             ret = cpu->exception_index;
998             cpu->exception_index = -1;
999             break;
1000         }
1001 
1002         fatal = nvmm_vcpu_loop(cpu);
1003 
1004         if (fatal) {
1005             error_report("NVMM: Failed to execute a VCPU.");
1006             abort();
1007         }
1008     }
1009 
1010     return ret;
1011 }
1012 
1013 void
1014 nvmm_destroy_vcpu(CPUState *cpu)
1015 {
1016     struct nvmm_machine *mach = get_nvmm_mach();
1017     AccelCPUState *qcpu = cpu->accel;
1018 
1019     nvmm_vcpu_destroy(mach, &qcpu->vcpu);
1020     g_free(cpu->accel);
1021 }
1022 
1023 /* -------------------------------------------------------------------------- */
1024 
1025 static void
1026 nvmm_update_mapping(hwaddr start_pa, ram_addr_t size, uintptr_t hva,
1027     bool add, bool rom, const char *name)
1028 {
1029     struct nvmm_machine *mach = get_nvmm_mach();
1030     int ret, prot;
1031 
1032     if (add) {
1033         prot = PROT_READ | PROT_EXEC;
1034         if (!rom) {
1035             prot |= PROT_WRITE;
1036         }
1037         ret = nvmm_gpa_map(mach, hva, start_pa, size, prot);
1038     } else {
1039         ret = nvmm_gpa_unmap(mach, hva, start_pa, size);
1040     }
1041 
1042     if (ret == -1) {
1043         error_report("NVMM: Failed to %s GPA range '%s' PA:%p, "
1044             "Size:%p bytes, HostVA:%p, error=%d",
1045             (add ? "map" : "unmap"), name, (void *)(uintptr_t)start_pa,
1046             (void *)size, (void *)hva, errno);
1047     }
1048 }
1049 
1050 static void
1051 nvmm_process_section(MemoryRegionSection *section, int add)
1052 {
1053     MemoryRegion *mr = section->mr;
1054     hwaddr start_pa = section->offset_within_address_space;
1055     ram_addr_t size = int128_get64(section->size);
1056     unsigned int delta;
1057     uintptr_t hva;
1058 
1059     if (!memory_region_is_ram(mr)) {
1060         return;
1061     }
1062 
1063     /* Adjust start_pa and size so that they are page-aligned. */
1064     delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
1065     delta &= ~qemu_real_host_page_mask();
1066     if (delta > size) {
1067         return;
1068     }
1069     start_pa += delta;
1070     size -= delta;
1071     size &= qemu_real_host_page_mask();
1072     if (!size || (start_pa & ~qemu_real_host_page_mask())) {
1073         return;
1074     }
1075 
1076     hva = (uintptr_t)memory_region_get_ram_ptr(mr) +
1077         section->offset_within_region + delta;
1078 
1079     nvmm_update_mapping(start_pa, size, hva, add,
1080         memory_region_is_rom(mr), mr->name);
1081 }
1082 
1083 static void
1084 nvmm_region_add(MemoryListener *listener, MemoryRegionSection *section)
1085 {
1086     memory_region_ref(section->mr);
1087     nvmm_process_section(section, 1);
1088 }
1089 
1090 static void
1091 nvmm_region_del(MemoryListener *listener, MemoryRegionSection *section)
1092 {
1093     nvmm_process_section(section, 0);
1094     memory_region_unref(section->mr);
1095 }
1096 
1097 static void
1098 nvmm_transaction_begin(MemoryListener *listener)
1099 {
1100     /* nothing */
1101 }
1102 
1103 static void
1104 nvmm_transaction_commit(MemoryListener *listener)
1105 {
1106     /* nothing */
1107 }
1108 
1109 static void
1110 nvmm_log_sync(MemoryListener *listener, MemoryRegionSection *section)
1111 {
1112     MemoryRegion *mr = section->mr;
1113 
1114     if (!memory_region_is_ram(mr)) {
1115         return;
1116     }
1117 
1118     memory_region_set_dirty(mr, 0, int128_get64(section->size));
1119 }
1120 
1121 static MemoryListener nvmm_memory_listener = {
1122     .name = "nvmm",
1123     .begin = nvmm_transaction_begin,
1124     .commit = nvmm_transaction_commit,
1125     .region_add = nvmm_region_add,
1126     .region_del = nvmm_region_del,
1127     .log_sync = nvmm_log_sync,
1128     .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
1129 };
1130 
1131 static void
1132 nvmm_ram_block_added(RAMBlockNotifier *n, void *host, size_t size,
1133                      size_t max_size)
1134 {
1135     struct nvmm_machine *mach = get_nvmm_mach();
1136     uintptr_t hva = (uintptr_t)host;
1137     int ret;
1138 
1139     ret = nvmm_hva_map(mach, hva, max_size);
1140 
1141     if (ret == -1) {
1142         error_report("NVMM: Failed to map HVA, HostVA:%p "
1143             "Size:%p bytes, error=%d",
1144             (void *)hva, (void *)size, errno);
1145     }
1146 }
1147 
1148 static struct RAMBlockNotifier nvmm_ram_notifier = {
1149     .ram_block_added = nvmm_ram_block_added
1150 };
1151 
1152 /* -------------------------------------------------------------------------- */
1153 
1154 static int
1155 nvmm_accel_init(AccelState *as, MachineState *ms)
1156 {
1157     int ret, err;
1158 
1159     ret = nvmm_init();
1160     if (ret == -1) {
1161         err = errno;
1162         error_report("NVMM: Initialization failed, error=%d", errno);
1163         return -err;
1164     }
1165 
1166     ret = nvmm_capability(&qemu_mach.cap);
1167     if (ret == -1) {
1168         err = errno;
1169         error_report("NVMM: Unable to fetch capability, error=%d", errno);
1170         return -err;
1171     }
1172     if (qemu_mach.cap.version < NVMM_KERN_VERSION) {
1173         error_report("NVMM: Unsupported version %u", qemu_mach.cap.version);
1174         return -EPROGMISMATCH;
1175     }
1176     if (qemu_mach.cap.state_size != sizeof(struct nvmm_x64_state)) {
1177         error_report("NVMM: Wrong state size %u", qemu_mach.cap.state_size);
1178         return -EPROGMISMATCH;
1179     }
1180 
1181     ret = nvmm_machine_create(&qemu_mach.mach);
1182     if (ret == -1) {
1183         err = errno;
1184         error_report("NVMM: Machine creation failed, error=%d", errno);
1185         return -err;
1186     }
1187 
1188     memory_listener_register(&nvmm_memory_listener, &address_space_memory);
1189     ram_block_notifier_add(&nvmm_ram_notifier);
1190 
1191     printf("NetBSD Virtual Machine Monitor accelerator is operational\n");
1192     return 0;
1193 }
1194 
1195 static void
1196 nvmm_accel_class_init(ObjectClass *oc, const void *data)
1197 {
1198     AccelClass *ac = ACCEL_CLASS(oc);
1199     ac->name = "NVMM";
1200     ac->init_machine = nvmm_accel_init;
1201     ac->allowed = &nvmm_allowed;
1202 }
1203 
1204 static const TypeInfo nvmm_accel_type = {
1205     .name = ACCEL_CLASS_NAME("nvmm"),
1206     .parent = TYPE_ACCEL,
1207     .class_init = nvmm_accel_class_init,
1208 };
1209 
1210 static void
1211 nvmm_type_init(void)
1212 {
1213     type_register_static(&nvmm_accel_type);
1214 }
1215 
1216 type_init(nvmm_type_init);
1217