xref: /openbmc/qemu/target/arm/kvm64.c (revision 7bdd67a5)
1 /*
2  * ARM implementation of KVM hooks, 64 bit specific code
3  *
4  * Copyright Mian-M. Hamayun 2013, Virtual Open Systems
5  * Copyright Alex Bennée 2014, Linaro
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include <sys/ioctl.h>
14 #include <sys/ptrace.h>
15 
16 #include <linux/elf.h>
17 #include <linux/kvm.h>
18 
19 #include "qapi/error.h"
20 #include "cpu.h"
21 #include "qemu/timer.h"
22 #include "qemu/error-report.h"
23 #include "qemu/host-utils.h"
24 #include "qemu/main-loop.h"
25 #include "exec/gdbstub.h"
26 #include "sysemu/runstate.h"
27 #include "sysemu/kvm.h"
28 #include "sysemu/kvm_int.h"
29 #include "kvm_arm.h"
30 #include "internals.h"
31 #include "hw/acpi/acpi.h"
32 #include "hw/acpi/ghes.h"
33 #include "hw/arm/virt.h"
34 
35 static bool have_guest_debug;
36 
37 /*
38  * Although the ARM implementation of hardware-assisted debugging
39  * allows for different breakpoints per core, the current GDB
40  * interface treats them as a global pool of registers (which seems
41  * to be the case for x86, ppc and s390). As a result we store one
42  * copy of the registers which is used for all active cores.
43  *
44  * Write access is serialised by virtue of the GDB protocol, which
45  * drives the updates. Read access (i.e. when the values are copied
46  * to the vCPU) is also gated by GDB's run control.
47  *
48  * This is not unreasonable: when debugging a kernel you rarely know
49  * in advance which core will eventually execute the code of interest.
50  */
51 
52 typedef struct {
53     uint64_t bcr;
54     uint64_t bvr;
55 } HWBreakpoint;
56 
57 /* The watchpoint registers can cover more area than the requested
58  * watchpoint so we need to store the additional information
59  * somewhere. We also need to supply a CPUWatchpoint to the GDB stub
60  * when the watchpoint is hit.
61  */
62 typedef struct {
63     uint64_t wcr;
64     uint64_t wvr;
65     CPUWatchpoint details;
66 } HWWatchpoint;
67 
68 /* Maximum and current break/watch point counts */
69 int max_hw_bps, max_hw_wps;
70 GArray *hw_breakpoints, *hw_watchpoints;
71 
72 #define cur_hw_wps      (hw_watchpoints->len)
73 #define cur_hw_bps      (hw_breakpoints->len)
74 #define get_hw_bp(i)    (&g_array_index(hw_breakpoints, HWBreakpoint, i))
75 #define get_hw_wp(i)    (&g_array_index(hw_watchpoints, HWWatchpoint, i))
76 
77 void kvm_arm_init_debug(KVMState *s)
78 {
79     have_guest_debug = kvm_check_extension(s,
80                                            KVM_CAP_SET_GUEST_DEBUG);
81 
82     max_hw_wps = kvm_check_extension(s, KVM_CAP_GUEST_DEBUG_HW_WPS);
83     hw_watchpoints = g_array_sized_new(true, true,
84                                        sizeof(HWWatchpoint), max_hw_wps);
85 
86     max_hw_bps = kvm_check_extension(s, KVM_CAP_GUEST_DEBUG_HW_BPS);
87     hw_breakpoints = g_array_sized_new(true, true,
88                                        sizeof(HWBreakpoint), max_hw_bps);
90 }
91 
92 /**
93  * insert_hw_breakpoint()
94  * @addr: address of breakpoint
95  *
96  * See ARM ARM D2.9.1 for details but here we are only going to create
97  * simple unlinked breakpoints (i.e. we don't chain breakpoints
98  * together to match address and context or VMID). The hardware is
99  * capable of fancier matching but that would require exposing that
100  * fanciness to GDB's interface.
101  *
102  * DBGBCR<n>_EL1, Debug Breakpoint Control Registers
103  *
104  *  31  24 23  20 19   16 15 14  13  12   9 8   5 4    3 2   1  0
105  * +------+------+-------+-----+----+------+-----+------+-----+---+
106  * | RES0 |  BT  |  LBN  | SSC | HMC| RES0 | BAS | RES0 | PMC | E |
107  * +------+------+-------+-----+----+------+-----+------+-----+---+
108  *
109  * BT: Breakpoint type (0 = unlinked address match)
110  * LBN: Linked BP number (0 = unused)
111  * SSC/HMC/PMC: Security, Higher and Priv access control (Table D-12)
112  * BAS: Byte Address Select (RES1 for AArch64)
113  * E: Enable bit
114  *
115  * DBGBVR<n>_EL1, Debug Breakpoint Value Registers
116  *
117  *  63  53 52       49 48       2  1 0
118  * +------+-----------+----------+-----+
119  * | RESS | VA[52:49] | VA[48:2] | 0 0 |
120  * +------+-----------+----------+-----+
121  *
122  * Depending on the addressing mode bits the top bits of the register
123  * are a sign extension of the highest applicable VA bit. Some
124  * versions of GDB don't set this up correctly, so we fix the value up
125  * here to ensure future PC comparisons will work properly.
126  */
127 
128 static int insert_hw_breakpoint(target_ulong addr)
129 {
130     HWBreakpoint brk = {
131         .bcr = 0x1,                             /* BCR E=1, enable */
132         .bvr = sextract64(addr, 0, 53)
133     };
134 
135     if (cur_hw_bps >= max_hw_bps) {
136         return -ENOBUFS;
137     }
138 
139     brk.bcr = deposit32(brk.bcr, 1, 2, 0x3);   /* PMC = 11 */
140     brk.bcr = deposit32(brk.bcr, 5, 4, 0xf);   /* BAS = RES1 */
141 
142     g_array_append_val(hw_breakpoints, brk);
143 
144     return 0;
145 }
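
/*
 * Illustrative sketch (not called anywhere, names are examples only): how
 * the DBGBCR/DBGBVR encoding described above comes together for a concrete
 * address, using the same deposit32()/sextract64() helpers that
 * insert_hw_breakpoint() uses.
 */
static inline void example_encode_bp(uint64_t addr, uint64_t *bcr, uint64_t *bvr)
{
    uint32_t cr = 0x1;                  /* E = 1: enabled */

    cr = deposit32(cr, 1, 2, 0x3);      /* PMC = 0b11: match at EL1&0 */
    cr = deposit32(cr, 5, 4, 0xf);      /* BAS = 0b1111 (RES1 for AArch64) */
    *bcr = cr;
    /* sign-extend from VA bit 52 so later PC comparisons behave */
    *bvr = sextract64(addr, 0, 53);
    /* e.g. addr 0xffffffc010080000 -> bcr 0x1e7, bvr 0xffffffc010080000 */
}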
146 
147 /**
148  * delete_hw_breakpoint()
149  * @pc: address of breakpoint
150  *
151  * Delete a breakpoint and shuffle any above down
152  */
153 
154 static int delete_hw_breakpoint(target_ulong pc)
155 {
156     int i;
157     for (i = 0; i < hw_breakpoints->len; i++) {
158         HWBreakpoint *brk = get_hw_bp(i);
159         if (brk->bvr == pc) {
160             g_array_remove_index(hw_breakpoints, i);
161             return 0;
162         }
163     }
164     return -ENOENT;
165 }
166 
167 /**
168  * insert_hw_watchpoint()
169  * @addr: address of watch point
170  * @len: size of area
171  * @type: type of watch point
172  *
173  * See ARM ARM D2.10. As with the breakpoints we could do some advanced
174  * matching if we wanted to. The watchpoints can be linked with the
175  * breakpoints above to make them context aware. However, for simplicity
176  * we currently only deal with simple read/write watchpoints.
177  *
178  * D7.3.11 DBGWCR<n>_EL1, Debug Watchpoint Control Registers
179  *
180  *  31  29 28   24 23  21  20  19 16 15 14  13   12  5 4   3 2   1  0
181  * +------+-------+------+----+-----+-----+-----+-----+-----+-----+---+
182  * | RES0 |  MASK | RES0 | WT | LBN | SSC | HMC | BAS | LSC | PAC | E |
183  * +------+-------+------+----+-----+-----+-----+-----+-----+-----+---+
184  *
185  * MASK: num low addr bits masked (0 = none, 1/2 = reserved, 3 = 3 bits i.e. 8 bytes, ...)
186  * WT: 0 - unlinked, 1 - linked (not currently used)
187  * LBN: Linked BP number (not currently used)
188  * SSC/HMC/PAC: Security, Higher and Priv access control (Table D2-11)
189  * BAS: Byte Address Select
190  * LSC: Load/Store control (01: load, 10: store, 11: both)
191  * E: Enable
192  *
193  * The bottom 2 bits of the value register are masked. Therefore to
194  * break on anything smaller than an unaligned word you need to set
195  * MASK=0 and one BAS bit per byte of interest. For larger, power-of-2
196  * sized regions you need to mask the address as required and set BAS=0xff.
197  */
198 
199 static int insert_hw_watchpoint(target_ulong addr,
200                                 target_ulong len, int type)
201 {
202     HWWatchpoint wp = {
203         .wcr = R_DBGWCR_E_MASK, /* E=1, enable */
204         .wvr = addr & (~0x7ULL),
205         .details = { .vaddr = addr, .len = len }
206     };
207 
208     if (cur_hw_wps >= max_hw_wps) {
209         return -ENOBUFS;
210     }
211 
212     /*
213      * HMC=0 SSC=0 PAC=3 will hit EL0 or EL1, any security state,
214      * valid whether EL3 is implemented or not
215      */
216     wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, PAC, 3);
217 
218     switch (type) {
219     case GDB_WATCHPOINT_READ:
220         wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, LSC, 1);
221         wp.details.flags = BP_MEM_READ;
222         break;
223     case GDB_WATCHPOINT_WRITE:
224         wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, LSC, 2);
225         wp.details.flags = BP_MEM_WRITE;
226         break;
227     case GDB_WATCHPOINT_ACCESS:
228         wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, LSC, 3);
229         wp.details.flags = BP_MEM_ACCESS;
230         break;
231     default:
232         g_assert_not_reached();
233         break;
234     }
235     if (len <= 8) {
236         /* we align the address and set the bits in BAS */
237         int off = addr & 0x7;
238         int bas = (1 << len) - 1;
239 
240         wp.wcr = deposit32(wp.wcr, 5 + off, 8 - off, bas);
241     } else {
242         /* For ranges above 8 bytes the length must be a power of 2 */
243         if (is_power_of_2(len)) {
244             int bits = ctz64(len);
245 
246             wp.wvr &= ~((1 << bits) - 1);
247             wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, MASK, bits);
248             wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, BAS, 0xff);
249         } else {
250             return -ENOBUFS;
251         }
252     }
253 
254     g_array_append_val(hw_watchpoints, wp);
255     return 0;
256 }
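
/*
 * Illustrative sketch (not called anywhere, names are examples only): the
 * two DBGWCR encodings produced by insert_hw_watchpoint() above, a small
 * region expressed purely via BAS bits and a larger power-of-2 region
 * expressed via MASK, using the same FIELD_DP64()/deposit32() helpers.
 */
static inline void example_encode_wp(void)
{
    uint64_t wcr_small = R_DBGWCR_E_MASK;
    uint64_t wcr_big = R_DBGWCR_E_MASK;

    /* 2 byte write watchpoint at byte offset 2 of a doubleword */
    wcr_small = FIELD_DP64(wcr_small, DBGWCR, PAC, 3);            /* EL1&0 */
    wcr_small = FIELD_DP64(wcr_small, DBGWCR, LSC, 2);            /* store only */
    wcr_small = deposit32(wcr_small, 5 + 2, 8 - 2, (1 << 2) - 1); /* BAS = 0b00001100 */

    /* 64 byte load/store watchpoint: mask the low 6 address bits */
    wcr_big = FIELD_DP64(wcr_big, DBGWCR, PAC, 3);
    wcr_big = FIELD_DP64(wcr_big, DBGWCR, LSC, 3);                /* load and store */
    wcr_big = FIELD_DP64(wcr_big, DBGWCR, MASK, ctz64(64));       /* MASK = 6 */
    wcr_big = FIELD_DP64(wcr_big, DBGWCR, BAS, 0xff);

    (void)wcr_small;
    (void)wcr_big;
}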
257 
258 
259 static bool check_watchpoint_in_range(int i, target_ulong addr)
260 {
261     HWWatchpoint *wp = get_hw_wp(i);
262     uint64_t addr_top, addr_bottom = wp->wvr;
263     int bas = extract32(wp->wcr, 5, 8);
264     int mask = extract32(wp->wcr, 24, 4);
265 
266     if (mask) {
267         addr_top = addr_bottom + (1 << mask);
268     } else {
269         /* BAS must be contiguous but can be offset against the base
270          * address in DBGWVR */
271         addr_bottom = addr_bottom + ctz32(bas);
272         addr_top = addr_bottom + clo32(bas);
273     }
274 
275     if (addr >= addr_bottom && addr <= addr_top) {
276         return true;
277     }
278 
279     return false;
280 }
281 
282 /**
283  * delete_hw_watchpoint()
284  * @addr: address of watch point
285  *
286  * Delete a watchpoint and shuffle any above down
287  */
288 
289 static int delete_hw_watchpoint(target_ulong addr,
290                                 target_ulong len, int type)
291 {
292     int i;
293     for (i = 0; i < cur_hw_wps; i++) {
294         if (check_watchpoint_in_range(i, addr)) {
295             g_array_remove_index(hw_watchpoints, i);
296             return 0;
297         }
298     }
299     return -ENOENT;
300 }
301 
302 
303 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
304                                   target_ulong len, int type)
305 {
306     switch (type) {
307     case GDB_BREAKPOINT_HW:
308         return insert_hw_breakpoint(addr);
310     case GDB_WATCHPOINT_READ:
311     case GDB_WATCHPOINT_WRITE:
312     case GDB_WATCHPOINT_ACCESS:
313         return insert_hw_watchpoint(addr, len, type);
314     default:
315         return -ENOSYS;
316     }
317 }
318 
319 int kvm_arch_remove_hw_breakpoint(target_ulong addr,
320                                   target_ulong len, int type)
321 {
322     switch (type) {
323     case GDB_BREAKPOINT_HW:
324         return delete_hw_breakpoint(addr);
325     case GDB_WATCHPOINT_READ:
326     case GDB_WATCHPOINT_WRITE:
327     case GDB_WATCHPOINT_ACCESS:
328         return delete_hw_watchpoint(addr, len, type);
329     default:
330         return -ENOSYS;
331     }
332 }
333 
334 
335 void kvm_arch_remove_all_hw_breakpoints(void)
336 {
337     if (cur_hw_wps > 0) {
338         g_array_remove_range(hw_watchpoints, 0, cur_hw_wps);
339     }
340     if (cur_hw_bps > 0) {
341         g_array_remove_range(hw_breakpoints, 0, cur_hw_bps);
342     }
343 }
344 
345 void kvm_arm_copy_hw_debug_data(struct kvm_guest_debug_arch *ptr)
346 {
347     int i;
348     memset(ptr, 0, sizeof(struct kvm_guest_debug_arch));
349 
350     for (i = 0; i < max_hw_wps; i++) {
351         HWWatchpoint *wp = get_hw_wp(i);
352         ptr->dbg_wcr[i] = wp->wcr;
353         ptr->dbg_wvr[i] = wp->wvr;
354     }
355     for (i = 0; i < max_hw_bps; i++) {
356         HWBreakpoint *bp = get_hw_bp(i);
357         ptr->dbg_bcr[i] = bp->bcr;
358         ptr->dbg_bvr[i] = bp->bvr;
359     }
360 }
361 
362 bool kvm_arm_hw_debug_active(CPUState *cs)
363 {
364     return ((cur_hw_wps > 0) || (cur_hw_bps > 0));
365 }
366 
367 static bool find_hw_breakpoint(CPUState *cpu, target_ulong pc)
368 {
369     int i;
370 
371     for (i = 0; i < cur_hw_bps; i++) {
372         HWBreakpoint *bp = get_hw_bp(i);
373         if (bp->bvr == pc) {
374             return true;
375         }
376     }
377     return false;
378 }
379 
380 static CPUWatchpoint *find_hw_watchpoint(CPUState *cpu, target_ulong addr)
381 {
382     int i;
383 
384     for (i = 0; i < cur_hw_wps; i++) {
385         if (check_watchpoint_in_range(i, addr)) {
386             return &get_hw_wp(i)->details;
387         }
388     }
389     return NULL;
390 }
391 
392 static bool kvm_arm_set_device_attr(CPUState *cs, struct kvm_device_attr *attr,
393                                     const char *name)
394 {
395     int err;
396 
397     err = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, attr);
398     if (err != 0) {
399         error_report("%s: KVM_HAS_DEVICE_ATTR: %s", name, strerror(-err));
400         return false;
401     }
402 
403     err = kvm_vcpu_ioctl(cs, KVM_SET_DEVICE_ATTR, attr);
404     if (err != 0) {
405         error_report("%s: KVM_SET_DEVICE_ATTR: %s", name, strerror(-err));
406         return false;
407     }
408 
409     return true;
410 }
411 
412 void kvm_arm_pmu_init(CPUState *cs)
413 {
414     struct kvm_device_attr attr = {
415         .group = KVM_ARM_VCPU_PMU_V3_CTRL,
416         .attr = KVM_ARM_VCPU_PMU_V3_INIT,
417     };
418 
419     if (!ARM_CPU(cs)->has_pmu) {
420         return;
421     }
422     if (!kvm_arm_set_device_attr(cs, &attr, "PMU")) {
423         error_report("failed to init PMU");
424         abort();
425     }
426 }
427 
428 void kvm_arm_pmu_set_irq(CPUState *cs, int irq)
429 {
430     struct kvm_device_attr attr = {
431         .group = KVM_ARM_VCPU_PMU_V3_CTRL,
432         .addr = (intptr_t)&irq,
433         .attr = KVM_ARM_VCPU_PMU_V3_IRQ,
434     };
435 
436     if (!ARM_CPU(cs)->has_pmu) {
437         return;
438     }
439     if (!kvm_arm_set_device_attr(cs, &attr, "PMU")) {
440         error_report("failed to set irq for PMU");
441         abort();
442     }
443 }
444 
445 void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa)
446 {
447     struct kvm_device_attr attr = {
448         .group = KVM_ARM_VCPU_PVTIME_CTRL,
449         .attr = KVM_ARM_VCPU_PVTIME_IPA,
450         .addr = (uint64_t)&ipa,
451     };
452 
453     if (ARM_CPU(cs)->kvm_steal_time == ON_OFF_AUTO_OFF) {
454         return;
455     }
456     if (!kvm_arm_set_device_attr(cs, &attr, "PVTIME IPA")) {
457         error_report("failed to init PVTIME IPA");
458         abort();
459     }
460 }
461 
462 static int read_sys_reg32(int fd, uint32_t *pret, uint64_t id)
463 {
464     uint64_t ret;
465     struct kvm_one_reg idreg = { .id = id, .addr = (uintptr_t)&ret };
466     int err;
467 
468     assert((id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64);
469     err = ioctl(fd, KVM_GET_ONE_REG, &idreg);
470     if (err < 0) {
471         return -1;
472     }
473     *pret = ret;
474     return 0;
475 }
476 
477 static int read_sys_reg64(int fd, uint64_t *pret, uint64_t id)
478 {
479     struct kvm_one_reg idreg = { .id = id, .addr = (uintptr_t)pret };
480 
481     assert((id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64);
482     return ioctl(fd, KVM_GET_ONE_REG, &idreg);
483 }
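
/*
 * Illustrative sketch (not called anywhere, name is an example only):
 * ARM64_SYS_REG() ids are just (op0, op1, crn, crm, op2) encodings of the
 * system register space. For example MIDR_EL1 is S3_0_C0_C0_0, so it could
 * be read from a scratch vCPU fd with the helper above like this.
 */
static inline int example_read_midr(int fd, uint64_t *midr)
{
    return read_sys_reg64(fd, midr, ARM64_SYS_REG(3, 0, 0, 0, 0));
}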
484 
485 static bool kvm_arm_pauth_supported(void)
486 {
487     return (kvm_check_extension(kvm_state, KVM_CAP_ARM_PTRAUTH_ADDRESS) &&
488             kvm_check_extension(kvm_state, KVM_CAP_ARM_PTRAUTH_GENERIC));
489 }
490 
491 bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
492 {
493     /* Identify the feature bits corresponding to the host CPU, and
494      * fill out the ARMHostCPUFeatures fields accordingly. To do this
495      * we have to create a scratch VM, create a single CPU inside it,
496      * and then query that CPU for the relevant ID registers.
497      */
498     int fdarray[3];
499     bool sve_supported;
500     bool pmu_supported = false;
501     uint64_t features = 0;
502     int err;
503 
504     /* Old kernels may not know about the PREFERRED_TARGET ioctl: however
505      * we know these will only support creating one kind of guest CPU,
506      * which is their preferred CPU type. Fortunately these old kernels
507      * support only a very limited number of CPUs.
508      */
509     static const uint32_t cpus_to_try[] = {
510         KVM_ARM_TARGET_AEM_V8,
511         KVM_ARM_TARGET_FOUNDATION_V8,
512         KVM_ARM_TARGET_CORTEX_A57,
513         QEMU_KVM_ARM_TARGET_NONE
514     };
515     /*
516      * target = -1 informs kvm_arm_create_scratch_host_vcpu()
517      * to use the preferred target
518      */
519     struct kvm_vcpu_init init = { .target = -1, };
520 
521     /*
522      * Ask for SVE if supported, so that we can query ID_AA64ZFR0,
523      * which is otherwise RAZ.
524      */
525     sve_supported = kvm_arm_sve_supported();
526     if (sve_supported) {
527         init.features[0] |= 1 << KVM_ARM_VCPU_SVE;
528     }
529 
530     /*
531      * Ask for Pointer Authentication if supported, so that we get
532      * the unsanitized field values for AA64ISAR1_EL1.
533      */
534     if (kvm_arm_pauth_supported()) {
535         init.features[0] |= (1 << KVM_ARM_VCPU_PTRAUTH_ADDRESS |
536                              1 << KVM_ARM_VCPU_PTRAUTH_GENERIC);
537     }
538 
539     if (kvm_arm_pmu_supported()) {
540         init.features[0] |= 1 << KVM_ARM_VCPU_PMU_V3;
541         pmu_supported = true;
542     }
543 
544     if (!kvm_arm_create_scratch_host_vcpu(cpus_to_try, fdarray, &init)) {
545         return false;
546     }
547 
548     ahcf->target = init.target;
549     ahcf->dtb_compatible = "arm,arm-v8";
550 
551     err = read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64pfr0,
552                          ARM64_SYS_REG(3, 0, 0, 4, 0));
553     if (unlikely(err < 0)) {
554         /*
555          * Before v4.15, the kernel only exposed a limited number of system
556          * registers, not including any of the interesting AArch64 ID regs.
557          * For the most part we could leave these fields as zero with minimal
558          * effect, since this does not affect the values seen by the guest.
559          *
560          * However, it could cause problems down the line for QEMU,
561          * so provide a minimal v8.0 default.
562          *
563          * ??? Could read MIDR and use knowledge from cpu64.c.
564          * ??? Could map a page of memory into our temp guest and
565          *     run the tiniest of hand-crafted kernels to extract
566          *     the values seen by the guest.
567          * ??? Either of these sounds like too much effort just
568          *     to avoid requiring a modern host kernel.
569          */
570         ahcf->isar.id_aa64pfr0 = 0x00000011; /* EL1&0, AArch64 only */
571         err = 0;
572     } else {
573         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64pfr1,
574                               ARM64_SYS_REG(3, 0, 0, 4, 1));
575         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64smfr0,
576                               ARM64_SYS_REG(3, 0, 0, 4, 5));
577         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64dfr0,
578                               ARM64_SYS_REG(3, 0, 0, 5, 0));
579         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64dfr1,
580                               ARM64_SYS_REG(3, 0, 0, 5, 1));
581         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64isar0,
582                               ARM64_SYS_REG(3, 0, 0, 6, 0));
583         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64isar1,
584                               ARM64_SYS_REG(3, 0, 0, 6, 1));
585         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64mmfr0,
586                               ARM64_SYS_REG(3, 0, 0, 7, 0));
587         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64mmfr1,
588                               ARM64_SYS_REG(3, 0, 0, 7, 1));
589         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64mmfr2,
590                               ARM64_SYS_REG(3, 0, 0, 7, 2));
591 
592         /*
593          * Note that if AArch32 support is not present in the host,
594          * the AArch32 sysregs are present to be read, but will
595          * return UNKNOWN values.  This is neither better nor worse
596          * than skipping the reads and leaving 0, as we must avoid
597          * considering the values in every case.
598          */
599         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_pfr0,
600                               ARM64_SYS_REG(3, 0, 0, 1, 0));
601         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_pfr1,
602                               ARM64_SYS_REG(3, 0, 0, 1, 1));
603         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_dfr0,
604                               ARM64_SYS_REG(3, 0, 0, 1, 2));
605         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr0,
606                               ARM64_SYS_REG(3, 0, 0, 1, 4));
607         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr1,
608                               ARM64_SYS_REG(3, 0, 0, 1, 5));
609         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr2,
610                               ARM64_SYS_REG(3, 0, 0, 1, 6));
611         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr3,
612                               ARM64_SYS_REG(3, 0, 0, 1, 7));
613         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar0,
614                               ARM64_SYS_REG(3, 0, 0, 2, 0));
615         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar1,
616                               ARM64_SYS_REG(3, 0, 0, 2, 1));
617         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar2,
618                               ARM64_SYS_REG(3, 0, 0, 2, 2));
619         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar3,
620                               ARM64_SYS_REG(3, 0, 0, 2, 3));
621         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar4,
622                               ARM64_SYS_REG(3, 0, 0, 2, 4));
623         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar5,
624                               ARM64_SYS_REG(3, 0, 0, 2, 5));
625         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr4,
626                               ARM64_SYS_REG(3, 0, 0, 2, 6));
627         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar6,
628                               ARM64_SYS_REG(3, 0, 0, 2, 7));
629 
630         err |= read_sys_reg32(fdarray[2], &ahcf->isar.mvfr0,
631                               ARM64_SYS_REG(3, 0, 0, 3, 0));
632         err |= read_sys_reg32(fdarray[2], &ahcf->isar.mvfr1,
633                               ARM64_SYS_REG(3, 0, 0, 3, 1));
634         err |= read_sys_reg32(fdarray[2], &ahcf->isar.mvfr2,
635                               ARM64_SYS_REG(3, 0, 0, 3, 2));
636         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_pfr2,
637                               ARM64_SYS_REG(3, 0, 0, 3, 4));
638         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_dfr1,
639                               ARM64_SYS_REG(3, 0, 0, 3, 5));
640         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr5,
641                               ARM64_SYS_REG(3, 0, 0, 3, 6));
642 
643         /*
644          * DBGDIDR is a bit complicated because the kernel doesn't
645          * provide an accessor for it in 64-bit mode, which is what this
646          * scratch VM is in, and there's no architected "64-bit sysreg
647          * which reads the same as the 32-bit register" the way there is
648          * for other ID registers. Instead we synthesize a value from the
649          * AArch64 ID_AA64DFR0, the same way the kernel code in
650          * arch/arm64/kvm/sys_regs.c:trap_dbgidr() does.
651          * We only do this if the CPU supports AArch32 at EL1.
652          */
653         if (FIELD_EX32(ahcf->isar.id_aa64pfr0, ID_AA64PFR0, EL1) >= 2) {
654             int wrps = FIELD_EX64(ahcf->isar.id_aa64dfr0, ID_AA64DFR0, WRPS);
655             int brps = FIELD_EX64(ahcf->isar.id_aa64dfr0, ID_AA64DFR0, BRPS);
656             int ctx_cmps =
657                 FIELD_EX64(ahcf->isar.id_aa64dfr0, ID_AA64DFR0, CTX_CMPS);
658             int version = 6; /* ARMv8 debug architecture */
659             bool has_el3 =
660                 !!FIELD_EX32(ahcf->isar.id_aa64pfr0, ID_AA64PFR0, EL3);
661             uint32_t dbgdidr = 0;
662 
663             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, WRPS, wrps);
664             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, BRPS, brps);
665             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, CTX_CMPS, ctx_cmps);
666             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, VERSION, version);
667             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, NSUHD_IMP, has_el3);
668             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, SE_IMP, has_el3);
669             dbgdidr |= (1 << 15); /* RES1 bit */
670             ahcf->isar.dbgdidr = dbgdidr;
671         }
672 
673         if (pmu_supported) {
674             /* PMCR_EL0 is only accessible if the vCPU has feature PMU_V3 */
675             err |= read_sys_reg64(fdarray[2], &ahcf->isar.reset_pmcr_el0,
676                                   ARM64_SYS_REG(3, 3, 9, 12, 0));
677         }
678 
679         if (sve_supported) {
680             /*
681              * There is a range of kernels between kernel commit 73433762fcae
682              * and f81cb2c3ad41 which have a bug where the kernel doesn't
683              * expose SYS_ID_AA64ZFR0_EL1 via the ONE_REG API unless the VM has
684              * enabled SVE support, which resulted in an error rather than RAZ.
685              * So only read the register if we set KVM_ARM_VCPU_SVE above.
686              */
687             err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64zfr0,
688                                   ARM64_SYS_REG(3, 0, 0, 4, 4));
689         }
690     }
691 
692     kvm_arm_destroy_scratch_host_vcpu(fdarray);
693 
694     if (err < 0) {
695         return false;
696     }
697 
698     /*
699      * We can assume any KVM supporting CPU is at least a v8
700      * with VFPv4+Neon; this in turn implies most of the other
701      * feature bits.
702      */
703     features |= 1ULL << ARM_FEATURE_V8;
704     features |= 1ULL << ARM_FEATURE_NEON;
705     features |= 1ULL << ARM_FEATURE_AARCH64;
706     features |= 1ULL << ARM_FEATURE_PMU;
707     features |= 1ULL << ARM_FEATURE_GENERIC_TIMER;
708 
709     ahcf->features = features;
710 
711     return true;
712 }
713 
714 void kvm_arm_steal_time_finalize(ARMCPU *cpu, Error **errp)
715 {
716     bool has_steal_time = kvm_arm_steal_time_supported();
717 
718     if (cpu->kvm_steal_time == ON_OFF_AUTO_AUTO) {
719         if (!has_steal_time || !arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
720             cpu->kvm_steal_time = ON_OFF_AUTO_OFF;
721         } else {
722             cpu->kvm_steal_time = ON_OFF_AUTO_ON;
723         }
724     } else if (cpu->kvm_steal_time == ON_OFF_AUTO_ON) {
725         if (!has_steal_time) {
726             error_setg(errp, "'kvm-steal-time' cannot be enabled "
727                              "on this host");
728             return;
729         } else if (!arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
730             /*
731              * DEN0057A chapter 2 says "This specification only covers
732              * systems in which the Execution state of the hypervisor
733              * as well as EL1 of virtual machines is AArch64.". And,
734              * to ensure that, the smc/hvc calls are only specified as
735              * smc64/hvc64.
736              */
737             error_setg(errp, "'kvm-steal-time' cannot be enabled "
738                              "for AArch32 guests");
739             return;
740         }
741     }
742 }
743 
744 bool kvm_arm_aarch32_supported(void)
745 {
746     return kvm_check_extension(kvm_state, KVM_CAP_ARM_EL1_32BIT);
747 }
748 
749 bool kvm_arm_sve_supported(void)
750 {
751     return kvm_check_extension(kvm_state, KVM_CAP_ARM_SVE);
752 }
753 
754 bool kvm_arm_steal_time_supported(void)
755 {
756     return kvm_check_extension(kvm_state, KVM_CAP_STEAL_TIME);
757 }
758 
759 QEMU_BUILD_BUG_ON(KVM_ARM64_SVE_VQ_MIN != 1);
760 
761 uint32_t kvm_arm_sve_get_vls(CPUState *cs)
762 {
763     /* Only call this function if kvm_arm_sve_supported() returns true. */
764     static uint64_t vls[KVM_ARM64_SVE_VLS_WORDS];
765     static bool probed;
766     uint32_t vq = 0;
767     int i;
768 
769     /*
770      * KVM ensures all host CPUs support the same set of vector lengths.
771      * So we only need to create the scratch VCPUs once and then cache
772      * the results.
773      */
774     if (!probed) {
775         struct kvm_vcpu_init init = {
776             .target = -1,
777             .features[0] = (1 << KVM_ARM_VCPU_SVE),
778         };
779         struct kvm_one_reg reg = {
780             .id = KVM_REG_ARM64_SVE_VLS,
781             .addr = (uint64_t)&vls[0],
782         };
783         int fdarray[3], ret;
784 
785         probed = true;
786 
787         if (!kvm_arm_create_scratch_host_vcpu(NULL, fdarray, &init)) {
788             error_report("failed to create scratch VCPU with SVE enabled");
789             abort();
790         }
791         ret = ioctl(fdarray[2], KVM_GET_ONE_REG, &reg);
792         kvm_arm_destroy_scratch_host_vcpu(fdarray);
793         if (ret) {
794             error_report("failed to get KVM_REG_ARM64_SVE_VLS: %s",
795                          strerror(errno));
796             abort();
797         }
798 
799         for (i = KVM_ARM64_SVE_VLS_WORDS - 1; i >= 0; --i) {
800             if (vls[i]) {
801                 vq = 64 - clz64(vls[i]) + i * 64;
802                 break;
803             }
804         }
805         if (vq > ARM_MAX_VQ) {
806             warn_report("KVM supports vector lengths larger than "
807                         "QEMU can enable");
808             vls[0] &= MAKE_64BIT_MASK(0, ARM_MAX_VQ);
809         }
810     }
811 
812     return vls[0];
813 }
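
/*
 * Illustrative sketch (not called anywhere, name is an example only): the
 * value returned by kvm_arm_sve_get_vls() is a bitmap in which bit (vq - 1)
 * being set means the host supports an SVE vector length of vq quadwords,
 * i.e. vq * 16 bytes. error_printf() is used purely as an example sink.
 */
static inline void example_dump_sve_vls(uint32_t vls_map)
{
    uint32_t vq;

    for (vq = 1; vq <= 32; vq++) {
        if (vls_map & (1u << (vq - 1))) {
            error_printf("SVE vector length supported: %u bytes\n", vq * 16);
        }
    }
}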
814 
815 static int kvm_arm_sve_set_vls(CPUState *cs)
816 {
817     ARMCPU *cpu = ARM_CPU(cs);
818     uint64_t vls[KVM_ARM64_SVE_VLS_WORDS] = { cpu->sve_vq.map };
819     struct kvm_one_reg reg = {
820         .id = KVM_REG_ARM64_SVE_VLS,
821         .addr = (uint64_t)&vls[0],
822     };
823 
824     assert(cpu->sve_max_vq <= KVM_ARM64_SVE_VQ_MAX);
825 
826     return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
827 }
828 
829 #define ARM_CPU_ID_MPIDR       3, 0, 0, 0, 5
830 
831 int kvm_arch_init_vcpu(CPUState *cs)
832 {
833     int ret;
834     uint64_t mpidr;
835     ARMCPU *cpu = ARM_CPU(cs);
836     CPUARMState *env = &cpu->env;
837     uint64_t psciver;
838 
839     if (cpu->kvm_target == QEMU_KVM_ARM_TARGET_NONE ||
840         !object_dynamic_cast(OBJECT(cpu), TYPE_AARCH64_CPU)) {
841         error_report("KVM is not supported for this guest CPU type");
842         return -EINVAL;
843     }
844 
845     qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs);
846 
847     /* Determine init features for this CPU */
848     memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features));
849     if (cs->start_powered_off) {
850         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_POWER_OFF;
851     }
852     if (kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PSCI_0_2)) {
853         cpu->psci_version = QEMU_PSCI_VERSION_0_2;
854         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_PSCI_0_2;
855     }
856     if (!arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
857         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_EL1_32BIT;
858     }
859     if (!kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PMU_V3)) {
860         cpu->has_pmu = false;
861     }
862     if (cpu->has_pmu) {
863         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_PMU_V3;
864     } else {
865         env->features &= ~(1ULL << ARM_FEATURE_PMU);
866     }
867     if (cpu_isar_feature(aa64_sve, cpu)) {
868         assert(kvm_arm_sve_supported());
869         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_SVE;
870     }
871     if (cpu_isar_feature(aa64_pauth, cpu)) {
872         cpu->kvm_init_features[0] |= (1 << KVM_ARM_VCPU_PTRAUTH_ADDRESS |
873                                       1 << KVM_ARM_VCPU_PTRAUTH_GENERIC);
874     }
875 
876     /* Do KVM_ARM_VCPU_INIT ioctl */
877     ret = kvm_arm_vcpu_init(cs);
878     if (ret) {
879         return ret;
880     }
881 
882     if (cpu_isar_feature(aa64_sve, cpu)) {
883         ret = kvm_arm_sve_set_vls(cs);
884         if (ret) {
885             return ret;
886         }
887         ret = kvm_arm_vcpu_finalize(cs, KVM_ARM_VCPU_SVE);
888         if (ret) {
889             return ret;
890         }
891     }
892 
893     /*
894      * KVM reports the exact PSCI version it is implementing via a
895      * special sysreg. If it is present, use its contents to determine
896      * what to report to the guest in the dtb (it is the PSCI version,
897      * in the same 15-bits major 16-bits minor format that PSCI_VERSION
898      * returns).
899      */
900     if (!kvm_get_one_reg(cs, KVM_REG_ARM_PSCI_VERSION, &psciver)) {
901         cpu->psci_version = psciver;
902     }
903 
904     /*
905      * When KVM is in use, PSCI is emulated in-kernel and not by qemu.
906      * Currently KVM has its own idea about MPIDR assignment, so we
907      * override our defaults with what we get from KVM.
908      */
909     ret = kvm_get_one_reg(cs, ARM64_SYS_REG(ARM_CPU_ID_MPIDR), &mpidr);
910     if (ret) {
911         return ret;
912     }
913     cpu->mp_affinity = mpidr & ARM64_AFFINITY_MASK;
914 
915     /* Check whether user space can specify guest syndrome value */
916     kvm_arm_init_serror_injection(cs);
917 
918     return kvm_arm_init_cpreg_list(cpu);
919 }
920 
921 int kvm_arch_destroy_vcpu(CPUState *cs)
922 {
923     return 0;
924 }
925 
926 bool kvm_arm_reg_syncs_via_cpreg_list(uint64_t regidx)
927 {
928     /* Return true if the regidx is a register we should synchronize
929      * via the cpreg_tuples array (i.e. is not a core or SVE reg that
930      * we sync by hand in kvm_arch_get/put_registers())
931      */
932     switch (regidx & KVM_REG_ARM_COPROC_MASK) {
933     case KVM_REG_ARM_CORE:
934     case KVM_REG_ARM64_SVE:
935         return false;
936     default:
937         return true;
938     }
939 }
940 
941 typedef struct CPRegStateLevel {
942     uint64_t regidx;
943     int level;
944 } CPRegStateLevel;
945 
946 /* All system registers not listed in the following table are assumed to be
947  * of the level KVM_PUT_RUNTIME_STATE. If a register should be written less
948  * often, you must add it to this table with a state of either
949  * KVM_PUT_RESET_STATE or KVM_PUT_FULL_STATE.
950  */
951 static const CPRegStateLevel non_runtime_cpregs[] = {
952     { KVM_REG_ARM_TIMER_CNT, KVM_PUT_FULL_STATE },
953 };
954 
955 int kvm_arm_cpreg_level(uint64_t regidx)
956 {
957     int i;
958 
959     for (i = 0; i < ARRAY_SIZE(non_runtime_cpregs); i++) {
960         const CPRegStateLevel *l = &non_runtime_cpregs[i];
961         if (l->regidx == regidx) {
962             return l->level;
963         }
964     }
965 
966     return KVM_PUT_RUNTIME_STATE;
967 }
968 
969 /* Callers must hold the iothread mutex lock */
970 static void kvm_inject_arm_sea(CPUState *c)
971 {
972     ARMCPU *cpu = ARM_CPU(c);
973     CPUARMState *env = &cpu->env;
974     uint32_t esr;
975     bool same_el;
976 
977     c->exception_index = EXCP_DATA_ABORT;
978     env->exception.target_el = 1;
979 
980     /*
981      * Set the DFSC to synchronous external abort and set FnV to not valid;
982      * this will tell the guest that FAR_ELx is UNKNOWN for this abort.
983      */
984     same_el = arm_current_el(env) == env->exception.target_el;
985     esr = syn_data_abort_no_iss(same_el, 1, 0, 0, 0, 0, 0x10);
986 
987     env->exception.syndrome = esr;
988 
989     arm_cpu_do_interrupt(c);
990 }
991 
992 #define AARCH64_CORE_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
993                  KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
994 
995 #define AARCH64_SIMD_CORE_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U128 | \
996                  KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
997 
998 #define AARCH64_SIMD_CTRL_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U32 | \
999                  KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
1000 
1001 static int kvm_arch_put_fpsimd(CPUState *cs)
1002 {
1003     CPUARMState *env = &ARM_CPU(cs)->env;
1004     struct kvm_one_reg reg;
1005     int i, ret;
1006 
1007     for (i = 0; i < 32; i++) {
1008         uint64_t *q = aa64_vfp_qreg(env, i);
1009 #if HOST_BIG_ENDIAN
1010         uint64_t fp_val[2] = { q[1], q[0] };
1011         reg.addr = (uintptr_t)fp_val;
1012 #else
1013         reg.addr = (uintptr_t)q;
1014 #endif
1015         reg.id = AARCH64_SIMD_CORE_REG(fp_regs.vregs[i]);
1016         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1017         if (ret) {
1018             return ret;
1019         }
1020     }
1021 
1022     return 0;
1023 }
1024 
1025 /*
1026  * KVM SVE registers come in slices where ZREGs have a slice size of 2048 bits
1027  * and PREGS and the FFR have a slice size of 256 bits. However we simply hard
1028  * code the slice index to zero for now as it's unlikely we'll need more than
1029  * one slice for quite some time.
1030  */
1031 static int kvm_arch_put_sve(CPUState *cs)
1032 {
1033     ARMCPU *cpu = ARM_CPU(cs);
1034     CPUARMState *env = &cpu->env;
1035     uint64_t tmp[ARM_MAX_VQ * 2];
1036     uint64_t *r;
1037     struct kvm_one_reg reg;
1038     int n, ret;
1039 
1040     for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; ++n) {
1041         r = sve_bswap64(tmp, &env->vfp.zregs[n].d[0], cpu->sve_max_vq * 2);
1042         reg.addr = (uintptr_t)r;
1043         reg.id = KVM_REG_ARM64_SVE_ZREG(n, 0);
1044         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1045         if (ret) {
1046             return ret;
1047         }
1048     }
1049 
1050     for (n = 0; n < KVM_ARM64_SVE_NUM_PREGS; ++n) {
1051         r = sve_bswap64(tmp, &env->vfp.pregs[n].p[0],
1052                         DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
1053         reg.addr = (uintptr_t)r;
1054         reg.id = KVM_REG_ARM64_SVE_PREG(n, 0);
1055         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1056         if (ret) {
1057             return ret;
1058         }
1059     }
1060 
1061     r = sve_bswap64(tmp, &env->vfp.pregs[FFR_PRED_NUM].p[0],
1062                     DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
1063     reg.addr = (uintptr_t)r;
1064     reg.id = KVM_REG_ARM64_SVE_FFR(0);
1065     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1066     if (ret) {
1067         return ret;
1068     }
1069 
1070     return 0;
1071 }
1072 
1073 int kvm_arch_put_registers(CPUState *cs, int level)
1074 {
1075     struct kvm_one_reg reg;
1076     uint64_t val;
1077     uint32_t fpr;
1078     int i, ret;
1079     unsigned int el;
1080 
1081     ARMCPU *cpu = ARM_CPU(cs);
1082     CPUARMState *env = &cpu->env;
1083 
1084     /* If we are in AArch32 mode then we need to copy the AArch32 regs to the
1085      * AArch64 registers before pushing them out to 64-bit KVM.
1086      */
1087     if (!is_a64(env)) {
1088         aarch64_sync_32_to_64(env);
1089     }
1090 
1091     for (i = 0; i < 31; i++) {
1092         reg.id = AARCH64_CORE_REG(regs.regs[i]);
1093         reg.addr = (uintptr_t) &env->xregs[i];
1094         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1095         if (ret) {
1096             return ret;
1097         }
1098     }
1099 
1100     /* KVM puts SP_EL0 in regs.sp and SP_EL1 in regs.sp_el1. On the
1101      * QEMU side we keep the current SP in xregs[31] as well.
1102      */
1103     aarch64_save_sp(env, 1);
1104 
1105     reg.id = AARCH64_CORE_REG(regs.sp);
1106     reg.addr = (uintptr_t) &env->sp_el[0];
1107     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1108     if (ret) {
1109         return ret;
1110     }
1111 
1112     reg.id = AARCH64_CORE_REG(sp_el1);
1113     reg.addr = (uintptr_t) &env->sp_el[1];
1114     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1115     if (ret) {
1116         return ret;
1117     }
1118 
1119     /* Note that KVM thinks pstate is 64 bit but we use a uint32_t */
1120     if (is_a64(env)) {
1121         val = pstate_read(env);
1122     } else {
1123         val = cpsr_read(env);
1124     }
1125     reg.id = AARCH64_CORE_REG(regs.pstate);
1126     reg.addr = (uintptr_t) &val;
1127     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1128     if (ret) {
1129         return ret;
1130     }
1131 
1132     reg.id = AARCH64_CORE_REG(regs.pc);
1133     reg.addr = (uintptr_t) &env->pc;
1134     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1135     if (ret) {
1136         return ret;
1137     }
1138 
1139     reg.id = AARCH64_CORE_REG(elr_el1);
1140     reg.addr = (uintptr_t) &env->elr_el[1];
1141     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1142     if (ret) {
1143         return ret;
1144     }
1145 
1146     /* Saved Program State Registers
1147      *
1148      * Before we restore from the banked_spsr[] array we need to
1149      * ensure that any modifications to env->spsr are correctly
1150      * reflected in the banks.
1151      */
1152     el = arm_current_el(env);
1153     if (el > 0 && !is_a64(env)) {
1154         i = bank_number(env->uncached_cpsr & CPSR_M);
1155         env->banked_spsr[i] = env->spsr;
1156     }
1157 
1158     /* KVM 0-4 map to QEMU banks 1-5 */
1159     for (i = 0; i < KVM_NR_SPSR; i++) {
1160         reg.id = AARCH64_CORE_REG(spsr[i]);
1161         reg.addr = (uintptr_t) &env->banked_spsr[i + 1];
1162         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1163         if (ret) {
1164             return ret;
1165         }
1166     }
1167 
1168     if (cpu_isar_feature(aa64_sve, cpu)) {
1169         ret = kvm_arch_put_sve(cs);
1170     } else {
1171         ret = kvm_arch_put_fpsimd(cs);
1172     }
1173     if (ret) {
1174         return ret;
1175     }
1176 
1177     reg.addr = (uintptr_t)(&fpr);
1178     fpr = vfp_get_fpsr(env);
1179     reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpsr);
1180     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1181     if (ret) {
1182         return ret;
1183     }
1184 
1185     reg.addr = (uintptr_t)(&fpr);
1186     fpr = vfp_get_fpcr(env);
1187     reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpcr);
1188     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1189     if (ret) {
1190         return ret;
1191     }
1192 
1193     write_cpustate_to_list(cpu, true);
1194 
1195     if (!write_list_to_kvmstate(cpu, level)) {
1196         return -EINVAL;
1197     }
1198 
1199    /*
1200     * Setting VCPU events should be triggered after syncing the registers
1201     * to avoid overwriting potential changes made by KVM upon calling
1202     * KVM_SET_VCPU_EVENTS ioctl
1203     */
1204     ret = kvm_put_vcpu_events(cpu);
1205     if (ret) {
1206         return ret;
1207     }
1208 
1209     kvm_arm_sync_mpstate_to_kvm(cpu);
1210 
1211     return ret;
1212 }
1213 
1214 static int kvm_arch_get_fpsimd(CPUState *cs)
1215 {
1216     CPUARMState *env = &ARM_CPU(cs)->env;
1217     struct kvm_one_reg reg;
1218     int i, ret;
1219 
1220     for (i = 0; i < 32; i++) {
1221         uint64_t *q = aa64_vfp_qreg(env, i);
1222         reg.id = AARCH64_SIMD_CORE_REG(fp_regs.vregs[i]);
1223         reg.addr = (uintptr_t)q;
1224         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1225         if (ret) {
1226             return ret;
1227         } else {
1228 #if HOST_BIG_ENDIAN
1229             uint64_t t;
1230             t = q[0], q[0] = q[1], q[1] = t;
1231 #endif
1232         }
1233     }
1234 
1235     return 0;
1236 }
1237 
1238 /*
1239  * KVM SVE registers come in slices where ZREGs have a slice size of 2048 bits
1240  * and PREGS and the FFR have a slice size of 256 bits. However we simply hard
1241  * code the slice index to zero for now as it's unlikely we'll need more than
1242  * one slice for quite some time.
1243  */
1244 static int kvm_arch_get_sve(CPUState *cs)
1245 {
1246     ARMCPU *cpu = ARM_CPU(cs);
1247     CPUARMState *env = &cpu->env;
1248     struct kvm_one_reg reg;
1249     uint64_t *r;
1250     int n, ret;
1251 
1252     for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; ++n) {
1253         r = &env->vfp.zregs[n].d[0];
1254         reg.addr = (uintptr_t)r;
1255         reg.id = KVM_REG_ARM64_SVE_ZREG(n, 0);
1256         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1257         if (ret) {
1258             return ret;
1259         }
1260         sve_bswap64(r, r, cpu->sve_max_vq * 2);
1261     }
1262 
1263     for (n = 0; n < KVM_ARM64_SVE_NUM_PREGS; ++n) {
1264         r = &env->vfp.pregs[n].p[0];
1265         reg.addr = (uintptr_t)r;
1266         reg.id = KVM_REG_ARM64_SVE_PREG(n, 0);
1267         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1268         if (ret) {
1269             return ret;
1270         }
1271         sve_bswap64(r, r, DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
1272     }
1273 
1274     r = &env->vfp.pregs[FFR_PRED_NUM].p[0];
1275     reg.addr = (uintptr_t)r;
1276     reg.id = KVM_REG_ARM64_SVE_FFR(0);
1277     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1278     if (ret) {
1279         return ret;
1280     }
1281     sve_bswap64(r, r, DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
1282 
1283     return 0;
1284 }
1285 
1286 int kvm_arch_get_registers(CPUState *cs)
1287 {
1288     struct kvm_one_reg reg;
1289     uint64_t val;
1290     unsigned int el;
1291     uint32_t fpr;
1292     int i, ret;
1293 
1294     ARMCPU *cpu = ARM_CPU(cs);
1295     CPUARMState *env = &cpu->env;
1296 
1297     for (i = 0; i < 31; i++) {
1298         reg.id = AARCH64_CORE_REG(regs.regs[i]);
1299         reg.addr = (uintptr_t) &env->xregs[i];
1300         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1301         if (ret) {
1302             return ret;
1303         }
1304     }
1305 
1306     reg.id = AARCH64_CORE_REG(regs.sp);
1307     reg.addr = (uintptr_t) &env->sp_el[0];
1308     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1309     if (ret) {
1310         return ret;
1311     }
1312 
1313     reg.id = AARCH64_CORE_REG(sp_el1);
1314     reg.addr = (uintptr_t) &env->sp_el[1];
1315     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1316     if (ret) {
1317         return ret;
1318     }
1319 
1320     reg.id = AARCH64_CORE_REG(regs.pstate);
1321     reg.addr = (uintptr_t) &val;
1322     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1323     if (ret) {
1324         return ret;
1325     }
1326 
1327     env->aarch64 = ((val & PSTATE_nRW) == 0);
1328     if (is_a64(env)) {
1329         pstate_write(env, val);
1330     } else {
1331         cpsr_write(env, val, 0xffffffff, CPSRWriteRaw);
1332     }
1333 
1334     /* KVM puts SP_EL0 in regs.sp and SP_EL1 in regs.sp_el1. On the
1335      * QEMU side we keep the current SP in xregs[31] as well.
1336      */
1337     aarch64_restore_sp(env, 1);
1338 
1339     reg.id = AARCH64_CORE_REG(regs.pc);
1340     reg.addr = (uintptr_t) &env->pc;
1341     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1342     if (ret) {
1343         return ret;
1344     }
1345 
1346     /* If we are in AArch32 mode then we need to sync the AArch32 regs with the
1347      * incoming AArch64 regs received from 64-bit KVM.
1348      * We must perform this after all of the registers have been acquired from
1349      * the kernel.
1350      */
1351     if (!is_a64(env)) {
1352         aarch64_sync_64_to_32(env);
1353     }
1354 
1355     reg.id = AARCH64_CORE_REG(elr_el1);
1356     reg.addr = (uintptr_t) &env->elr_el[1];
1357     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1358     if (ret) {
1359         return ret;
1360     }
1361 
1362     /* Fetch the SPSR registers
1363      *
1364      * KVM SPSRs 0-4 map to QEMU banks 1-5
1365      */
1366     for (i = 0; i < KVM_NR_SPSR; i++) {
1367         reg.id = AARCH64_CORE_REG(spsr[i]);
1368         reg.addr = (uintptr_t) &env->banked_spsr[i + 1];
1369         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1370         if (ret) {
1371             return ret;
1372         }
1373     }
1374 
1375     el = arm_current_el(env);
1376     if (el > 0 && !is_a64(env)) {
1377         i = bank_number(env->uncached_cpsr & CPSR_M);
1378         env->spsr = env->banked_spsr[i];
1379     }
1380 
1381     if (cpu_isar_feature(aa64_sve, cpu)) {
1382         ret = kvm_arch_get_sve(cs);
1383     } else {
1384         ret = kvm_arch_get_fpsimd(cs);
1385     }
1386     if (ret) {
1387         return ret;
1388     }
1389 
1390     reg.addr = (uintptr_t)(&fpr);
1391     reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpsr);
1392     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1393     if (ret) {
1394         return ret;
1395     }
1396     vfp_set_fpsr(env, fpr);
1397 
1398     reg.addr = (uintptr_t)(&fpr);
1399     reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpcr);
1400     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1401     if (ret) {
1402         return ret;
1403     }
1404     vfp_set_fpcr(env, fpr);
1405 
1406     ret = kvm_get_vcpu_events(cpu);
1407     if (ret) {
1408         return ret;
1409     }
1410 
1411     if (!write_kvmstate_to_list(cpu)) {
1412         return -EINVAL;
1413     }
1414     /* Note that it's OK to have registers which aren't in CPUState,
1415      * so we can ignore a failure return here.
1416      */
1417     write_list_to_cpustate(cpu);
1418 
1419     kvm_arm_sync_mpstate_to_qemu(cpu);
1420 
1421     /* TODO: other registers */
1422     return ret;
1423 }
1424 
1425 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
1426 {
1427     ram_addr_t ram_addr;
1428     hwaddr paddr;
1429 
1430     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
1431 
1432     if (acpi_ghes_present() && addr) {
1433         ram_addr = qemu_ram_addr_from_host(addr);
1434         if (ram_addr != RAM_ADDR_INVALID &&
1435             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
1436             kvm_hwpoison_page_add(ram_addr);
1437             /*
1438              * If this is a BUS_MCEERR_AR, we know we have been called
1439              * synchronously from the vCPU thread, so we can easily
1440              * synchronize the state and inject an error.
1441              *
1442              * TODO: we currently don't tell the guest at all about
1443              * BUS_MCEERR_AO. In that case we might be called either
1444              * synchronously from the vCPU thread, or a bit
1445              * later from the main thread, so doing the injection of
1446              * the error would be more complicated.
1447              */
1448             if (code == BUS_MCEERR_AR) {
1449                 kvm_cpu_synchronize_state(c);
1450                 if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
1451                     kvm_inject_arm_sea(c);
1452                 } else {
1453                     error_report("failed to record the error");
1454                     abort();
1455                 }
1456             }
1457             return;
1458         }
1459         if (code == BUS_MCEERR_AO) {
1460             error_report("Hardware memory error at addr %p for memory used by "
1461                 "QEMU itself instead of guest system!", addr);
1462         }
1463     }
1464 
1465     if (code == BUS_MCEERR_AR) {
1466         error_report("Hardware memory error!");
1467         exit(1);
1468     }
1469 }
1470 
1471 /* C6.6.29 BRK instruction */
1472 static const uint32_t brk_insn = 0xd4200000;
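
/*
 * Illustrative sketch (not called anywhere, name is an example only):
 * brk_insn above is BRK #0. The 16-bit immediate of a BRK instruction sits
 * in bits [20:5], so other immediates could be encoded as below; QEMU only
 * ever plants BRK #0 for software breakpoints.
 */
static inline uint32_t example_brk_encoding(uint16_t imm16)
{
    return brk_insn | ((uint32_t)imm16 << 5);
}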
1473 
1474 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
1475 {
1476     if (have_guest_debug) {
1477         if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 0) ||
1478             cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk_insn, 4, 1)) {
1479             return -EINVAL;
1480         }
1481         return 0;
1482     } else {
1483         error_report("guest debug not supported on this kernel");
1484         return -EINVAL;
1485     }
1486 }
1487 
1488 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
1489 {
1490     static uint32_t brk;
1491 
1492     if (have_guest_debug) {
1493         if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk, 4, 0) ||
1494             brk != brk_insn ||
1495             cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 1)) {
1496             return -EINVAL;
1497         }
1498         return 0;
1499     } else {
1500         error_report("guest debug not supported on this kernel");
1501         return -EINVAL;
1502     }
1503 }
1504 
1505 /* See v8 ARM ARM D7.2.27 ESR_ELx, Exception Syndrome Register
1506  *
1507  * To minimise translating between kernel and user-space the kernel
1508  * ABI just provides user-space with the full exception syndrome
1509  * register value to be decoded in QEMU.
1510  */
1511 
1512 bool kvm_arm_handle_debug(CPUState *cs, struct kvm_debug_exit_arch *debug_exit)
1513 {
1514     int hsr_ec = syn_get_ec(debug_exit->hsr);
1515     ARMCPU *cpu = ARM_CPU(cs);
1516     CPUARMState *env = &cpu->env;
1517 
1518     /* Ensure PC is synchronised */
1519     kvm_cpu_synchronize_state(cs);
1520 
1521     switch (hsr_ec) {
1522     case EC_SOFTWARESTEP:
1523         if (cs->singlestep_enabled) {
1524             return true;
1525         } else {
1526             /*
1527              * The kernel should have suppressed the guest's ability to
1528              * single step at this point so something has gone wrong.
1529              */
1530             error_report("%s: guest single-step while debugging unsupported"
1531                          " (%"PRIx64", %"PRIx32")",
1532                          __func__, env->pc, debug_exit->hsr);
1533             return false;
1534         }
1535         break;
1536     case EC_AA64_BKPT:
1537         if (kvm_find_sw_breakpoint(cs, env->pc)) {
1538             return true;
1539         }
1540         break;
1541     case EC_BREAKPOINT:
1542         if (find_hw_breakpoint(cs, env->pc)) {
1543             return true;
1544         }
1545         break;
1546     case EC_WATCHPOINT:
1547     {
1548         CPUWatchpoint *wp = find_hw_watchpoint(cs, debug_exit->far);
1549         if (wp) {
1550             cs->watchpoint_hit = wp;
1551             return true;
1552         }
1553         break;
1554     }
1555     default:
1556         error_report("%s: unhandled debug exit (%"PRIx32", %"PRIx64")",
1557                      __func__, debug_exit->hsr, env->pc);
1558     }
1559 
1560     /* If we are not handling the debug exception it must belong to
1561      * the guest. Let's re-use the existing TCG interrupt code to set
1562      * everything up properly.
1563      */
1564     cs->exception_index = EXCP_BKPT;
1565     env->exception.syndrome = debug_exit->hsr;
1566     env->exception.vaddress = debug_exit->far;
1567     env->exception.target_el = 1;
1568     qemu_mutex_lock_iothread();
1569     arm_cpu_do_interrupt(cs);
1570     qemu_mutex_unlock_iothread();
1571 
1572     return false;
1573 }
1574 
1575 #define ARM64_REG_ESR_EL1 ARM64_SYS_REG(3, 0, 5, 2, 0)
1576 #define ARM64_REG_TCR_EL1 ARM64_SYS_REG(3, 0, 2, 0, 2)
1577 
1578 /*
1579  * ESR_EL1
1580  * ISS encoding
1581  * AARCH64: DFSC,   bits [5:0]
1582  * AARCH32:
1583  *      TTBCR.EAE == 0
1584  *          FS[4]   - DFSR[10]
1585  *          FS[3:0] - DFSR[3:0]
1586  *      TTBCR.EAE == 1
1587  *          FS, bits [5:0]
1588  */
1589 #define ESR_DFSC(aarch64, lpae, v)        \
1590     ((aarch64 || (lpae)) ? ((v) & 0x3F)   \
1591                : (((v) >> 6) | ((v) & 0x1F)))
1592 
1593 #define ESR_DFSC_EXTABT(aarch64, lpae) \
1594     ((aarch64) ? 0x10 : (lpae) ? 0x10 : 0x8)
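
/*
 * Illustrative sketch (not called anywhere, name is an example only): how
 * the macros above classify a fault status value as a synchronous external
 * abort; the values in the comment are examples.
 */
static inline bool example_is_extabt(bool aarch64, bool lpae, uint64_t fsr)
{
    /*
     * e.g. aarch64=true:              fsr = 0x10 -> DFSC 0x10 -> true
     *      aarch64=false, lpae=false: fsr = 0x8 (short-descriptor FS=0b01000)
     *                                            -> DFSC 0x8  -> true
     */
    return ESR_DFSC(aarch64, lpae, fsr) == ESR_DFSC_EXTABT(aarch64, lpae);
}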
1595 
1596 bool kvm_arm_verify_ext_dabt_pending(CPUState *cs)
1597 {
1598     uint64_t dfsr_val;
1599 
1600     if (!kvm_get_one_reg(cs, ARM64_REG_ESR_EL1, &dfsr_val)) {
1601         ARMCPU *cpu = ARM_CPU(cs);
1602         CPUARMState *env = &cpu->env;
1603         int aarch64_mode = arm_feature(env, ARM_FEATURE_AARCH64);
1604         int lpae = 0;
1605 
1606         if (!aarch64_mode) {
1607             uint64_t ttbcr;
1608 
1609             if (!kvm_get_one_reg(cs, ARM64_REG_TCR_EL1, &ttbcr)) {
1610                 lpae = arm_feature(env, ARM_FEATURE_LPAE)
1611                         && (ttbcr & TTBCR_EAE);
1612             }
1613         }
1614         /*
1615          * The verification here is based on the DFSC bits
1616          * of the ESR_EL1 reg only
1617          */
1618         return (ESR_DFSC(aarch64_mode, lpae, dfsr_val) ==
1619                 ESR_DFSC_EXTABT(aarch64_mode, lpae));
1620     }
1621     return false;
1622 }
1623