xref: /openbmc/qemu/target/arm/tcg/sme_helper.c (revision ea2fde5b)
1 /*
2  * ARM SME Operations
3  *
4  * Copyright (c) 2022 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "exec/helper-proto.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/exec-all.h"
27 #include "qemu/int128.h"
28 #include "fpu/softfloat.h"
29 #include "vec_internal.h"
30 #include "sve_ldst_internal.h"
31 
32 void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask)
33 {
34     aarch64_set_svcr(env, val, mask);
35 }
36 
37 void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
38 {
39     uint32_t i;
40 
41     /*
42      * Special case clearing the entire ZA space.
43      * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
44      * parts of the ZA storage outside of SVL.
45      */
46     if (imm == 0xff) {
47         memset(env->zarray, 0, sizeof(env->zarray));
48         return;
49     }
50 
51     /*
52      * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
53      * so each row is discontiguous within ZA[].
54      */
55     for (i = 0; i < svl; i++) {
56         if (imm & (1 << (i % 8))) {
57             memset(&env->zarray[i], 0, svl);
58         }
59     }
60 }
61 
62 
63 /*
64  * When considering the ZA storage as an array of elements of
65  * type T, the index within that array of the Nth element of
66  * a vertical slice of a tile can be calculated like this,
67  * regardless of the size of type T. This is because the tiles
68  * are interleaved, so if type T is size N bytes then row 1 of
69  * the tile is N rows away from row 0. The division by N to
70  * convert a byte offset into an array index and the multiplication
71  * by N to convert from vslice-index-within-the-tile to
72  * the index within the ZA storage cancel out.
 *
 * Note that "N" is used in two senses above: first for the position of
 * the element within the slice ("the Nth element"), and from "size N
 * bytes" onward for sizeof(T).  Only the latter is the factor that
 * cancels out.
 *
 * The result is an array subscript, so it must be applied to a pointer
 * that already has the element type, e.g. a[tile_vslice_index(i)].
73  */
74 #define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
75 
76 /*
77  * When doing byte arithmetic on the ZA storage, the element
78  * byteoff bytes away in a tile vertical slice is always this
79  * many bytes away in the ZA storage, regardless of the
80  * size of the tile element, assuming that byteoff is a multiple
81  * of the element size. Again this is because of the interleaving
82  * of the tiles. For instance if we have 1 byte per element then
83  * each row of the ZA storage has one byte of the vslice data,
84  * and (counting from 0) byte 8 goes in row 8 of the storage
85  * at offset (8 * row-size-in-bytes).
86  * If we have 8 bytes per element then each row of the ZA storage
87  * has 8 bytes of the data, but there are 8 interleaved tiles and
88  * so byte 8 of the data goes into row 1 of the tile,
89  * which is again row 8 of the storage, so the offset is still
90  * (8 * row-size-in-bytes). Similarly for other element sizes.
 *
 * The result is a byte offset: add it to an untyped (void *) or byte
 * pointer to the start of the slice and cast afterwards, e.g.
 * *(TYPE *)(za + tile_vslice_offset(off)).
91  */
92 #define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
93 
94 
95 /*
96  * Move Zreg vector to ZArray column.
97  */
98 #define DO_MOVA_C(NAME, TYPE, H)                                        \
99 void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
100 {                                                                       \
101     int i, oprsz = simd_oprsz(desc);                                    \
102     for (i = 0; i < oprsz; ) {                                          \
103         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
104         do {                                                            \
105             if (pg & 1) {                                               \
106                 *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
107             }                                                           \
108             i += sizeof(TYPE);                                          \
109             pg >>= sizeof(TYPE);                                        \
110         } while (i & 15);                                               \
111     }                                                                   \
112 }
113 
114 DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
115 DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
116 DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
117 
118 void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
119 {
120     int i, oprsz = simd_oprsz(desc) / 8;
121     uint8_t *pg = vg;
122     uint64_t *n = vn;
123     uint64_t *a = za;
124 
125     for (i = 0; i < oprsz; i++) {
126         if (pg[H1(i)] & 1) {
127             a[tile_vslice_index(i)] = n[i];
128         }
129     }
130 }
131 
132 void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
133 {
134     int i, oprsz = simd_oprsz(desc) / 16;
135     uint16_t *pg = vg;
136     Int128 *n = vn;
137     Int128 *a = za;
138 
139     /*
140      * Int128 is used here simply to copy 16 bytes, and to simplify
141      * the address arithmetic.
142      */
143     for (i = 0; i < oprsz; i++) {
144         if (pg[H2(i)] & 1) {
145             a[tile_vslice_index(i)] = n[i];
146         }
147     }
148 }
149 
150 #undef DO_MOVA_C
151 
152 /*
153  * Move ZArray column to Zreg vector.
154  */
155 #define DO_MOVA_Z(NAME, TYPE, H)                                        \
156 void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
157 {                                                                       \
158     int i, oprsz = simd_oprsz(desc);                                    \
159     for (i = 0; i < oprsz; ) {                                          \
160         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
161         do {                                                            \
162             if (pg & 1) {                                               \
163                 *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
164             }                                                           \
165             i += sizeof(TYPE);                                          \
166             pg >>= sizeof(TYPE);                                        \
167         } while (i & 15);                                               \
168     }                                                                   \
169 }
170 
171 DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
172 DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
173 DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
174 
175 void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
176 {
177     int i, oprsz = simd_oprsz(desc) / 8;
178     uint8_t *pg = vg;
179     uint64_t *d = vd;
180     uint64_t *a = za;
181 
182     for (i = 0; i < oprsz; i++) {
183         if (pg[H1(i)] & 1) {
184             d[i] = a[tile_vslice_index(i)];
185         }
186     }
187 }
188 
189 void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
190 {
191     int i, oprsz = simd_oprsz(desc) / 16;
192     uint16_t *pg = vg;
193     Int128 *d = vd;
194     Int128 *a = za;
195 
196     /*
197      * Int128 is used here simply to copy 16 bytes, and to simplify
198      * the address arithmetic.
199      */
200     for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
201         if (pg[H2(i)] & 1) {
202             d[i] = a[tile_vslice_index(i)];
203         }
204     }
205 }
206 
207 #undef DO_MOVA_Z
208 
/*
 * Zeroing of len bytes' worth of elements within a tile slice.
 *
 * A ClearFn receives the slice base pointer, the byte offset of the
 * first element to clear and the number of bytes to clear.
 */

typedef void ClearFn(void *ptr, size_t off, size_t len);

/* Horizontal slices are contiguous: one memset covers [off, off + len). */
static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    unsigned char *start = (unsigned char *)ptr + off;

    memset(start, 0, len);
}
219 
/*
 * Zero len one-byte elements of a vertical slice, starting at the
 * element at byte offset off within the slice.
 */
static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint8_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done++;
    }
}
226 
/*
 * Zero len bytes' worth of two-byte elements of a vertical slice,
 * starting at the element at byte offset off within the slice.
 */
static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint16_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done += 2;
    }
}
233 
/*
 * Zero len bytes' worth of four-byte elements of a vertical slice,
 * starting at the element at byte offset off within the slice.
 */
static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint32_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done += 4;
    }
}
240 
/*
 * Zero len bytes' worth of eight-byte elements of a vertical slice,
 * starting at the element at byte offset off within the slice.
 */
static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint64_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done += 8;
    }
}
247 
/*
 * Zero len bytes' worth of sixteen-byte elements of a vertical slice,
 * starting at the element at byte offset off within the slice.
 * Each element is cleared with its own 16-byte memset.
 */
static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        memset(vptr + tile_vslice_offset(off + done), 0, 16);
        done += 16;
    }
}
254 
/*
 * Copying of len bytes' worth of elements from a contiguous array
 * into a tile slice.
 *
 * A CopyFn receives the slice base pointer, the contiguous source
 * buffer and the number of bytes to transfer.
 */

typedef void CopyFn(void *dst, const void *src, size_t len);

/* Horizontal slices are contiguous, so this is a plain memcpy. */
static void copy_horizontal(void *dst, const void *src, size_t len)
{
    (void)memcpy(dst, src, len);
}
265 
/*
 * Scatter len one-byte elements from the contiguous buffer vsrc into
 * the vertical slice that begins at vdst.
 */
static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    const uint8_t *in = vsrc;
    uint8_t *out = vdst;
    size_t k = 0;

    while (k < len) {
        out[tile_vslice_index(k)] = in[k];
        k++;
    }
}
276 
/*
 * Scatter len / 2 two-byte elements from the contiguous buffer vsrc
 * into the vertical slice that begins at vdst.
 */
static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    const uint16_t *in = vsrc;
    uint16_t *out = vdst;
    const size_t count = len / 2;
    size_t k = 0;

    while (k < count) {
        out[tile_vslice_index(k)] = in[k];
        k++;
    }
}
287 
/*
 * Scatter len / 4 four-byte elements from the contiguous buffer vsrc
 * into the vertical slice that begins at vdst.
 */
static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    const uint32_t *in = vsrc;
    uint32_t *out = vdst;
    const size_t count = len / 4;
    size_t k = 0;

    while (k < count) {
        out[tile_vslice_index(k)] = in[k];
        k++;
    }
}
298 
/*
 * Scatter len / 8 eight-byte elements from the contiguous buffer vsrc
 * into the vertical slice that begins at vdst.
 */
static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    const uint64_t *in = vsrc;
    uint64_t *out = vdst;
    const size_t count = len / 8;
    size_t k = 0;

    while (k < count) {
        out[tile_vslice_index(k)] = in[k];
        k++;
    }
}
309 
/*
 * Scatter len bytes' worth of sixteen-byte elements from the
 * contiguous buffer vsrc into the vertical slice that begins at vdst.
 * Each element is moved with its own 16-byte memcpy.
 */
static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    size_t done = 0;

    while (done < len) {
        memcpy(vdst + tile_vslice_offset(done), vsrc + done, 16);
        done += 16;
    }
}
316 
/*
 * Host and TLB primitives for vertical tile slice addressing.
 *
 * DO_LD(NAME, TYPE, HOST, TLB) defines two single-element loaders:
 *   sme_NAME_v_host: read one TYPE from the host pointer with HOST();
 *   sme_NAME_v_tlb:  read one TYPE from guest address addr with TLB(),
 *                    passing ra as the return address.
 * Both store the value into the vertical slice starting at za, at the
 * position given by tile_vslice_offset(off).
 */

#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE *slot = za + tile_vslice_offset(off);                              \
    *slot = HOST(host);                                                     \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE *slot = za + tile_vslice_offset(off);                              \
    *slot = TLB(env, useronly_clean_ptr(addr), ra);                         \
}
333 
/*
 * DO_ST(NAME, TYPE, HOST, TLB) defines two single-element storers.
 * Both fetch one TYPE from the vertical slice starting at za, at the
 * position given by tile_vslice_offset(off):
 *   sme_NAME_v_host: write it through the host pointer with HOST();
 *   sme_NAME_v_tlb:  write it to guest address addr with TLB(),
 *                    passing ra as the return address.
 */
#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE *slot = za + tile_vslice_offset(off);                              \
    TYPE elt = *slot;                                                       \
    HOST(host, elt);                                                        \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE *slot = za + tile_vslice_offset(off);                              \
    TYPE elt = *slot;                                                       \
    TLB(env, useronly_clean_ptr(addr), elt, ra);                            \
}
346 
/*
 * ARMVectorReg contents are kept as host-endian 64-bit units.  For a
 * 128-bit element, the order given by the Elem[] pseudocode means the
 * two 64-bit halves are stored in little-endian order.
 *
 * DO_LDQ(HNAME, VNAME, BE, HOST, TLB) defines four 128-bit loaders:
 *   HNAME_host / HNAME_tlb:     store at byte offset off from za;
 *   VNAME_v_host / VNAME_v_tlb: same, with off first converted by
 *                               tile_vslice_offset().
 * The two 8-byte halves are read in memory order; when BE is nonzero
 * the first half read lands in unit 1 and the second in unit 0,
 * otherwise the first lands in unit 0 and the second in unit 1.
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t first = HOST(host);                                            \
    uint64_t second = HOST(host + 8);                                       \
    uint64_t *unit = za + off;                                              \
    unit[0] = BE ? second : first;                                          \
    unit[1] = BE ? first : second;                                          \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t first = TLB(env, useronly_clean_ptr(addr), ra);                \
    uint64_t second = TLB(env, useronly_clean_ptr(addr + 8), ra);           \
    uint64_t *unit = za + off;                                              \
    unit[0] = BE ? second : first;                                          \
    unit[1] = BE ? first : second;                                          \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
376 
/*
 * DO_STQ(HNAME, VNAME, BE, HOST, TLB) defines four 128-bit storers:
 *   HNAME_host / HNAME_tlb:     read from byte offset off from za;
 *   VNAME_v_host / VNAME_v_tlb: same, with off first converted by
 *                               tile_vslice_offset().
 * The element is held as two 64-bit units.  Unit BE is written to the
 * first 8 bytes of the destination and unit !BE to the following
 * 8 bytes, in that order.
 */
#define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t *unit = za + off;                                              \
    HOST(host, unit[BE]);                                                   \
    HOST(host + 8, unit[!BE]);                                              \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    intptr_t byte_off = tile_vslice_offset(off);                            \
    HNAME##_host(za, byte_off, host);                                       \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t *unit = za + off;                                              \
    TLB(env, useronly_clean_ptr(addr), unit[BE], ra);                       \
    TLB(env, useronly_clean_ptr(addr + 8), unit[!BE], ra);                  \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    intptr_t byte_off = tile_vslice_offset(off);                            \
    HNAME##_tlb(env, za, byte_off, addr, ra);                               \
}
400 
/*
 * Vertical-slice load primitives, sme_<name>_v_host and
 * sme_<name>_v_tlb, for 1-, 2-, 4- and 8-byte elements.  The multi-byte
 * sizes come in a big-endian and a little-endian flavour.
 */
401 DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
402 DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
403 DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
404 DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
405 DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
406 DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
407 DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)
408
/*
 * 16-byte loads: each line defines both the sve_ld1qq_* primitives
 * (plain byte offset) and the sme_ld1q_*_v_* primitives (vertical
 * slice), built from two 8-byte accesses.
 */
409 DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
410 DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)
411
/* Vertical-slice store primitives, mirroring the loads above. */
412 DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
413 DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
414 DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
415 DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
416 DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
417 DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
418 DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)
419
/* 16-byte stores: sve_st1qq_* and sme_st1q_*_v_*, as for the loads. */
420 DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
421 DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)
422 
423 #undef DO_LD
424 #undef DO_ST
425 #undef DO_LDQ
426 #undef DO_STQ
427 
428 /*
429  * Common helper for all contiguous predicated loads.
 *
 * env:      CPU state, passed through to the probe and tlb_fn calls.
 * za:       base of the destination tile slice.
 * vg:       governing predicate, read in 64-bit words; the bit at
 *           position reg_off decides whether the element at byte
 *           offset reg_off is active.
 * addr:     guest address of element 0; element at reg_off is loaded
 *           from addr + reg_off.
 * desc:     simd descriptor; simd_oprsz(desc) is the slice length in
 *           bytes (reg_max).
 * ra:       host return address forwarded to the faulting primitives.
 * esz:      log2 of the element size in bytes.
 * mtedesc:  when nonzero, sve_cont_ldst_mte_check() is run.
 * vertical: selects how inactive elements are zeroed on the RAM path
 *           (see below).
 * host_fn:  stores one element, read from a host pointer, at reg_off.
 * tlb_fn:   stores one element, read from a guest address, at reg_off.
 * clr_fn:   zeroes a byte range of the slice.
 * cpy_fn:   copies a contiguous buffer into the slice.
430  */
431
432 static inline QEMU_ALWAYS_INLINE
433 void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
434              const target_ulong addr, uint32_t desc, const uintptr_t ra,
435              const int esz, uint32_t mtedesc, bool vertical,
436              sve_ldst1_host_fn *host_fn,
437              sve_ldst1_tlb_fn *tlb_fn,
438              ClearFn *clr_fn,
439              CopyFn *cpy_fn)
440 {
441     const intptr_t reg_max = simd_oprsz(desc);
442     const intptr_t esize = 1 << esz;
443     intptr_t reg_off, reg_last;
444     SVEContLdSt info;
445     void *host;
446     int flags;
447
448     /* Find the active elements.  */
449     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
450         /* The entire predicate was false; no load occurs.  */
451         clr_fn(za, 0, reg_max);
452         return;
453     }
454
455     /* Probe the page(s).  Exit with exception for any invalid page. */
456     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
457
458     /* Handle watchpoints for all active elements. */
459     sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
460                               BP_MEM_READ, ra);
461
462     /*
463      * Handle mte checks for all active elements.
464      * Since TBI must be set for MTE, !mtedesc => !mte_active.
465      */
466     if (mtedesc) {
467         sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
468                                 mtedesc, ra);
469     }
470
471     flags = info.page[0].flags | info.page[1].flags;
472     if (unlikely(flags != 0)) {
473 #ifdef CONFIG_USER_ONLY
474         g_assert_not_reached();
475 #else
476         /*
477          * At least one page includes MMIO.
478          * Any bus operation can fail with cpu_transaction_failed,
479          * which for ARM will raise SyncExternal.  Perform the load
480          * into scratch memory to preserve register state until the end.
481          */
        /*
         * NOTE(review): tlb_fn is handed &scratch as its base below, and
         * cpy_fn afterwards reads scratch as one contiguous buffer.  The
         * vertical helpers in this file pass sme_ld1*_v_tlb, which
         * applies tile_vslice_offset() to reg_off; with a single
         * ARMVectorReg as the base that looks like it lands outside
         * scratch for reg_off > 0.  Confirm, and if so fill scratch with
         * a contiguous-layout tlb_fn instead.
         */
482         ARMVectorReg scratch = { };
483
        /*
         * Walk from the first active element to the last one reported:
         * the end of the second page if it has active elements,
         * otherwise the page-crossing element, otherwise the end of the
         * first page.
         */
484         reg_off = info.reg_off_first[0];
485         reg_last = info.reg_off_last[1];
486         if (reg_last < 0) {
487             reg_last = info.reg_off_split;
488             if (reg_last < 0) {
489                 reg_last = info.reg_off_last[0];
490             }
491         }
492
493         do {
494             uint64_t pg = vg[reg_off >> 6];
495             do {
496                 if ((pg >> (reg_off & 63)) & 1) {
497                     tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
498                 }
499                 reg_off += esize;
500             } while (reg_off & 63);
501         } while (reg_off <= reg_last);
502
        /* Every access succeeded; only now is the slice overwritten. */
503         cpy_fn(za, &scratch, reg_max);
504         return;
505 #endif
506     }
507
508     /* The entire operation is in RAM, on valid pages. */
509
510     reg_off = info.reg_off_first[0];
511     reg_last = info.reg_off_last[0];
512     host = info.page[0].host;
513
    /*
     * Zero the inactive elements.  A horizontal slice is cleared in
     * full with one memset before any element is loaded.  For a
     * vertical slice only the part before the first active element is
     * cleared here, and the loops below clear each inactive element
     * they step over.
     * NOTE(review): nothing visible in this function clears vertical
     * elements beyond the last one the loops visit -- confirm whether
     * stale data can remain after the final active element.
     */
514     if (!vertical) {
515         memset(za, 0, reg_max);
516     } else if (reg_off) {
517         clr_fn(za, 0, reg_off);
518     }
519
    /* Elements wholly within the first page, via the host pointer. */
520     while (reg_off <= reg_last) {
521         uint64_t pg = vg[reg_off >> 6];
522         do {
523             if ((pg >> (reg_off & 63)) & 1) {
524                 host_fn(za, reg_off, host + reg_off);
525             } else if (vertical) {
526                 clr_fn(za, reg_off, esize);
527             }
528             reg_off += esize;
529         } while (reg_off <= reg_last && (reg_off & 63));
530     }
531
532     /*
533      * Use the slow path to manage the cross-page misalignment.
534      * But we know this is RAM and cannot trap.
535      */
536     reg_off = info.reg_off_split;
537     if (unlikely(reg_off >= 0)) {
538         tlb_fn(env, za, reg_off, addr + reg_off, ra);
539     }
540
    /* Elements wholly within the second page, if it has any. */
541     reg_off = info.reg_off_first[1];
542     if (unlikely(reg_off >= 0)) {
543         reg_last = info.reg_off_last[1];
544         host = info.page[1].host;
545
546         do {
547             uint64_t pg = vg[reg_off >> 6];
548             do {
549                 if ((pg >> (reg_off & 63)) & 1) {
550                     host_fn(za, reg_off, host + reg_off);
551                 } else if (vertical) {
552                     clr_fn(za, reg_off, esize);
553                 }
554                 reg_off += esize;
555             } while (reg_off & 63);
556         } while (reg_off <= reg_last);
557     }
558 }
559 
560 static inline QEMU_ALWAYS_INLINE
561 void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
562                  target_ulong addr, uint32_t desc, uintptr_t ra,
563                  const int esz, bool vertical,
564                  sve_ldst1_host_fn *host_fn,
565                  sve_ldst1_tlb_fn *tlb_fn,
566                  ClearFn *clr_fn,
567                  CopyFn *cpy_fn)
568 {
569     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
570     int bit55 = extract64(addr, 55, 1);
571 
572     /* Remove mtedesc from the normal sve descriptor. */
573     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
574 
575     /* Perform gross MTE suppression early. */
576     if (!tbi_check(mtedesc, bit55) ||
577         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
578         mtedesc = 0;
579     }
580 
581     sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
582             host_fn, tlb_fn, clr_fn, cpy_fn);
583 }
584 
/*
 * DO_LD(L, END, ESZ) emits the four TCG load helpers for one element
 * size letter L and endianness suffix END:
 *   sme_ld1<L><END>_h      horizontal slice, no MTE
 *   sme_ld1<L><END>_v      vertical slice, no MTE
 *   sme_ld1<L><END>_h_mte  horizontal slice, via sme_ld1_mte()
 *   sme_ld1<L><END>_v_mte  vertical slice, via sme_ld1_mte()
 * The horizontal variants use the sve_ld1<L><L><END>_host/_tlb
 * primitives together with clear_horizontal/copy_horizontal; the
 * vertical variants use the sme_ld1<L><END>_v_host/_v_tlb primitives
 * together with clear_vertical_<L>/copy_vertical_<L>.
 * GETPC() is evaluated in the helper itself and passed down as ra.
 */
585 #define DO_LD(L, END, ESZ)                                                 \
586 void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
587                                  target_ulong addr, uint32_t desc)         \
588 {                                                                          \
589     sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
590             sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,           \
591             clear_horizontal, copy_horizontal);                            \
592 }                                                                          \
593 void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
594                                  target_ulong addr, uint32_t desc)         \
595 {                                                                          \
596     sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
597             sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,             \
598             clear_vertical_##L, copy_vertical_##L);                        \
599 }                                                                          \
600 void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
601                                      target_ulong addr, uint32_t desc)     \
602 {                                                                          \
603     sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
604                 sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,       \
605                 clear_horizontal, copy_horizontal);                        \
606 }                                                                          \
607 void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
608                                      target_ulong addr, uint32_t desc)     \
609 {                                                                          \
610     sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
611                 sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,         \
612                 clear_vertical_##L, copy_vertical_##L);                    \
613 }
614
/* The byte form has no endianness suffix, hence the empty END. */
615 DO_LD(b, , MO_8)
616 DO_LD(h, _be, MO_16)
617 DO_LD(h, _le, MO_16)
618 DO_LD(s, _be, MO_32)
619 DO_LD(s, _le, MO_32)
620 DO_LD(d, _be, MO_64)
621 DO_LD(d, _le, MO_64)
622 DO_LD(q, _be, MO_128)
623 DO_LD(q, _le, MO_128)
624 
625 #undef DO_LD
626 
627 /*
628  * Common helper for all contiguous predicated stores.
 *
 * env:      CPU state, passed through to the probe and tlb_fn calls.
 * za:       base of the source tile slice.
 * vg:       governing predicate, read in 64-bit words; the bit at
 *           position reg_off decides whether the element at byte
 *           offset reg_off is active.
 * addr:     guest address of element 0; element at reg_off is stored
 *           to addr + reg_off.
 * desc:     simd descriptor; simd_oprsz(desc) is the slice length in
 *           bytes (reg_max).
 * ra:       host return address forwarded to the faulting primitives.
 * esz:      log2 of the element size in bytes.
 * mtedesc:  when nonzero, sve_cont_ldst_mte_check() is run.
 * vertical: not referenced in this function body; the slice
 *           orientation is carried entirely by host_fn and tlb_fn.
 * host_fn:  writes the element at reg_off through a host pointer.
 * tlb_fn:   writes the element at reg_off to a guest address.
629  */
630
631 static inline QEMU_ALWAYS_INLINE
632 void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
633              const target_ulong addr, uint32_t desc, const uintptr_t ra,
634              const int esz, uint32_t mtedesc, bool vertical,
635              sve_ldst1_host_fn *host_fn,
636              sve_ldst1_tlb_fn *tlb_fn)
637 {
638     const intptr_t reg_max = simd_oprsz(desc);
639     const intptr_t esize = 1 << esz;
640     intptr_t reg_off, reg_last;
641     SVEContLdSt info;
642     void *host;
643     int flags;
644
645     /* Find the active elements.  */
646     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
647         /* The entire predicate was false; no store occurs.  */
648         return;
649     }
650
651     /* Probe the page(s).  Exit with exception for any invalid page. */
652     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
653
654     /* Handle watchpoints for all active elements. */
655     sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
656                               BP_MEM_WRITE, ra);
657
658     /*
659      * Handle mte checks for all active elements.
660      * Since TBI must be set for MTE, !mtedesc => !mte_active.
661      */
662     if (mtedesc) {
663         sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
664                                 mtedesc, ra);
665     }
666
667     flags = info.page[0].flags | info.page[1].flags;
668     if (unlikely(flags != 0)) {
669 #ifdef CONFIG_USER_ONLY
670         g_assert_not_reached();
671 #else
672         /*
673          * At least one page includes MMIO.
674          * Any bus operation can fail with cpu_transaction_failed,
675          * which for ARM will raise SyncExternal.  We cannot avoid
676          * this fault and will leave with the store incomplete.
677          */
        /*
         * Walk from the first active element to the last one reported:
         * the end of the second page if it has active elements,
         * otherwise the page-crossing element, otherwise the end of the
         * first page.
         */
678         reg_off = info.reg_off_first[0];
679         reg_last = info.reg_off_last[1];
680         if (reg_last < 0) {
681             reg_last = info.reg_off_split;
682             if (reg_last < 0) {
683                 reg_last = info.reg_off_last[0];
684             }
685         }
686
687         do {
688             uint64_t pg = vg[reg_off >> 6];
689             do {
690                 if ((pg >> (reg_off & 63)) & 1) {
691                     tlb_fn(env, za, reg_off, addr + reg_off, ra);
692                 }
693                 reg_off += esize;
694             } while (reg_off & 63);
695         } while (reg_off <= reg_last);
696         return;
697 #endif
698     }
699
    /* The entire operation is in RAM: use host pointers where possible. */
700     reg_off = info.reg_off_first[0];
701     reg_last = info.reg_off_last[0];
702     host = info.page[0].host;
703
    /* Elements wholly within the first page. */
704     while (reg_off <= reg_last) {
705         uint64_t pg = vg[reg_off >> 6];
706         do {
707             if ((pg >> (reg_off & 63)) & 1) {
708                 host_fn(za, reg_off, host + reg_off);
709             }
            /* 1 << esz is the same value as esize above. */
710             reg_off += 1 << esz;
711         } while (reg_off <= reg_last && (reg_off & 63));
712     }
713
714     /*
715      * Use the slow path to manage the cross-page misalignment.
716      * But we know this is RAM and cannot trap.
717      */
718     reg_off = info.reg_off_split;
719     if (unlikely(reg_off >= 0)) {
720         tlb_fn(env, za, reg_off, addr + reg_off, ra);
721     }
722
    /* Elements wholly within the second page, if it has any. */
723     reg_off = info.reg_off_first[1];
724     if (unlikely(reg_off >= 0)) {
725         reg_last = info.reg_off_last[1];
726         host = info.page[1].host;
727
728         do {
729             uint64_t pg = vg[reg_off >> 6];
730             do {
731                 if ((pg >> (reg_off & 63)) & 1) {
732                     host_fn(za, reg_off, host + reg_off);
733                 }
734                 reg_off += 1 << esz;
735             } while (reg_off & 63);
736         } while (reg_off <= reg_last);
737     }
738 }
739 
740 static inline QEMU_ALWAYS_INLINE
741 void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
742                  uint32_t desc, uintptr_t ra, int esz, bool vertical,
743                  sve_ldst1_host_fn *host_fn,
744                  sve_ldst1_tlb_fn *tlb_fn)
745 {
746     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
747     int bit55 = extract64(addr, 55, 1);
748 
749     /* Remove mtedesc from the normal sve descriptor. */
750     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
751 
752     /* Perform gross MTE suppression early. */
753     if (!tbi_check(mtedesc, bit55) ||
754         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
755         mtedesc = 0;
756     }
757 
758     sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
759             vertical, host_fn, tlb_fn);
760 }
761 
/*
 * DO_ST(L, END, ESZ) emits the four TCG store helpers for one element
 * size letter L and endianness suffix END:
 *   sme_st1<L><END>_h      horizontal slice, no MTE
 *   sme_st1<L><END>_v      vertical slice, no MTE
 *   sme_st1<L><END>_h_mte  horizontal slice, via sme_st1_mte()
 *   sme_st1<L><END>_v_mte  vertical slice, via sme_st1_mte()
 * The horizontal variants use the sve_st1<L><L><END>_host/_tlb
 * primitives; the vertical variants use the
 * sme_st1<L><END>_v_host/_v_tlb primitives.
 * GETPC() is evaluated in the helper itself and passed down as ra.
 */
762 #define DO_ST(L, END, ESZ)                                                 \
763 void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
764                                  target_ulong addr, uint32_t desc)         \
765 {                                                                          \
766     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
767             sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);          \
768 }                                                                          \
769 void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
770                                  target_ulong addr, uint32_t desc)         \
771 {                                                                          \
772     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
773             sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);            \
774 }                                                                          \
775 void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
776                                      target_ulong addr, uint32_t desc)     \
777 {                                                                          \
778     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
779                 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);      \
780 }                                                                          \
781 void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
782                                      target_ulong addr, uint32_t desc)     \
783 {                                                                          \
784     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
785                 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);        \
786 }
787
/* The byte form has no endianness suffix, hence the empty END. */
788 DO_ST(b, , MO_8)
789 DO_ST(h, _be, MO_16)
790 DO_ST(h, _le, MO_16)
791 DO_ST(s, _be, MO_32)
792 DO_ST(s, _le, MO_32)
793 DO_ST(d, _be, MO_64)
794 DO_ST(d, _le, MO_64)
795 DO_ST(q, _be, MO_128)
796 DO_ST(q, _le, MO_128)
797 
798 #undef DO_ST
799 
800 void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
801                          void *vpm, uint32_t desc)
802 {
803     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
804     uint64_t *pn = vpn, *pm = vpm;
805     uint32_t *zda = vzda, *zn = vzn;
806 
807     for (row = 0; row < oprsz; ) {
808         uint64_t pa = pn[row >> 4];
809         do {
810             if (pa & 1) {
811                 for (col = 0; col < oprsz; ) {
812                     uint64_t pb = pm[col >> 4];
813                     do {
814                         if (pb & 1) {
815                             zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
816                         }
817                         pb >>= 4;
818                     } while (++col & 15);
819                 }
820             }
821             pa >>= 4;
822         } while (++row & 15);
823     }
824 }
825 
826 void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
827                          void *vpm, uint32_t desc)
828 {
829     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
830     uint8_t *pn = vpn, *pm = vpm;
831     uint64_t *zda = vzda, *zn = vzn;
832 
833     for (row = 0; row < oprsz; ++row) {
834         if (pn[H1(row)] & 1) {
835             for (col = 0; col < oprsz; ++col) {
836                 if (pm[H1(col)] & 1) {
837                     zda[tile_vslice_index(row) + col] += zn[col];
838                 }
839             }
840         }
841     }
842 }
843 
844 void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
845                          void *vpm, uint32_t desc)
846 {
847     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
848     uint64_t *pn = vpn, *pm = vpm;
849     uint32_t *zda = vzda, *zn = vzn;
850 
851     for (row = 0; row < oprsz; ) {
852         uint64_t pa = pn[row >> 4];
853         do {
854             if (pa & 1) {
855                 uint32_t zn_row = zn[H4(row)];
856                 for (col = 0; col < oprsz; ) {
857                     uint64_t pb = pm[col >> 4];
858                     do {
859                         if (pb & 1) {
860                             zda[tile_vslice_index(row) + H4(col)] += zn_row;
861                         }
862                         pb >>= 4;
863                     } while (++col & 15);
864                 }
865             }
866             pa >>= 4;
867         } while (++row & 15);
868     }
869 }
870 
871 void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
872                          void *vpm, uint32_t desc)
873 {
874     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
875     uint8_t *pn = vpn, *pm = vpm;
876     uint64_t *zda = vzda, *zn = vzn;
877 
878     for (row = 0; row < oprsz; ++row) {
879         if (pn[H1(row)] & 1) {
880             uint64_t zn_row = zn[row];
881             for (col = 0; col < oprsz; ++col) {
882                 if (pm[H1(col)] & 1) {
883                     zda[tile_vslice_index(row) + col] += zn_row;
884                 }
885             }
886         }
887     }
888 }
889 
890 void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
891                          void *vpm, void *vst, uint32_t desc)
892 {
893     intptr_t row, col, oprsz = simd_maxsz(desc);
894     uint32_t neg = simd_data(desc) << 31;
895     uint16_t *pn = vpn, *pm = vpm;
896     float_status fpst;
897 
898     /*
899      * Make a copy of float_status because this operation does not
900      * update the cumulative fp exception status.  It also produces
901      * default nans.
902      */
903     fpst = *(float_status *)vst;
904     set_default_nan_mode(true, &fpst);
905 
906     for (row = 0; row < oprsz; ) {
907         uint16_t pa = pn[H2(row >> 4)];
908         do {
909             if (pa & 1) {
910                 void *vza_row = vza + tile_vslice_offset(row);
911                 uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
912 
913                 for (col = 0; col < oprsz; ) {
914                     uint16_t pb = pm[H2(col >> 4)];
915                     do {
916                         if (pb & 1) {
917                             uint32_t *a = vza_row + H1_4(col);
918                             uint32_t *m = vzm + H1_4(col);
919                             *a = float32_muladd(n, *m, *a, 0, vst);
920                         }
921                         col += 4;
922                         pb >>= 4;
923                     } while (col & 15);
924                 }
925             }
926             row += 4;
927             pa >>= 4;
928         } while (row & 15);
929     }
930 }
931 
932 void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
933                          void *vpm, void *vst, uint32_t desc)
934 {
935     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
936     uint64_t neg = (uint64_t)simd_data(desc) << 63;
937     uint64_t *za = vza, *zn = vzn, *zm = vzm;
938     uint8_t *pn = vpn, *pm = vpm;
939     float_status fpst = *(float_status *)vst;
940 
941     set_default_nan_mode(true, &fpst);
942 
943     for (row = 0; row < oprsz; ++row) {
944         if (pn[H1(row)] & 1) {
945             uint64_t *za_row = &za[tile_vslice_index(row)];
946             uint64_t n = zn[row] ^ neg;
947 
948             for (col = 0; col < oprsz; ++col) {
949                 if (pm[H1(col)] & 1) {
950                     uint64_t *a = &za_row[col];
951                     *a = float64_muladd(n, zm[col], *a, 0, &fpst);
952                 }
953             }
954         }
955     }
956 }
957 
/*
 * Prepare a PAIR of 16-bit values packed in 32 bits: flip the bits
 * selected by NEG, then zero each half whose controlling predicate
 * bit is clear.  Bit 0 of PG governs the low half and bit 2 the
 * high half.
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    uint32_t keep = 0;

    if (pg & 1) {
        keep |= 0x0000ffffu;
    }
    if (pg & 4) {
        keep |= 0xffff0000u;
    }

    /*
     * The pseudocode negates conditionally after zeroing.  Negating
     * unconditionally and masking afterwards gives the same result,
     * since a zeroed half is discarded either way.
     */
    return (pair ^ neg) & keep;
}
977 
978 static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
979                           float_status *s_std, float_status *s_odd)
980 {
981     float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
982     float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
983     float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
984     float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
985     float64 t64;
986     float32 t32;
987 
988     /*
989      * The ARM pseudocode function FPDot performs both multiplies
990      * and the add with a single rounding operation.  Emulate this
991      * by performing the first multiply in round-to-odd, then doing
992      * the second multiply as fused multiply-add, and rounding to
993      * float32 all in one step.
994      */
995     t64 = float64_mul(e1r, e2r, s_odd);
996     t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
997 
998     /* This conversion is exact, because we've already rounded. */
999     t32 = float64_to_float32(t64, s_std);
1000 
1001     /* The final accumulation step is not fused. */
1002     return float32_add(sum, t32, s_std);
1003 }
1004 
1005 void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
1006                          void *vpm, void *vst, uint32_t desc)
1007 {
1008     intptr_t row, col, oprsz = simd_maxsz(desc);
1009     uint32_t neg = simd_data(desc) * 0x80008000u;
1010     uint16_t *pn = vpn, *pm = vpm;
1011     float_status fpst_odd, fpst_std;
1012 
1013     /*
1014      * Make a copy of float_status because this operation does not
1015      * update the cumulative fp exception status.  It also produces
1016      * default nans.  Make a second copy with round-to-odd -- see above.
1017      */
1018     fpst_std = *(float_status *)vst;
1019     set_default_nan_mode(true, &fpst_std);
1020     fpst_odd = fpst_std;
1021     set_float_rounding_mode(float_round_to_odd, &fpst_odd);
1022 
1023     for (row = 0; row < oprsz; ) {
1024         uint16_t prow = pn[H2(row >> 4)];
1025         do {
1026             void *vza_row = vza + tile_vslice_offset(row);
1027             uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1028 
1029             n = f16mop_adj_pair(n, prow, neg);
1030 
1031             for (col = 0; col < oprsz; ) {
1032                 uint16_t pcol = pm[H2(col >> 4)];
1033                 do {
1034                     if (prow & pcol & 0b0101) {
1035                         uint32_t *a = vza_row + H1_4(col);
1036                         uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1037 
1038                         m = f16mop_adj_pair(m, pcol, 0);
1039                         *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
1040                     }
1041                     col += 4;
1042                     pcol >>= 4;
1043                 } while (col & 15);
1044             }
1045             row += 4;
1046             prow >>= 4;
1047         } while (row & 15);
1048     }
1049 }
1050 
1051 void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
1052                         void *vpm, uint32_t desc)
1053 {
1054     intptr_t row, col, oprsz = simd_maxsz(desc);
1055     uint32_t neg = simd_data(desc) * 0x80008000u;
1056     uint16_t *pn = vpn, *pm = vpm;
1057 
1058     for (row = 0; row < oprsz; ) {
1059         uint16_t prow = pn[H2(row >> 4)];
1060         do {
1061             void *vza_row = vza + tile_vslice_offset(row);
1062             uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1063 
1064             n = f16mop_adj_pair(n, prow, neg);
1065 
1066             for (col = 0; col < oprsz; ) {
1067                 uint16_t pcol = pm[H2(col >> 4)];
1068                 do {
1069                     if (prow & pcol & 0b0101) {
1070                         uint32_t *a = vza_row + H1_4(col);
1071                         uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1072 
1073                         m = f16mop_adj_pair(m, pcol, 0);
1074                         *a = bfdotadd(*a, n, m);
1075                     }
1076                     col += 4;
1077                     pcol >>= 4;
1078                 } while (col & 15);
1079             }
1080             row += 4;
1081             prow >>= 4;
1082         } while (row & 15);
1083     }
1084 }
1085 
/*
 * Per-element kernel for the integer outer products:
 * (n, m, accumulator, predicate bits, negate) -> new accumulator.
 */
typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);

/*
 * Walk ZN, ZM and the tile in 64-bit units and replace every tile
 * element with fn(zn[row], zm[col], za[row][col], pn[row] & pm[col], neg).
 * FN is invoked for every position; any predication is up to FN, which
 * receives the AND of the row and column predicate bytes.
 */
static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
                            uint8_t *pn, uint8_t *pm,
                            uint32_t desc, IMOPFn *fn)
{
    const intptr_t elems = simd_oprsz(desc) / 8;
    const bool subtract = simd_data(desc);
    intptr_t r, c;

    for (r = 0; r < elems; ++r) {
        uint64_t *acc_row = za + tile_vslice_index(r);
        const uint64_t n_elt = zn[r];
        const uint8_t p_row = pn[H1(r)];

        for (c = 0; c < elems; ++c) {
            acc_row[c] = fn(n_elt, zm[c], acc_row[c],
                            p_row & pm[H1(c)], subtract);
        }
    }
}
1108 
/*
 * Define NAME as a kernel working on two independent 32-bit lanes packed
 * in 64 bits.  In each lane the four bytes of N (as NTYPE) are multiplied
 * with the matching bytes of M (as MTYPE), and the sum of the products is
 * added to, or for NEG subtracted from, that lane of A.  Bytes of N whose
 * bit in P is clear contribute nothing.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t lo = 0, hi = 0;                                                \
    int sh;                                                                 \
    /* Zero the bytes of N whose predicate bit is clear. */                 \
    n &= expand_pred_b(p);                                                  \
    for (sh = 0; sh < 32; sh += 8) {                                        \
        lo += (NTYPE)(n >> sh) * (MTYPE)(m >> sh);                          \
        hi += (NTYPE)(n >> (sh + 32)) * (MTYPE)(m >> (sh + 32));            \
    }                                                                       \
    if (neg) {                                                              \
        lo = (uint32_t)a - lo;                                              \
        hi = (uint32_t)(a >> 32) - hi;                                      \
    } else {                                                                \
        lo = (uint32_t)a + lo;                                              \
        hi = (uint32_t)(a >> 32) + hi;                                      \
    }                                                                       \
    return ((uint64_t)hi << 32) | lo;                                       \
}
1130 
1131 #define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
1132 static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
1133 {                                                                           \
1134     uint64_t sum = 0;                                                       \
1135     /* Apply P to N as a mask, making the inactive elements 0. */           \
1136     n &= expand_pred_h(p);                                                  \
1137     sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                               \
1138     sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                             \
1139     sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32);                             \
1140     sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48);                             \
1141     return neg ? a - sum : a + sum;                                         \
1142 }
1143 
1144 DEF_IMOP_32(smopa_s, int8_t, int8_t)
1145 DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
1146 DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
1147 DEF_IMOP_32(usmopa_s, uint8_t, int8_t)
1148 
1149 DEF_IMOP_64(smopa_d, int16_t, int16_t)
1150 DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
1151 DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
1152 DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
1153 
1154 #define DEF_IMOPH(NAME) \
1155     void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn,      \
1156                             void *vpm, uint32_t desc)                        \
1157     { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }
1158 
1159 DEF_IMOPH(smopa_s)
1160 DEF_IMOPH(umopa_s)
1161 DEF_IMOPH(sumopa_s)
1162 DEF_IMOPH(usmopa_s)
1163 DEF_IMOPH(smopa_d)
1164 DEF_IMOPH(umopa_d)
1165 DEF_IMOPH(sumopa_d)
1166 DEF_IMOPH(usmopa_d)
1167