/*
 * ARM SME Operations
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "tcg/tcg-gvec-desc.h"
#include "exec/helper-proto.h"
#include "accel/tcg/cpu-ldst.h"
#include "qemu/int128.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"

void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask)
{
    aarch64_set_svcr(env, val, mask);
}

void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
{
    uint32_t i;

    /*
     * Special case clearing the entire ZA space.
     * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
     * parts of the ZA storage outside of SVL.
     */
    if (imm == 0xff) {
        memset(env->zarray, 0, sizeof(env->zarray));
        return;
    }

    /*
     * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
     * so each row is discontiguous within ZA[].
     */
    for (i = 0; i < svl; i++) {
        if (imm & (1 << (i % 8))) {
            memset(&env->zarray[i], 0, svl);
        }
    }
}
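
/*
 * Illustrative sketch (a hypothetical helper, not used by the build) of
 * the row mapping the loop above relies on: per the interleaving rule,
 * row m of the .D tile ZADn lives in storage row n + 8 * m.  So, for
 * example, ZAD0 occupies rows 0, 8, 16, ... (the rows with i % 8 == 0),
 * which is why a single imm bit selects every eighth row.
 */
static inline uint32_t sme_zad_storage_row_example(uint32_t n, uint32_t m)
{
    return n + 8 * m;
}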


/*
 * When considering the ZA storage as an array of elements of
 * type T, the index within that array of the Nth element of
 * a vertical slice of a tile can be calculated like this,
 * regardless of the size of type T. This is because the tiles
 * are interleaved, so if type T is size N bytes then row 1 of
 * the tile is N rows away from row 0. The division by N to
 * convert a byte offset into an array index and the multiplication
 * by N to convert from vslice-index-within-the-tile to
 * the index within the ZA storage cancel out.
 */
#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
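
/*
 * Compile-time spot check of the cancellation argument above, under the
 * illustrative assumption T = uint32_t: element i of a vertical slice
 * sits i * sizeof(T) storage rows down, and each row holds
 * sizeof(ARMVectorReg) / sizeof(T) elements of T, so the two factors of
 * sizeof(T) cancel.
 */
QEMU_BUILD_BUG_ON(tile_vslice_index(3) !=
                  3 * sizeof(uint32_t) * (sizeof(ARMVectorReg) / sizeof(uint32_t)));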
74 
75 /*
76  * When doing byte arithmetic on the ZA storage, the element
77  * byteoff bytes away in a tile vertical slice is always this
78  * many bytes away in the ZA storage, regardless of the
79  * size of the tile element, assuming that byteoff is a multiple
80  * of the element size. Again this is because of the interleaving
81  * of the tiles. For instance if we have 1 byte per element then
82  * each row of the ZA storage has one byte of the vslice data,
83  * and (counting from 0) byte 8 goes in row 8 of the storage
84  * at offset (8 * row-size-in-bytes).
85  * If we have 8 bytes per element then each row of the ZA storage
86  * has 8 bytes of the data, but there are 8 interleaved tiles and
87  * so byte 8 of the data goes into row 1 of the tile,
88  * which is again row 8 of the storage, so the offset is still
89  * (8 * row-size-in-bytes). Similarly for other element sizes.
90  */
91 #define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
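
/*
 * A related spot check (illustrative): expressing the same element both
 * ways must agree, i.e. a byte offset of i * sizeof(T) into the vertical
 * slice lands at the byte address of element index i.
 */
QEMU_BUILD_BUG_ON(tile_vslice_offset(2 * sizeof(uint64_t)) !=
                  tile_vslice_index(2) * sizeof(uint64_t));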


/*
 * Move Zreg vector to ZArray column.
 */
#define DO_MOVA_C(NAME, TYPE, H)                                        \
void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
{                                                                       \
    int i, oprsz = simd_oprsz(desc);                                    \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
            }                                                           \
            i += sizeof(TYPE);                                          \
            pg >>= sizeof(TYPE);                                        \
        } while (i & 15);                                               \
    }                                                                   \
}

DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
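
/*
 * Note on the predicate walk in DO_MOVA_C (an illustrative helper, not
 * used by the build): SVE predicates hold one bit per byte, so an element
 * of size N is governed by the low bit of each N-bit group.  The
 * "pg >>= sizeof(TYPE)" step above consumes exactly one element's worth
 * of predicate bits per iteration.
 */
static inline bool sme_pred_bit_example(uint16_t pg, unsigned elt, size_t esize)
{
    /* For 4-byte elements this tests bits 0, 4, 8, 12 of the chunk. */
    return (pg >> (elt * esize)) & 1;
}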

void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pg = vg;
    uint64_t *n = vn;
    uint64_t *a = za;

    for (i = 0; i < oprsz; i++) {
        if (pg[H1(i)] & 1) {
            a[tile_vslice_index(i)] = n[i];
        }
    }
}

void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 16;
    uint16_t *pg = vg;
    Int128 *n = vn;
    Int128 *a = za;

    /*
     * Int128 is used here simply to copy 16 bytes, and to simplify
     * the address arithmetic.
     */
    for (i = 0; i < oprsz; i++) {
        if (pg[H2(i)] & 1) {
            a[tile_vslice_index(i)] = n[i];
        }
    }
}

#undef DO_MOVA_C

/*
 * Move ZArray column to Zreg vector.
 */
#define DO_MOVA_Z(NAME, TYPE, H)                                        \
void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
{                                                                       \
    int i, oprsz = simd_oprsz(desc);                                    \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
            }                                                           \
            i += sizeof(TYPE);                                          \
            pg >>= sizeof(TYPE);                                        \
        } while (i & 15);                                               \
    }                                                                   \
}

DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)

void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pg = vg;
    uint64_t *d = vd;
    uint64_t *a = za;

    for (i = 0; i < oprsz; i++) {
        if (pg[H1(i)] & 1) {
            d[i] = a[tile_vslice_index(i)];
        }
    }
}

void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 16;
    uint16_t *pg = vg;
    Int128 *d = vd;
    Int128 *a = za;

    /*
     * Int128 is used here simply to copy 16 bytes, and to simplify
     * the address arithmetic.
     */
    for (i = 0; i < oprsz; i++) {
        if (pg[H2(i)] & 1) {
            d[i] = a[tile_vslice_index(i)];
        }
    }
}

#undef DO_MOVA_Z

/*
 * Clear elements in a tile slice comprising len bytes.
 */

typedef void ClearFn(void *ptr, size_t off, size_t len);

static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    memset(ptr + off, 0, len);
}

static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; ++i) {
        *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 2) {
        *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 4) {
        *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 8) {
        *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memset(vptr + tile_vslice_offset(i + off), 0, 16);
    }
}

/*
 * Copy elements from an array into a tile slice comprising len bytes.
 */

typedef void CopyFn(void *dst, const void *src, size_t len);

static void copy_horizontal(void *dst, const void *src, size_t len)
{
    memcpy(dst, src, len);
}

static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    const uint8_t *src = vsrc;
    uint8_t *dst = vdst;
    size_t i;

    for (i = 0; i < len; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    const uint16_t *src = vsrc;
    uint16_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 2; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    const uint32_t *src = vsrc;
    uint32_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 4; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    const uint64_t *src = vsrc;
    uint64_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 8; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
    }
}

/*
 * Host and TLB primitives for vertical tile slice addressing.
 */

#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = HOST(host);                                                  \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}

#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    HOST(host, val);                                                        \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    TLB(env, useronly_clean_ptr(addr), val, ra);                            \
}

/*
 * The ARMVectorReg elements are stored in host-endian 64-bit units.
 * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
 * corresponds to storing the two 64-bit pieces in little-endian order.
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
    uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}

#define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    HOST(host, ptr[BE]);                                                    \
    HOST(host + 8, ptr[!BE]);                                               \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
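
/*
 * Illustrative sketch (a hypothetical helper, not used by the build) of
 * the 128-bit ordering described above: the two 64-bit memory words are
 * always placed in ZA in little-endian Elem[] order, so a big-endian
 * access only changes which memory word supplies which half.
 */
static inline void sme_ldq_order_example(uint64_t *ptr, uint64_t word0,
                                         uint64_t word1, bool be)
{
    /* word0 is the word at the lower address, word1 the one at addr + 8. */
    ptr[0] = be ? word1 : word0;
    ptr[1] = be ? word0 : word1;
}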

DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)

DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)

DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)

DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)

#undef DO_LD
#undef DO_ST
#undef DO_LDQ
#undef DO_STQ
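
/*
 * For readability, the expanded form of one macro instance (illustrative
 * only; the macro-generated sme_ld1b_v_host above is what is actually
 * used):
 */
static inline void sme_ld1b_v_host_example(void *za, intptr_t off, void *host)
{
    uint8_t val = ldub_p(host);                        /* host-side load */
    *(uint8_t *)(za + tile_vslice_offset(off)) = val;  /* vertical ZA store */
}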

/*
 * Common helper for all contiguous predicated loads.
 */

static inline QEMU_ALWAYS_INLINE
void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn,
             ClearFn *clr_fn,
             CopyFn *cpy_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The entire predicate was false; no load occurs.  */
        clr_fn(za, 0, reg_max);
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_READ, ra);

    /*
     * Handle MTE checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end.
         */
        ARMVectorReg scratch = { };

        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        cpy_fn(za, &scratch, reg_max);
        return;
#endif
    }

    /* The entire operation is in RAM, on valid pages. */

    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    if (!vertical) {
        memset(za, 0, reg_max);
    } else if (reg_off) {
        clr_fn(za, 0, reg_off);
    }

    set_helper_retaddr(ra);

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            } else if (vertical) {
                clr_fn(za, reg_off, esize);
            }
            reg_off += esize;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    clear_helper_retaddr();

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        set_helper_retaddr(ra);

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                } else if (vertical) {
                    clr_fn(za, reg_off, esize);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        clear_helper_retaddr();
    }
}

static inline QEMU_ALWAYS_INLINE
void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
                 target_ulong addr, uint32_t desc, uintptr_t ra,
                 const int esz, bool vertical,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn,
                 ClearFn *clr_fn,
                 CopyFn *cpy_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal SVE descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(mtedesc, bit55) ||
        tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
            host_fn, tlb_fn, clr_fn, cpy_fn);
}

#define DO_LD(L, END, ESZ)                                                 \
void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
            sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,           \
            clear_horizontal, copy_horizontal);                            \
}                                                                          \
void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
            sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,             \
            clear_vertical_##L, copy_vertical_##L);                        \
}                                                                          \
void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
                sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,       \
                clear_horizontal, copy_horizontal);                        \
}                                                                          \
void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
                sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,         \
                clear_vertical_##L, copy_vertical_##L);                    \
}

DO_LD(b, , MO_8)
DO_LD(h, _be, MO_16)
DO_LD(h, _le, MO_16)
DO_LD(s, _be, MO_32)
DO_LD(s, _le, MO_32)
DO_LD(d, _be, MO_64)
DO_LD(d, _le, MO_64)
DO_LD(q, _be, MO_128)
DO_LD(q, _le, MO_128)

#undef DO_LD

/*
 * Common helper for all contiguous predicated stores.
 */

static inline QEMU_ALWAYS_INLINE
void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The entire predicate was false; no store occurs.  */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_WRITE, ra);

    /*
     * Handle MTE checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, za, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    set_helper_retaddr(ra);

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            }
            reg_off += 1 << esz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    clear_helper_retaddr();

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        set_helper_retaddr(ra);

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                }
                reg_off += 1 << esz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        clear_helper_retaddr();
    }
}

static inline QEMU_ALWAYS_INLINE
void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
                 uint32_t desc, uintptr_t ra, int esz, bool vertical,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal SVE descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(mtedesc, bit55) ||
        tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
            vertical, host_fn, tlb_fn);
}

#define DO_ST(L, END, ESZ)                                                 \
void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
            sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);          \
}                                                                          \
void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
            sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);            \
}                                                                          \
void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
                sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);      \
}                                                                          \
void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
                sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);        \
}

DO_ST(b, , MO_8)
DO_ST(h, _be, MO_16)
DO_ST(h, _le, MO_16)
DO_ST(s, _be, MO_32)
DO_ST(s, _le, MO_32)
DO_ST(d, _be, MO_64)
DO_ST(d, _le, MO_64)
DO_ST(q, _be, MO_128)
DO_ST(q, _le, MO_128)

#undef DO_ST

void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    uint64_t *pn = vpn, *pm = vpm;
    uint32_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ) {
        uint64_t pa = pn[row >> 4];
        do {
            if (pa & 1) {
                for (col = 0; col < oprsz; ) {
                    uint64_t pb = pm[col >> 4];
                    do {
                        if (pb & 1) {
                            zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
                        }
                        pb >>= 4;
                    } while (++col & 15);
                }
            }
            pa >>= 4;
        } while (++row & 15);
    }
}
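
/*
 * The incremental shifting above is equivalent to this direct lookup
 * (an illustrative helper, not used by the build): 32-bit element i is
 * governed by predicate bit (i % 16) * 4 of 64-bit word i / 16, since
 * predicates carry one bit per byte.
 */
static inline bool sme_pred32_example(const uint64_t *pn, intptr_t i)
{
    return (pn[i >> 4] >> ((i & 15) * 4)) & 1;
}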

void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pn = vpn, *pm = vpm;
    uint64_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    zda[tile_vslice_index(row) + col] += zn[col];
                }
            }
        }
    }
}

void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    uint64_t *pn = vpn, *pm = vpm;
    uint32_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ) {
        uint64_t pa = pn[row >> 4];
        do {
            if (pa & 1) {
                uint32_t zn_row = zn[H4(row)];
                for (col = 0; col < oprsz; ) {
                    uint64_t pb = pm[col >> 4];
                    do {
                        if (pb & 1) {
                            zda[tile_vslice_index(row) + H4(col)] += zn_row;
                        }
                        pb >>= 4;
                    } while (++col & 15);
                }
            }
            pa >>= 4;
        } while (++row & 15);
    }
}

void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pn = vpn, *pm = vpm;
    uint64_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            uint64_t zn_row = zn[row];
            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    zda[tile_vslice_index(row) + col] += zn_row;
                }
            }
        }
    }
}

void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, float_status *fpst_in, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) << 31;
    uint16_t *pn = vpn, *pm = vpm;
    float_status fpst;

    /*
     * Make a copy of float_status because this operation does not
     * update the cumulative fp exception status.  It also produces
     * default NaNs.
     */
    fpst = *fpst_in;
    set_default_nan_mode(true, &fpst);

    for (row = 0; row < oprsz; ) {
        uint16_t pa = pn[H2(row >> 4)];
        do {
            if (pa & 1) {
                void *vza_row = vza + tile_vslice_offset(row);
                uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;

                for (col = 0; col < oprsz; ) {
                    uint16_t pb = pm[H2(col >> 4)];
                    do {
                        if (pb & 1) {
                            uint32_t *a = vza_row + H1_4(col);
                            uint32_t *m = vzm + H1_4(col);
                            *a = float32_muladd(n, *m, *a, 0, &fpst);
                        }
                        col += 4;
                        pb >>= 4;
                    } while (col & 15);
                }
            }
            row += 4;
            pa >>= 4;
        } while (row & 15);
    }
}
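
/*
 * Note (illustrative, a hypothetical helper not used by the build): the
 * 'neg' term above implements the subtracting FMOPS form as FMOPA with
 * the sign of each active n element pre-flipped.  Flipping bit 31 of the
 * raw bits negates a float32 directly, without going through the FPU.
 */
static inline uint32_t sme_fmops_neg_example(uint32_t n_bits, bool sub)
{
    return n_bits ^ ((uint32_t)sub << 31);
}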

void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, float_status *fpst_in, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint64_t neg = (uint64_t)simd_data(desc) << 63;
    uint64_t *za = vza, *zn = vzn, *zm = vzm;
    uint8_t *pn = vpn, *pm = vpm;
    float_status fpst = *fpst_in;

    set_default_nan_mode(true, &fpst);

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            uint64_t *za_row = &za[tile_vslice_index(row)];
            uint64_t n = zn[row] ^ neg;

            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    uint64_t *a = &za_row[col];
                    *a = float64_muladd(n, zm[col], *a, 0, &fpst);
                }
            }
        }
    }
}

/*
 * Alter PAIR as needed for controlling predicates being false,
 * and for NEG on an enabled row element.
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    /*
     * The pseudocode uses a conditional negate after the conditional zero.
     * It is simpler here to unconditionally negate before conditional zero.
     */
    pair ^= neg;
    if (!(pg & 1)) {
        pair &= 0xffff0000u;
    }
    if (!(pg & 4)) {
        pair &= 0x0000ffffu;
    }
    return pair;
}
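
/*
 * Worked example of the reordering argument above (an illustrative
 * helper, not used by the build): with only the low element active
 * (pg = 1) and both halves negated (neg = 0x80008000), 0x12345678 first
 * becomes 0x9234d678, then the inactive high half is zeroed, giving
 * 0x0000d678.  Zeroing first and negating the surviving half yields the
 * same value.
 */
static inline void f16mop_adj_pair_example(void)
{
    assert(f16mop_adj_pair(0x12345678u, 1, 0x80008000u) == 0x0000d678u);
}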

static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
                          float_status *s_f16, float_status *s_std,
                          float_status *s_odd)
{
    /*
     * We need three different float_status for different parts of this
     * operation:
     *  - the input conversion of the float16 values must use the
     *    f16-specific float_status, so that the FPCR.FZ16 control is applied
     *  - operations on float32 including the final accumulation must use
     *    the normal float_status, so that FPCR.FZ is applied
     *  - we have a pre-set-up copy of s_std which is set to round-to-odd,
     *    for the multiply (see below)
     */
    float64 e1r = float16_to_float64(e1 & 0xffff, true, s_f16);
    float64 e1c = float16_to_float64(e1 >> 16, true, s_f16);
    float64 e2r = float16_to_float64(e2 & 0xffff, true, s_f16);
    float64 e2c = float16_to_float64(e2 >> 16, true, s_f16);
    float64 t64;
    float32 t32;

    /*
     * The ARM pseudocode function FPDot performs both multiplies
     * and the add with a single rounding operation.  Emulate this
     * by performing the first multiply in round-to-odd, then doing
     * the second multiply as fused multiply-add, and rounding to
     * float32 all in one step.
     */
    t64 = float64_mul(e1r, e2r, s_odd);
    t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);

    /* This conversion is exact, because we've already rounded. */
    t32 = float64_to_float32(t64, s_std);

    /* The final accumulation step is not fused. */
    return float32_add(sum, t32, s_std);
}
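
/*
 * Usage sketch (illustrative, hypothetical values; not used by the
 * build): accumulating the pair-product of two float16 {1.0, 1.0}
 * vectors onto 0.0f gives 1*1 + 1*1 = 2.0f.  0x3c00 is float16 1.0.
 */
static inline float32 f16_dotadd_example(float_status *s_f16,
                                         float_status *s_std,
                                         float_status *s_odd)
{
    uint32_t ones = 0x3c003c00u;
    return f16_dotadd(float32_zero, ones, ones, s_f16, s_std, s_odd);
}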

void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, CPUARMState *env, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) * 0x80008000u;
    uint16_t *pn = vpn, *pm = vpm;
    float_status fpst_odd, fpst_std, fpst_f16;

    /*
     * Make copies of the fp status fields we use, because this operation
     * does not update the cumulative fp exception status.  It also
     * produces default NaNs. We also need a second copy of fp_status with
     * round-to-odd -- see above.
     */
    fpst_f16 = env->vfp.fp_status[FPST_A64_F16];
    fpst_std = env->vfp.fp_status[FPST_A64];
    set_default_nan_mode(true, &fpst_std);
    set_default_nan_mode(true, &fpst_f16);
    fpst_odd = fpst_std;
    set_float_rounding_mode(float_round_to_odd, &fpst_odd);

    for (row = 0; row < oprsz; ) {
        uint16_t prow = pn[H2(row >> 4)];
        do {
            void *vza_row = vza + tile_vslice_offset(row);
            uint32_t n = *(uint32_t *)(vzn + H1_4(row));

            n = f16mop_adj_pair(n, prow, neg);

            for (col = 0; col < oprsz; ) {
                uint16_t pcol = pm[H2(col >> 4)];
                do {
                    if (prow & pcol & 0b0101) {
                        uint32_t *a = vza_row + H1_4(col);
                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));

                        m = f16mop_adj_pair(m, pcol, 0);
                        *a = f16_dotadd(*a, n, m,
                                        &fpst_f16, &fpst_std, &fpst_odd);
                    }
                    col += 4;
                    pcol >>= 4;
                } while (col & 15);
            }
            row += 4;
            prow >>= 4;
        } while (row & 15);
    }
}

void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm,
                        void *vpn, void *vpm, CPUARMState *env, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) * 0x80008000u;
    uint16_t *pn = vpn, *pm = vpm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (row = 0; row < oprsz; ) {
            uint16_t prow = pn[H2(row >> 4)];
            do {
                void *vza_row = vza + tile_vslice_offset(row);
                uint32_t n = *(uint32_t *)(vzn + H1_4(row));

                n = f16mop_adj_pair(n, prow, neg);

                for (col = 0; col < oprsz; ) {
                    uint16_t pcol = pm[H2(col >> 4)];
                    do {
                        if (prow & pcol & 0b0101) {
                            uint32_t *a = vza_row + H1_4(col);
                            uint32_t m = *(uint32_t *)(vzm + H1_4(col));

                            m = f16mop_adj_pair(m, pcol, 0);
                            *a = bfdotadd_ebf(*a, n, m, &fpst, &fpst_odd);
                        }
                        col += 4;
                        pcol >>= 4;
                    } while (col & 15);
                }
                row += 4;
                prow >>= 4;
            } while (row & 15);
        }
    } else {
        for (row = 0; row < oprsz; ) {
            uint16_t prow = pn[H2(row >> 4)];
            do {
                void *vza_row = vza + tile_vslice_offset(row);
                uint32_t n = *(uint32_t *)(vzn + H1_4(row));

                n = f16mop_adj_pair(n, prow, neg);

                for (col = 0; col < oprsz; ) {
                    uint16_t pcol = pm[H2(col >> 4)];
                    do {
                        if (prow & pcol & 0b0101) {
                            uint32_t *a = vza_row + H1_4(col);
                            uint32_t m = *(uint32_t *)(vzm + H1_4(col));

                            m = f16mop_adj_pair(m, pcol, 0);
                            *a = bfdotadd(*a, n, m, &fpst);
                        }
                        col += 4;
                        pcol >>= 4;
                    } while (col & 15);
                }
                row += 4;
                prow >>= 4;
            } while (row & 15);
        }
    }
}

typedef uint32_t IMOPFn32(uint32_t, uint32_t, uint32_t, uint8_t, bool);
static inline void do_imopa_s(uint32_t *za, uint32_t *zn, uint32_t *zm,
                              uint8_t *pn, uint8_t *pm,
                              uint32_t desc, IMOPFn32 *fn)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    bool neg = simd_data(desc);

    for (row = 0; row < oprsz; ++row) {
        uint8_t pa = (pn[H1(row >> 1)] >> ((row & 1) * 4)) & 0xf;
        uint32_t *za_row = &za[tile_vslice_index(row)];
        uint32_t n = zn[H4(row)];

        for (col = 0; col < oprsz; ++col) {
            uint8_t pb = pm[H1(col >> 1)] >> ((col & 1) * 4);
            uint32_t *a = &za_row[H4(col)];

            *a = fn(n, zm[H4(col)], *a, pa & pb, neg);
        }
    }
}

typedef uint64_t IMOPFn64(uint64_t, uint64_t, uint64_t, uint8_t, bool);
static inline void do_imopa_d(uint64_t *za, uint64_t *zn, uint64_t *zm,
                              uint8_t *pn, uint8_t *pm,
                              uint32_t desc, IMOPFn64 *fn)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    bool neg = simd_data(desc);

    for (row = 0; row < oprsz; ++row) {
        uint8_t pa = pn[H1(row)];
        uint64_t *za_row = &za[tile_vslice_index(row)];
        uint64_t n = zn[row];

        for (col = 0; col < oprsz; ++col) {
            uint8_t pb = pm[H1(col)];
            uint64_t *a = &za_row[col];

            *a = fn(n, zm[col], *a, pa & pb, neg);
        }
    }
}

#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t sum = 0;                                                       \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_b(p);                                                  \
    sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                               \
    sum += (NTYPE)(n >> 8) * (MTYPE)(m >> 8);                               \
    sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                             \
    sum += (NTYPE)(n >> 24) * (MTYPE)(m >> 24);                             \
    return neg ? a - sum : a + sum;                                         \
}

#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint64_t sum = 0;                                                       \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_h(p);                                                  \
    sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0);                      \
    sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16);                    \
    sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32);                    \
    sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48);                    \
    return neg ? a - sum : a + sum;                                         \
}

DEF_IMOP_32(smopa_s, int8_t, int8_t)
DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
DEF_IMOP_32(usmopa_s, uint8_t, int8_t)

DEF_IMOP_64(smopa_d, int16_t, int16_t)
DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
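
/*
 * Worked example of the masking above (an illustrative helper, not used
 * by the build): expand_pred_b() widens each predicate bit to a full
 * byte, so p = 0b0101 keeps bytes 0 and 2 of n and zeroes bytes 1 and 3;
 * the inactive lanes then contribute nothing to the dot product.
 */
static inline void expand_pred_b_example(void)
{
    assert((0x11223344u & (uint32_t)expand_pred_b(0x05)) == 0x00220044u);
}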

#define DEF_IMOPH(NAME, S) \
    void HELPER(sme_##NAME##_##S)(void *vza, void *vzn, void *vzm,          \
                                  void *vpn, void *vpm, uint32_t desc)      \
    { do_imopa_##S(vza, vzn, vzm, vpn, vpm, desc, NAME##_##S); }

DEF_IMOPH(smopa, s)
DEF_IMOPH(umopa, s)
DEF_IMOPH(sumopa, s)
DEF_IMOPH(usmopa, s)

DEF_IMOPH(smopa, d)
DEF_IMOPH(umopa, d)
DEF_IMOPH(sumopa, d)
DEF_IMOPH(usmopa, d)