xref: /openbmc/qemu/target/arm/tcg/sme_helper.c (revision ec08d9a51e6af3cd3edbdbf2ca6e97a1e2b5f0d1)
/*
 * ARM SME Operations
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
19a3ef070eSClaudio Fontana 
20a3ef070eSClaudio Fontana #include "qemu/osdep.h"
21a3ef070eSClaudio Fontana #include "cpu.h"
22a3ef070eSClaudio Fontana #include "internals.h"
23a3ef070eSClaudio Fontana #include "tcg/tcg-gvec-desc.h"
24a3ef070eSClaudio Fontana #include "exec/helper-proto.h"
25a3ef070eSClaudio Fontana #include "exec/cpu_ldst.h"
26a3ef070eSClaudio Fontana #include "exec/exec-all.h"
27a3ef070eSClaudio Fontana #include "qemu/int128.h"
28a3ef070eSClaudio Fontana #include "fpu/softfloat.h"
29a3ef070eSClaudio Fontana #include "vec_internal.h"
30a3ef070eSClaudio Fontana #include "sve_ldst_internal.h"
31a3ef070eSClaudio Fontana 
/*
 * TCG helper entry point for updating SVCR.
 *
 * Forwards env, val and mask unchanged to aarch64_set_svcr(), which
 * performs the actual update.
 */
void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask)
{
    aarch64_set_svcr(env, val, mask);
}
36a3ef070eSClaudio Fontana 
/*
 * Zero selected parts of the ZA storage.
 *
 * @imm: 8-bit selection mask; ZA row r is cleared when bit (r % 8) is set.
 * @svl: number of ZA rows visited, and also the number of bytes cleared
 *       at the start of each selected row.
 */
void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
{
    /*
     * An all-ones mask selects every row, so wipe the whole backing
     * array in one go.  Bytes beyond SVL get zeroed as well, which is
     * permitted as CONSTRAINED UNPREDICTABLE behaviour.
     */
    if (imm == 0xff) {
        memset(env->zarray, 0, sizeof(env->zarray));
        return;
    }

    /*
     * ZAnH.D[m] lives in ZA[n + 8 * m], i.e. the rows of one 64-bit tile
     * are interleaved with the other seven tiles, so walk every row and
     * test the mask bit of the tile it belongs to.
     */
    for (uint32_t row = 0; row < svl; row++) {
        if ((imm >> (row % 8)) & 1) {
            memset(&env->zarray[row], 0, svl);
        }
    }
}
61a3ef070eSClaudio Fontana 
62a3ef070eSClaudio Fontana 
/*
 * Array index, within ZA viewed as an array of elements of type T, of
 * the i'th element of a vertical tile slice.
 *
 * The same expression works for every element size.  Tiles are
 * interleaved, so for an element of N bytes consecutive tile rows are
 * N storage rows apart; that factor of N is cancelled by the division
 * by N needed to turn a byte offset into an index into a T array.
 */
#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
75a3ef070eSClaudio Fontana 
/*
 * Byte offset within the ZA storage of the vertical-slice element that
 * sits byteoff bytes into the slice.  byteoff must be a multiple of the
 * element size; given that, the result does not depend on the element
 * size, again because the tiles are interleaved:
 *
 *  - 1-byte elements: every storage row holds one byte of the slice, so
 *    byte 8 (counting from 0) is in storage row 8, at offset
 *    (8 * row-size-in-bytes).
 *  - 8-byte elements: every row holds 8 bytes of the slice, but 8 tiles
 *    are interleaved, so byte 8 is in tile row 1, which is storage
 *    row 8 -- the offset is (8 * row-size-in-bytes) once more.
 *
 * The other element sizes work out the same way.
 */
#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
93a3ef070eSClaudio Fontana 
94a3ef070eSClaudio Fontana 
95a3ef070eSClaudio Fontana /*
96a3ef070eSClaudio Fontana  * Move Zreg vector to ZArray column.
97a3ef070eSClaudio Fontana  */
98a3ef070eSClaudio Fontana #define DO_MOVA_C(NAME, TYPE, H)                                        \
99a3ef070eSClaudio Fontana void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
100a3ef070eSClaudio Fontana {                                                                       \
101a3ef070eSClaudio Fontana     int i, oprsz = simd_oprsz(desc);                                    \
102a3ef070eSClaudio Fontana     for (i = 0; i < oprsz; ) {                                          \
103a3ef070eSClaudio Fontana         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
104a3ef070eSClaudio Fontana         do {                                                            \
105a3ef070eSClaudio Fontana             if (pg & 1) {                                               \
106a3ef070eSClaudio Fontana                 *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
107a3ef070eSClaudio Fontana             }                                                           \
108a3ef070eSClaudio Fontana             i += sizeof(TYPE);                                          \
109a3ef070eSClaudio Fontana             pg >>= sizeof(TYPE);                                        \
110a3ef070eSClaudio Fontana         } while (i & 15);                                               \
111a3ef070eSClaudio Fontana     }                                                                   \
112a3ef070eSClaudio Fontana }
113a3ef070eSClaudio Fontana 
DO_MOVA_C(sme_mova_cz_b,uint8_t,H1)114a3ef070eSClaudio Fontana DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
115a3ef070eSClaudio Fontana DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
116a3ef070eSClaudio Fontana DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
117a3ef070eSClaudio Fontana 
118a3ef070eSClaudio Fontana void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
119a3ef070eSClaudio Fontana {
120a3ef070eSClaudio Fontana     int i, oprsz = simd_oprsz(desc) / 8;
121a3ef070eSClaudio Fontana     uint8_t *pg = vg;
122a3ef070eSClaudio Fontana     uint64_t *n = vn;
123a3ef070eSClaudio Fontana     uint64_t *a = za;
124a3ef070eSClaudio Fontana 
125a3ef070eSClaudio Fontana     for (i = 0; i < oprsz; i++) {
126a3ef070eSClaudio Fontana         if (pg[H1(i)] & 1) {
127a3ef070eSClaudio Fontana             a[tile_vslice_index(i)] = n[i];
128a3ef070eSClaudio Fontana         }
129a3ef070eSClaudio Fontana     }
130a3ef070eSClaudio Fontana }
131a3ef070eSClaudio Fontana 
HELPER(sme_mova_cz_q)132a3ef070eSClaudio Fontana void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
133a3ef070eSClaudio Fontana {
134a3ef070eSClaudio Fontana     int i, oprsz = simd_oprsz(desc) / 16;
135a3ef070eSClaudio Fontana     uint16_t *pg = vg;
136a3ef070eSClaudio Fontana     Int128 *n = vn;
137a3ef070eSClaudio Fontana     Int128 *a = za;
138a3ef070eSClaudio Fontana 
139a3ef070eSClaudio Fontana     /*
140a3ef070eSClaudio Fontana      * Int128 is used here simply to copy 16 bytes, and to simplify
141a3ef070eSClaudio Fontana      * the address arithmetic.
142a3ef070eSClaudio Fontana      */
143a3ef070eSClaudio Fontana     for (i = 0; i < oprsz; i++) {
144a3ef070eSClaudio Fontana         if (pg[H2(i)] & 1) {
145a3ef070eSClaudio Fontana             a[tile_vslice_index(i)] = n[i];
146a3ef070eSClaudio Fontana         }
147a3ef070eSClaudio Fontana     }
148a3ef070eSClaudio Fontana }
149a3ef070eSClaudio Fontana 
150a3ef070eSClaudio Fontana #undef DO_MOVA_C
151a3ef070eSClaudio Fontana 
152a3ef070eSClaudio Fontana /*
153a3ef070eSClaudio Fontana  * Move ZArray column to Zreg vector.
154a3ef070eSClaudio Fontana  */
155a3ef070eSClaudio Fontana #define DO_MOVA_Z(NAME, TYPE, H)                                        \
156a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
157a3ef070eSClaudio Fontana {                                                                       \
158a3ef070eSClaudio Fontana     int i, oprsz = simd_oprsz(desc);                                    \
159a3ef070eSClaudio Fontana     for (i = 0; i < oprsz; ) {                                          \
160a3ef070eSClaudio Fontana         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
161a3ef070eSClaudio Fontana         do {                                                            \
162a3ef070eSClaudio Fontana             if (pg & 1) {                                               \
163a3ef070eSClaudio Fontana                 *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
164a3ef070eSClaudio Fontana             }                                                           \
165a3ef070eSClaudio Fontana             i += sizeof(TYPE);                                          \
166a3ef070eSClaudio Fontana             pg >>= sizeof(TYPE);                                        \
167a3ef070eSClaudio Fontana         } while (i & 15);                                               \
168a3ef070eSClaudio Fontana     }                                                                   \
169a3ef070eSClaudio Fontana }
170a3ef070eSClaudio Fontana 
DO_MOVA_Z(sme_mova_zc_b,uint8_t,H1)171a3ef070eSClaudio Fontana DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
172a3ef070eSClaudio Fontana DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
173a3ef070eSClaudio Fontana DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
174a3ef070eSClaudio Fontana 
175a3ef070eSClaudio Fontana void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
176a3ef070eSClaudio Fontana {
177a3ef070eSClaudio Fontana     int i, oprsz = simd_oprsz(desc) / 8;
178a3ef070eSClaudio Fontana     uint8_t *pg = vg;
179a3ef070eSClaudio Fontana     uint64_t *d = vd;
180a3ef070eSClaudio Fontana     uint64_t *a = za;
181a3ef070eSClaudio Fontana 
182a3ef070eSClaudio Fontana     for (i = 0; i < oprsz; i++) {
183a3ef070eSClaudio Fontana         if (pg[H1(i)] & 1) {
184a3ef070eSClaudio Fontana             d[i] = a[tile_vslice_index(i)];
185a3ef070eSClaudio Fontana         }
186a3ef070eSClaudio Fontana     }
187a3ef070eSClaudio Fontana }
188a3ef070eSClaudio Fontana 
HELPER(sme_mova_zc_q)189a3ef070eSClaudio Fontana void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
190a3ef070eSClaudio Fontana {
191a3ef070eSClaudio Fontana     int i, oprsz = simd_oprsz(desc) / 16;
192a3ef070eSClaudio Fontana     uint16_t *pg = vg;
193a3ef070eSClaudio Fontana     Int128 *d = vd;
194a3ef070eSClaudio Fontana     Int128 *a = za;
195a3ef070eSClaudio Fontana 
196a3ef070eSClaudio Fontana     /*
197a3ef070eSClaudio Fontana      * Int128 is used here simply to copy 16 bytes, and to simplify
198a3ef070eSClaudio Fontana      * the address arithmetic.
199a3ef070eSClaudio Fontana      */
200a3ef070eSClaudio Fontana     for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
201a3ef070eSClaudio Fontana         if (pg[H2(i)] & 1) {
202a3ef070eSClaudio Fontana             d[i] = a[tile_vslice_index(i)];
203a3ef070eSClaudio Fontana         }
204a3ef070eSClaudio Fontana     }
205a3ef070eSClaudio Fontana }
206a3ef070eSClaudio Fontana 
207a3ef070eSClaudio Fontana #undef DO_MOVA_Z
208a3ef070eSClaudio Fontana 
/*
 * Primitives that zero len bytes' worth of elements in a tile slice.
 *
 * ClearFn is the common signature: ptr is the base of the slice, off is
 * the byte offset of the first element to clear and len is the number of
 * bytes to clear.
 */

typedef void ClearFn(void *ptr, size_t off, size_t len);

/* Horizontal slice: the bytes are contiguous, so a plain memset does it. */
static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    uint8_t *base = ptr;

    memset(base + off, 0, len);
}
219a3ef070eSClaudio Fontana 
/*
 * Vertical slice, 1-byte elements: zero the len elements whose slice
 * byte offsets are off .. off + len - 1, each located through
 * tile_vslice_offset().
 */
static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    size_t i = 0;

    while (i < len) {
        uint8_t *cell = vptr + tile_vslice_offset(off + i);

        *cell = 0;
        i++;
    }
}
226a3ef070eSClaudio Fontana 
/*
 * Vertical slice, 2-byte elements: step through len bytes of the slice
 * two at a time, starting at byte offset off, zeroing one uint16_t per
 * step at the location given by tile_vslice_offset().
 */
static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    size_t i = 0;

    while (i < len) {
        uint16_t *cell = vptr + tile_vslice_offset(off + i);

        *cell = 0;
        i += 2;
    }
}
233a3ef070eSClaudio Fontana 
/*
 * Vertical slice, 4-byte elements: step through len bytes of the slice
 * four at a time, starting at byte offset off, zeroing one uint32_t per
 * step at the location given by tile_vslice_offset().
 */
static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    size_t i = 0;

    while (i < len) {
        uint32_t *cell = vptr + tile_vslice_offset(off + i);

        *cell = 0;
        i += 4;
    }
}
240a3ef070eSClaudio Fontana 
/*
 * Vertical slice, 8-byte elements: step through len bytes of the slice
 * eight at a time, starting at byte offset off, zeroing one uint64_t per
 * step at the location given by tile_vslice_offset().
 */
static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    size_t i = 0;

    while (i < len) {
        uint64_t *cell = vptr + tile_vslice_offset(off + i);

        *cell = 0;
        i += 8;
    }
}
247a3ef070eSClaudio Fontana 
/*
 * Vertical slice, 16-byte elements: step through len bytes of the slice
 * sixteen at a time, starting at byte offset off, and memset each
 * 16-byte element located through tile_vslice_offset().
 */
static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    size_t i = 0;

    while (i < len) {
        void *cell = vptr + tile_vslice_offset(off + i);

        memset(cell, 0, 16);
        i += 16;
    }
}
254a3ef070eSClaudio Fontana 
/*
 * Primitives that copy len bytes' worth of elements from a flat source
 * array into a tile slice.
 *
 * CopyFn is the common signature: dst is the base of the slice, src is
 * the contiguous source data and len is the number of bytes to copy.
 */

typedef void CopyFn(void *dst, const void *src, size_t len);

/* Horizontal slice: destination bytes are contiguous, so memcpy suffices. */
static void copy_horizontal(void *dst, const void *src, size_t len)
{
    memcpy(dst, src, len);
}
265a3ef070eSClaudio Fontana 
/*
 * Vertical slice, 1-byte elements: source byte e goes to index
 * tile_vslice_index(e) of the destination viewed as a uint8_t array.
 */
static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    uint8_t *tile = vdst;
    const uint8_t *in = vsrc;

    for (size_t e = 0; e != len; e++) {
        tile[tile_vslice_index(e)] = in[e];
    }
}
276a3ef070eSClaudio Fontana 
/*
 * Vertical slice, 2-byte elements: copies len / 2 elements; source
 * element e goes to index tile_vslice_index(e) of the destination
 * viewed as a uint16_t array.
 */
static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    uint16_t *tile = vdst;
    const uint16_t *in = vsrc;
    const size_t count = len / 2;

    for (size_t e = 0; e != count; e++) {
        tile[tile_vslice_index(e)] = in[e];
    }
}
287a3ef070eSClaudio Fontana 
/*
 * Vertical slice, 4-byte elements: copies len / 4 elements; source
 * element e goes to index tile_vslice_index(e) of the destination
 * viewed as a uint32_t array.
 */
static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    uint32_t *tile = vdst;
    const uint32_t *in = vsrc;
    const size_t count = len / 4;

    for (size_t e = 0; e != count; e++) {
        tile[tile_vslice_index(e)] = in[e];
    }
}
298a3ef070eSClaudio Fontana 
/*
 * Vertical slice, 8-byte elements: copies len / 8 elements; source
 * element e goes to index tile_vslice_index(e) of the destination
 * viewed as a uint64_t array.
 */
static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    uint64_t *tile = vdst;
    const uint64_t *in = vsrc;
    const size_t count = len / 8;

    for (size_t e = 0; e != count; e++) {
        tile[tile_vslice_index(e)] = in[e];
    }
}
309a3ef070eSClaudio Fontana 
/*
 * Vertical slice, 16-byte elements: walks the source 16 bytes at a time
 * and memcpy's each element to the byte offset in the destination given
 * by tile_vslice_offset().
 */
static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    size_t byteoff = 0;

    while (byteoff < len) {
        memcpy(vdst + tile_vslice_offset(byteoff), vsrc + byteoff, 16);
        byteoff += 16;
    }
}
316a3ef070eSClaudio Fontana 
/*
 * Host and TLB load primitives that address a vertical tile slice.
 *
 * DO_LD(NAME, TYPE, HOST, TLB) defines:
 *   sme_NAME_v_host - read one TYPE via HOST(host);
 *   sme_NAME_v_tlb  - read one TYPE via TLB(env, addr, ra), with addr
 *                     first passed through useronly_clean_ptr().
 * Both store the value at za + tile_vslice_offset(off).  The memory read
 * is performed before ZA is written.
 */

#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE *slot = za + tile_vslice_offset(off);                              \
    TYPE loaded = HOST(host);                                               \
    *slot = loaded;                                                         \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE *slot = za + tile_vslice_offset(off);                              \
    TYPE loaded = TLB(env, useronly_clean_ptr(addr), ra);                   \
    *slot = loaded;                                                         \
}
333a3ef070eSClaudio Fontana 
/*
 * Host and TLB store primitives that address a vertical tile slice.
 *
 * DO_ST(NAME, TYPE, HOST, TLB) defines:
 *   sme_NAME_v_host - write one TYPE via HOST(host, val);
 *   sme_NAME_v_tlb  - write one TYPE via TLB(env, addr, val, ra), with
 *                     addr first passed through useronly_clean_ptr().
 * Both fetch the value from za + tile_vslice_offset(off).
 */
#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    const TYPE *slot = za + tile_vslice_offset(off);                        \
    TYPE stored = *slot;                                                    \
    HOST(host, stored);                                                     \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    const TYPE *slot = za + tile_vslice_offset(off);                        \
    TYPE stored = *slot;                                                    \
    TLB(env, useronly_clean_ptr(addr), stored, ra);                         \
}
346a3ef070eSClaudio Fontana 
/*
 * 128-bit load primitives.
 *
 * ARMVectorReg contents are kept as host-endian 64-bit units.  For a
 * 128-bit element, the order defined by the Elem[] pseudocode means the
 * two 64-bit halves are stored in little-endian order, so for a
 * big-endian access (BE == 1) the two halves read from memory are
 * swapped before being written.
 *
 * DO_LDQ(HNAME, VNAME, BE, HOST, TLB) defines HNAME_host / HNAME_tlb,
 * which write to za + off, and VNAME_v_host / VNAME_v_tlb, which call
 * them with off converted by tile_vslice_offset().  Both halves are read
 * from memory before either is written to ZA.
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t first = HOST(host);                                            \
    uint64_t second = HOST(host + 8);                                       \
    uint64_t *q = za + off;                                                 \
    q[BE] = first;                                                          \
    q[!BE] = second;                                                        \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t first = TLB(env, useronly_clean_ptr(addr), ra);                \
    uint64_t second = TLB(env, useronly_clean_ptr(addr + 8), ra);           \
    uint64_t *q = za + off;                                                 \
    q[BE] = first;                                                          \
    q[!BE] = second;                                                        \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
376a3ef070eSClaudio Fontana 
/*
 * 128-bit store primitives, the mirror image of the 128-bit loads.
 *
 * The 64-bit half at index BE of the element is written to the lower
 * address and the half at index !BE to address + 8.
 *
 * DO_STQ(HNAME, VNAME, BE, HOST, TLB) defines HNAME_host / HNAME_tlb,
 * which read from za + off, and VNAME_v_host / VNAME_v_tlb, which call
 * them with off converted by tile_vslice_offset().
 */
#define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    const uint64_t *q = za + off;                                           \
    HOST(host, q[BE]);                                                      \
    HOST(host + 8, q[!BE]);                                                 \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    const uint64_t *q = za + off;                                           \
    TLB(env, useronly_clean_ptr(addr), q[BE], ra);                          \
    TLB(env, useronly_clean_ptr(addr + 8), q[!BE], ra);                     \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
400a3ef070eSClaudio Fontana 
/* Vertical-slice loads: sme_ld1{b,h,s,d}[_be|_le]_v_host / _v_tlb. */
DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)

/*
 * 128-bit loads: sve_ld1qq_{be,le}_host / _tlb, plus the vertical-slice
 * wrappers sme_ld1q_{be,le}_v_host / _v_tlb.
 */
DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)

/* Vertical-slice stores: sme_st1{b,h,s,d}[_be|_le]_v_host / _v_tlb. */
DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)

/*
 * 128-bit stores: sve_st1qq_{be,le}_host / _tlb, plus the vertical-slice
 * wrappers sme_st1q_{be,le}_v_host / _v_tlb.
 */
DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)

/* The generator macros are not needed past this point. */
#undef DO_LD
#undef DO_ST
#undef DO_LDQ
#undef DO_STQ
427a3ef070eSClaudio Fontana 
428a3ef070eSClaudio Fontana /*
429a3ef070eSClaudio Fontana  * Common helper for all contiguous predicated loads.
430a3ef070eSClaudio Fontana  */
431a3ef070eSClaudio Fontana 
432a3ef070eSClaudio Fontana static inline QEMU_ALWAYS_INLINE
433a3ef070eSClaudio Fontana void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
434a3ef070eSClaudio Fontana              const target_ulong addr, uint32_t desc, const uintptr_t ra,
435a3ef070eSClaudio Fontana              const int esz, uint32_t mtedesc, bool vertical,
436a3ef070eSClaudio Fontana              sve_ldst1_host_fn *host_fn,
437a3ef070eSClaudio Fontana              sve_ldst1_tlb_fn *tlb_fn,
438a3ef070eSClaudio Fontana              ClearFn *clr_fn,
439a3ef070eSClaudio Fontana              CopyFn *cpy_fn)
440a3ef070eSClaudio Fontana {
441a3ef070eSClaudio Fontana     const intptr_t reg_max = simd_oprsz(desc);
442a3ef070eSClaudio Fontana     const intptr_t esize = 1 << esz;
443a3ef070eSClaudio Fontana     intptr_t reg_off, reg_last;
444a3ef070eSClaudio Fontana     SVEContLdSt info;
445a3ef070eSClaudio Fontana     void *host;
446a3ef070eSClaudio Fontana     int flags;
447a3ef070eSClaudio Fontana 
448a3ef070eSClaudio Fontana     /* Find the active elements.  */
449a3ef070eSClaudio Fontana     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
450a3ef070eSClaudio Fontana         /* The entire predicate was false; no load occurs.  */
451a3ef070eSClaudio Fontana         clr_fn(za, 0, reg_max);
452a3ef070eSClaudio Fontana         return;
453a3ef070eSClaudio Fontana     }
454a3ef070eSClaudio Fontana 
455a3ef070eSClaudio Fontana     /* Probe the page(s).  Exit with exception for any invalid page. */
456a3ef070eSClaudio Fontana     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
457a3ef070eSClaudio Fontana 
458a3ef070eSClaudio Fontana     /* Handle watchpoints for all active elements. */
459a3ef070eSClaudio Fontana     sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
460a3ef070eSClaudio Fontana                               BP_MEM_READ, ra);
461a3ef070eSClaudio Fontana 
462a3ef070eSClaudio Fontana     /*
463a3ef070eSClaudio Fontana      * Handle mte checks for all active elements.
464a3ef070eSClaudio Fontana      * Since TBI must be set for MTE, !mtedesc => !mte_active.
465a3ef070eSClaudio Fontana      */
466a3ef070eSClaudio Fontana     if (mtedesc) {
467a3ef070eSClaudio Fontana         sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
468a3ef070eSClaudio Fontana                                 mtedesc, ra);
469a3ef070eSClaudio Fontana     }
470a3ef070eSClaudio Fontana 
471a3ef070eSClaudio Fontana     flags = info.page[0].flags | info.page[1].flags;
472a3ef070eSClaudio Fontana     if (unlikely(flags != 0)) {
473a3ef070eSClaudio Fontana #ifdef CONFIG_USER_ONLY
474a3ef070eSClaudio Fontana         g_assert_not_reached();
475a3ef070eSClaudio Fontana #else
476a3ef070eSClaudio Fontana         /*
477a3ef070eSClaudio Fontana          * At least one page includes MMIO.
478a3ef070eSClaudio Fontana          * Any bus operation can fail with cpu_transaction_failed,
479a3ef070eSClaudio Fontana          * which for ARM will raise SyncExternal.  Perform the load
480a3ef070eSClaudio Fontana          * into scratch memory to preserve register state until the end.
481a3ef070eSClaudio Fontana          */
482a3ef070eSClaudio Fontana         ARMVectorReg scratch = { };
483a3ef070eSClaudio Fontana 
484a3ef070eSClaudio Fontana         reg_off = info.reg_off_first[0];
485a3ef070eSClaudio Fontana         reg_last = info.reg_off_last[1];
486a3ef070eSClaudio Fontana         if (reg_last < 0) {
487a3ef070eSClaudio Fontana             reg_last = info.reg_off_split;
488a3ef070eSClaudio Fontana             if (reg_last < 0) {
489a3ef070eSClaudio Fontana                 reg_last = info.reg_off_last[0];
490a3ef070eSClaudio Fontana             }
491a3ef070eSClaudio Fontana         }
492a3ef070eSClaudio Fontana 
493a3ef070eSClaudio Fontana         do {
494a3ef070eSClaudio Fontana             uint64_t pg = vg[reg_off >> 6];
495a3ef070eSClaudio Fontana             do {
496a3ef070eSClaudio Fontana                 if ((pg >> (reg_off & 63)) & 1) {
497a3ef070eSClaudio Fontana                     tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
498a3ef070eSClaudio Fontana                 }
499a3ef070eSClaudio Fontana                 reg_off += esize;
500a3ef070eSClaudio Fontana             } while (reg_off & 63);
501a3ef070eSClaudio Fontana         } while (reg_off <= reg_last);
502a3ef070eSClaudio Fontana 
503a3ef070eSClaudio Fontana         cpy_fn(za, &scratch, reg_max);
504a3ef070eSClaudio Fontana         return;
505a3ef070eSClaudio Fontana #endif
506a3ef070eSClaudio Fontana     }
507a3ef070eSClaudio Fontana 
508a3ef070eSClaudio Fontana     /* The entire operation is in RAM, on valid pages. */
509a3ef070eSClaudio Fontana 
510a3ef070eSClaudio Fontana     reg_off = info.reg_off_first[0];
511a3ef070eSClaudio Fontana     reg_last = info.reg_off_last[0];
512a3ef070eSClaudio Fontana     host = info.page[0].host;
513a3ef070eSClaudio Fontana 
514a3ef070eSClaudio Fontana     if (!vertical) {
515a3ef070eSClaudio Fontana         memset(za, 0, reg_max);
516a3ef070eSClaudio Fontana     } else if (reg_off) {
517a3ef070eSClaudio Fontana         clr_fn(za, 0, reg_off);
518a3ef070eSClaudio Fontana     }
519a3ef070eSClaudio Fontana 
5203b9991e3SRichard Henderson     set_helper_retaddr(ra);
5213b9991e3SRichard Henderson 
522a3ef070eSClaudio Fontana     while (reg_off <= reg_last) {
523a3ef070eSClaudio Fontana         uint64_t pg = vg[reg_off >> 6];
524a3ef070eSClaudio Fontana         do {
525a3ef070eSClaudio Fontana             if ((pg >> (reg_off & 63)) & 1) {
526a3ef070eSClaudio Fontana                 host_fn(za, reg_off, host + reg_off);
527a3ef070eSClaudio Fontana             } else if (vertical) {
528a3ef070eSClaudio Fontana                 clr_fn(za, reg_off, esize);
529a3ef070eSClaudio Fontana             }
530a3ef070eSClaudio Fontana             reg_off += esize;
531a3ef070eSClaudio Fontana         } while (reg_off <= reg_last && (reg_off & 63));
532a3ef070eSClaudio Fontana     }
533a3ef070eSClaudio Fontana 
5343b9991e3SRichard Henderson     clear_helper_retaddr();
5353b9991e3SRichard Henderson 
536a3ef070eSClaudio Fontana     /*
537a3ef070eSClaudio Fontana      * Use the slow path to manage the cross-page misalignment.
538a3ef070eSClaudio Fontana      * But we know this is RAM and cannot trap.
539a3ef070eSClaudio Fontana      */
540a3ef070eSClaudio Fontana     reg_off = info.reg_off_split;
541a3ef070eSClaudio Fontana     if (unlikely(reg_off >= 0)) {
542a3ef070eSClaudio Fontana         tlb_fn(env, za, reg_off, addr + reg_off, ra);
543a3ef070eSClaudio Fontana     }
544a3ef070eSClaudio Fontana 
545a3ef070eSClaudio Fontana     reg_off = info.reg_off_first[1];
546a3ef070eSClaudio Fontana     if (unlikely(reg_off >= 0)) {
547a3ef070eSClaudio Fontana         reg_last = info.reg_off_last[1];
548a3ef070eSClaudio Fontana         host = info.page[1].host;
549a3ef070eSClaudio Fontana 
5503b9991e3SRichard Henderson         set_helper_retaddr(ra);
5513b9991e3SRichard Henderson 
552a3ef070eSClaudio Fontana         do {
553a3ef070eSClaudio Fontana             uint64_t pg = vg[reg_off >> 6];
554a3ef070eSClaudio Fontana             do {
555a3ef070eSClaudio Fontana                 if ((pg >> (reg_off & 63)) & 1) {
556a3ef070eSClaudio Fontana                     host_fn(za, reg_off, host + reg_off);
557a3ef070eSClaudio Fontana                 } else if (vertical) {
558a3ef070eSClaudio Fontana                     clr_fn(za, reg_off, esize);
559a3ef070eSClaudio Fontana                 }
560a3ef070eSClaudio Fontana                 reg_off += esize;
561a3ef070eSClaudio Fontana             } while (reg_off & 63);
562a3ef070eSClaudio Fontana         } while (reg_off <= reg_last);
5633b9991e3SRichard Henderson 
5643b9991e3SRichard Henderson         clear_helper_retaddr();
565a3ef070eSClaudio Fontana     }
566a3ef070eSClaudio Fontana }
567a3ef070eSClaudio Fontana 
568a3ef070eSClaudio Fontana static inline QEMU_ALWAYS_INLINE
sme_ld1_mte(CPUARMState * env,void * za,uint64_t * vg,target_ulong addr,uint32_t desc,uintptr_t ra,const int esz,bool vertical,sve_ldst1_host_fn * host_fn,sve_ldst1_tlb_fn * tlb_fn,ClearFn * clr_fn,CopyFn * cpy_fn)569a3ef070eSClaudio Fontana void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
570a3ef070eSClaudio Fontana                  target_ulong addr, uint32_t desc, uintptr_t ra,
571a3ef070eSClaudio Fontana                  const int esz, bool vertical,
572a3ef070eSClaudio Fontana                  sve_ldst1_host_fn *host_fn,
573a3ef070eSClaudio Fontana                  sve_ldst1_tlb_fn *tlb_fn,
574a3ef070eSClaudio Fontana                  ClearFn *clr_fn,
575a3ef070eSClaudio Fontana                  CopyFn *cpy_fn)
576a3ef070eSClaudio Fontana {
577a3ef070eSClaudio Fontana     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
578a3ef070eSClaudio Fontana     int bit55 = extract64(addr, 55, 1);
579a3ef070eSClaudio Fontana 
580a3ef070eSClaudio Fontana     /* Remove mtedesc from the normal sve descriptor. */
581a3ef070eSClaudio Fontana     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
582a3ef070eSClaudio Fontana 
583a3ef070eSClaudio Fontana     /* Perform gross MTE suppression early. */
584855f94ecSRichard Henderson     if (!tbi_check(mtedesc, bit55) ||
585855f94ecSRichard Henderson         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
586a3ef070eSClaudio Fontana         mtedesc = 0;
587a3ef070eSClaudio Fontana     }
588a3ef070eSClaudio Fontana 
589a3ef070eSClaudio Fontana     sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
590a3ef070eSClaudio Fontana             host_fn, tlb_fn, clr_fn, cpy_fn);
591a3ef070eSClaudio Fontana }
592a3ef070eSClaudio Fontana 
593a3ef070eSClaudio Fontana #define DO_LD(L, END, ESZ)                                                 \
594a3ef070eSClaudio Fontana void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
595a3ef070eSClaudio Fontana                                  target_ulong addr, uint32_t desc)         \
596a3ef070eSClaudio Fontana {                                                                          \
597a3ef070eSClaudio Fontana     sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
598a3ef070eSClaudio Fontana             sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,           \
599a3ef070eSClaudio Fontana             clear_horizontal, copy_horizontal);                            \
600a3ef070eSClaudio Fontana }                                                                          \
601a3ef070eSClaudio Fontana void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
602a3ef070eSClaudio Fontana                                  target_ulong addr, uint32_t desc)         \
603a3ef070eSClaudio Fontana {                                                                          \
604a3ef070eSClaudio Fontana     sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
605a3ef070eSClaudio Fontana             sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,             \
606a3ef070eSClaudio Fontana             clear_vertical_##L, copy_vertical_##L);                        \
607a3ef070eSClaudio Fontana }                                                                          \
608a3ef070eSClaudio Fontana void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
609a3ef070eSClaudio Fontana                                      target_ulong addr, uint32_t desc)     \
610a3ef070eSClaudio Fontana {                                                                          \
611a3ef070eSClaudio Fontana     sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
612a3ef070eSClaudio Fontana                 sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,       \
613a3ef070eSClaudio Fontana                 clear_horizontal, copy_horizontal);                        \
614a3ef070eSClaudio Fontana }                                                                          \
615a3ef070eSClaudio Fontana void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
616a3ef070eSClaudio Fontana                                      target_ulong addr, uint32_t desc)     \
617a3ef070eSClaudio Fontana {                                                                          \
618a3ef070eSClaudio Fontana     sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
619a3ef070eSClaudio Fontana                 sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,         \
620a3ef070eSClaudio Fontana                 clear_vertical_##L, copy_vertical_##L);                    \
621a3ef070eSClaudio Fontana }
622a3ef070eSClaudio Fontana 
623a3ef070eSClaudio Fontana DO_LD(b, , MO_8)
DO_LD(h,_be,MO_16)624a3ef070eSClaudio Fontana DO_LD(h, _be, MO_16)
625a3ef070eSClaudio Fontana DO_LD(h, _le, MO_16)
626a3ef070eSClaudio Fontana DO_LD(s, _be, MO_32)
627a3ef070eSClaudio Fontana DO_LD(s, _le, MO_32)
628a3ef070eSClaudio Fontana DO_LD(d, _be, MO_64)
629a3ef070eSClaudio Fontana DO_LD(d, _le, MO_64)
630a3ef070eSClaudio Fontana DO_LD(q, _be, MO_128)
631a3ef070eSClaudio Fontana DO_LD(q, _le, MO_128)
632a3ef070eSClaudio Fontana 
633a3ef070eSClaudio Fontana #undef DO_LD
634a3ef070eSClaudio Fontana 
635a3ef070eSClaudio Fontana /*
636a3ef070eSClaudio Fontana  * Common helper for all contiguous predicated stores.
637a3ef070eSClaudio Fontana  */
638a3ef070eSClaudio Fontana 
639a3ef070eSClaudio Fontana static inline QEMU_ALWAYS_INLINE
640a3ef070eSClaudio Fontana void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
641a3ef070eSClaudio Fontana              const target_ulong addr, uint32_t desc, const uintptr_t ra,
642a3ef070eSClaudio Fontana              const int esz, uint32_t mtedesc, bool vertical,
643a3ef070eSClaudio Fontana              sve_ldst1_host_fn *host_fn,
644a3ef070eSClaudio Fontana              sve_ldst1_tlb_fn *tlb_fn)
645a3ef070eSClaudio Fontana {
646a3ef070eSClaudio Fontana     const intptr_t reg_max = simd_oprsz(desc);
647a3ef070eSClaudio Fontana     const intptr_t esize = 1 << esz;
648a3ef070eSClaudio Fontana     intptr_t reg_off, reg_last;
649a3ef070eSClaudio Fontana     SVEContLdSt info;
650a3ef070eSClaudio Fontana     void *host;
651a3ef070eSClaudio Fontana     int flags;
652a3ef070eSClaudio Fontana 
653a3ef070eSClaudio Fontana     /* Find the active elements.  */
654a3ef070eSClaudio Fontana     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
655a3ef070eSClaudio Fontana         /* The entire predicate was false; no store occurs.  */
656a3ef070eSClaudio Fontana         return;
657a3ef070eSClaudio Fontana     }
658a3ef070eSClaudio Fontana 
659a3ef070eSClaudio Fontana     /* Probe the page(s).  Exit with exception for any invalid page. */
660a3ef070eSClaudio Fontana     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
661a3ef070eSClaudio Fontana 
662a3ef070eSClaudio Fontana     /* Handle watchpoints for all active elements. */
663a3ef070eSClaudio Fontana     sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
664a3ef070eSClaudio Fontana                               BP_MEM_WRITE, ra);
665a3ef070eSClaudio Fontana 
666a3ef070eSClaudio Fontana     /*
667a3ef070eSClaudio Fontana      * Handle mte checks for all active elements.
668a3ef070eSClaudio Fontana      * Since TBI must be set for MTE, !mtedesc => !mte_active.
669a3ef070eSClaudio Fontana      */
670a3ef070eSClaudio Fontana     if (mtedesc) {
671a3ef070eSClaudio Fontana         sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
672a3ef070eSClaudio Fontana                                 mtedesc, ra);
673a3ef070eSClaudio Fontana     }
674a3ef070eSClaudio Fontana 
675a3ef070eSClaudio Fontana     flags = info.page[0].flags | info.page[1].flags;
676a3ef070eSClaudio Fontana     if (unlikely(flags != 0)) {
677a3ef070eSClaudio Fontana #ifdef CONFIG_USER_ONLY
678a3ef070eSClaudio Fontana         g_assert_not_reached();
679a3ef070eSClaudio Fontana #else
680a3ef070eSClaudio Fontana         /*
681a3ef070eSClaudio Fontana          * At least one page includes MMIO.
682a3ef070eSClaudio Fontana          * Any bus operation can fail with cpu_transaction_failed,
683a3ef070eSClaudio Fontana          * which for ARM will raise SyncExternal.  We cannot avoid
684a3ef070eSClaudio Fontana          * this fault and will leave with the store incomplete.
685a3ef070eSClaudio Fontana          */
686a3ef070eSClaudio Fontana         reg_off = info.reg_off_first[0];
687a3ef070eSClaudio Fontana         reg_last = info.reg_off_last[1];
688a3ef070eSClaudio Fontana         if (reg_last < 0) {
689a3ef070eSClaudio Fontana             reg_last = info.reg_off_split;
690a3ef070eSClaudio Fontana             if (reg_last < 0) {
691a3ef070eSClaudio Fontana                 reg_last = info.reg_off_last[0];
692a3ef070eSClaudio Fontana             }
693a3ef070eSClaudio Fontana         }
694a3ef070eSClaudio Fontana 
695a3ef070eSClaudio Fontana         do {
696a3ef070eSClaudio Fontana             uint64_t pg = vg[reg_off >> 6];
697a3ef070eSClaudio Fontana             do {
698a3ef070eSClaudio Fontana                 if ((pg >> (reg_off & 63)) & 1) {
699a3ef070eSClaudio Fontana                     tlb_fn(env, za, reg_off, addr + reg_off, ra);
700a3ef070eSClaudio Fontana                 }
701a3ef070eSClaudio Fontana                 reg_off += esize;
702a3ef070eSClaudio Fontana             } while (reg_off & 63);
703a3ef070eSClaudio Fontana         } while (reg_off <= reg_last);
704a3ef070eSClaudio Fontana         return;
705a3ef070eSClaudio Fontana #endif
706a3ef070eSClaudio Fontana     }
707a3ef070eSClaudio Fontana 
708a3ef070eSClaudio Fontana     reg_off = info.reg_off_first[0];
709a3ef070eSClaudio Fontana     reg_last = info.reg_off_last[0];
710a3ef070eSClaudio Fontana     host = info.page[0].host;
711a3ef070eSClaudio Fontana 
7123b9991e3SRichard Henderson     set_helper_retaddr(ra);
7133b9991e3SRichard Henderson 
714a3ef070eSClaudio Fontana     while (reg_off <= reg_last) {
715a3ef070eSClaudio Fontana         uint64_t pg = vg[reg_off >> 6];
716a3ef070eSClaudio Fontana         do {
717a3ef070eSClaudio Fontana             if ((pg >> (reg_off & 63)) & 1) {
718a3ef070eSClaudio Fontana                 host_fn(za, reg_off, host + reg_off);
719a3ef070eSClaudio Fontana             }
720a3ef070eSClaudio Fontana             reg_off += 1 << esz;
721a3ef070eSClaudio Fontana         } while (reg_off <= reg_last && (reg_off & 63));
722a3ef070eSClaudio Fontana     }
723a3ef070eSClaudio Fontana 
7243b9991e3SRichard Henderson     clear_helper_retaddr();
7253b9991e3SRichard Henderson 
726a3ef070eSClaudio Fontana     /*
727a3ef070eSClaudio Fontana      * Use the slow path to manage the cross-page misalignment.
728a3ef070eSClaudio Fontana      * But we know this is RAM and cannot trap.
729a3ef070eSClaudio Fontana      */
730a3ef070eSClaudio Fontana     reg_off = info.reg_off_split;
731a3ef070eSClaudio Fontana     if (unlikely(reg_off >= 0)) {
732a3ef070eSClaudio Fontana         tlb_fn(env, za, reg_off, addr + reg_off, ra);
733a3ef070eSClaudio Fontana     }
734a3ef070eSClaudio Fontana 
735a3ef070eSClaudio Fontana     reg_off = info.reg_off_first[1];
736a3ef070eSClaudio Fontana     if (unlikely(reg_off >= 0)) {
737a3ef070eSClaudio Fontana         reg_last = info.reg_off_last[1];
738a3ef070eSClaudio Fontana         host = info.page[1].host;
739a3ef070eSClaudio Fontana 
7403b9991e3SRichard Henderson         set_helper_retaddr(ra);
7413b9991e3SRichard Henderson 
742a3ef070eSClaudio Fontana         do {
743a3ef070eSClaudio Fontana             uint64_t pg = vg[reg_off >> 6];
744a3ef070eSClaudio Fontana             do {
745a3ef070eSClaudio Fontana                 if ((pg >> (reg_off & 63)) & 1) {
746a3ef070eSClaudio Fontana                     host_fn(za, reg_off, host + reg_off);
747a3ef070eSClaudio Fontana                 }
748a3ef070eSClaudio Fontana                 reg_off += 1 << esz;
749a3ef070eSClaudio Fontana             } while (reg_off & 63);
750a3ef070eSClaudio Fontana         } while (reg_off <= reg_last);
7513b9991e3SRichard Henderson 
7523b9991e3SRichard Henderson         clear_helper_retaddr();
753a3ef070eSClaudio Fontana     }
754a3ef070eSClaudio Fontana }
755a3ef070eSClaudio Fontana 
756a3ef070eSClaudio Fontana static inline QEMU_ALWAYS_INLINE
sme_st1_mte(CPUARMState * env,void * za,uint64_t * vg,target_ulong addr,uint32_t desc,uintptr_t ra,int esz,bool vertical,sve_ldst1_host_fn * host_fn,sve_ldst1_tlb_fn * tlb_fn)757a3ef070eSClaudio Fontana void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
758a3ef070eSClaudio Fontana                  uint32_t desc, uintptr_t ra, int esz, bool vertical,
759a3ef070eSClaudio Fontana                  sve_ldst1_host_fn *host_fn,
760a3ef070eSClaudio Fontana                  sve_ldst1_tlb_fn *tlb_fn)
761a3ef070eSClaudio Fontana {
762a3ef070eSClaudio Fontana     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
763a3ef070eSClaudio Fontana     int bit55 = extract64(addr, 55, 1);
764a3ef070eSClaudio Fontana 
765a3ef070eSClaudio Fontana     /* Remove mtedesc from the normal sve descriptor. */
766a3ef070eSClaudio Fontana     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
767a3ef070eSClaudio Fontana 
768a3ef070eSClaudio Fontana     /* Perform gross MTE suppression early. */
769855f94ecSRichard Henderson     if (!tbi_check(mtedesc, bit55) ||
770855f94ecSRichard Henderson         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
771a3ef070eSClaudio Fontana         mtedesc = 0;
772a3ef070eSClaudio Fontana     }
773a3ef070eSClaudio Fontana 
774a3ef070eSClaudio Fontana     sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
775a3ef070eSClaudio Fontana             vertical, host_fn, tlb_fn);
776a3ef070eSClaudio Fontana }
777a3ef070eSClaudio Fontana 
778a3ef070eSClaudio Fontana #define DO_ST(L, END, ESZ)                                                 \
779a3ef070eSClaudio Fontana void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
780a3ef070eSClaudio Fontana                                  target_ulong addr, uint32_t desc)         \
781a3ef070eSClaudio Fontana {                                                                          \
782a3ef070eSClaudio Fontana     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
783a3ef070eSClaudio Fontana             sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);          \
784a3ef070eSClaudio Fontana }                                                                          \
785a3ef070eSClaudio Fontana void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
786a3ef070eSClaudio Fontana                                  target_ulong addr, uint32_t desc)         \
787a3ef070eSClaudio Fontana {                                                                          \
788a3ef070eSClaudio Fontana     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
789a3ef070eSClaudio Fontana             sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);            \
790a3ef070eSClaudio Fontana }                                                                          \
791a3ef070eSClaudio Fontana void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
792a3ef070eSClaudio Fontana                                      target_ulong addr, uint32_t desc)     \
793a3ef070eSClaudio Fontana {                                                                          \
794a3ef070eSClaudio Fontana     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
795a3ef070eSClaudio Fontana                 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);      \
796a3ef070eSClaudio Fontana }                                                                          \
797a3ef070eSClaudio Fontana void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
798a3ef070eSClaudio Fontana                                      target_ulong addr, uint32_t desc)     \
799a3ef070eSClaudio Fontana {                                                                          \
800a3ef070eSClaudio Fontana     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
801a3ef070eSClaudio Fontana                 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);        \
802a3ef070eSClaudio Fontana }
803a3ef070eSClaudio Fontana 
804a3ef070eSClaudio Fontana DO_ST(b, , MO_8)
DO_ST(h,_be,MO_16)805a3ef070eSClaudio Fontana DO_ST(h, _be, MO_16)
806a3ef070eSClaudio Fontana DO_ST(h, _le, MO_16)
807a3ef070eSClaudio Fontana DO_ST(s, _be, MO_32)
808a3ef070eSClaudio Fontana DO_ST(s, _le, MO_32)
809a3ef070eSClaudio Fontana DO_ST(d, _be, MO_64)
810a3ef070eSClaudio Fontana DO_ST(d, _le, MO_64)
811a3ef070eSClaudio Fontana DO_ST(q, _be, MO_128)
812a3ef070eSClaudio Fontana DO_ST(q, _le, MO_128)
813a3ef070eSClaudio Fontana 
814a3ef070eSClaudio Fontana #undef DO_ST
815a3ef070eSClaudio Fontana 
816a3ef070eSClaudio Fontana void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
817a3ef070eSClaudio Fontana                          void *vpm, uint32_t desc)
818a3ef070eSClaudio Fontana {
819a3ef070eSClaudio Fontana     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
820a3ef070eSClaudio Fontana     uint64_t *pn = vpn, *pm = vpm;
821a3ef070eSClaudio Fontana     uint32_t *zda = vzda, *zn = vzn;
822a3ef070eSClaudio Fontana 
823a3ef070eSClaudio Fontana     for (row = 0; row < oprsz; ) {
824a3ef070eSClaudio Fontana         uint64_t pa = pn[row >> 4];
825a3ef070eSClaudio Fontana         do {
826a3ef070eSClaudio Fontana             if (pa & 1) {
827a3ef070eSClaudio Fontana                 for (col = 0; col < oprsz; ) {
828a3ef070eSClaudio Fontana                     uint64_t pb = pm[col >> 4];
829a3ef070eSClaudio Fontana                     do {
830a3ef070eSClaudio Fontana                         if (pb & 1) {
831a3ef070eSClaudio Fontana                             zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
832a3ef070eSClaudio Fontana                         }
833a3ef070eSClaudio Fontana                         pb >>= 4;
834a3ef070eSClaudio Fontana                     } while (++col & 15);
835a3ef070eSClaudio Fontana                 }
836a3ef070eSClaudio Fontana             }
837a3ef070eSClaudio Fontana             pa >>= 4;
838a3ef070eSClaudio Fontana         } while (++row & 15);
839a3ef070eSClaudio Fontana     }
840a3ef070eSClaudio Fontana }
841a3ef070eSClaudio Fontana 
HELPER(sme_addha_d)842a3ef070eSClaudio Fontana void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
843a3ef070eSClaudio Fontana                          void *vpm, uint32_t desc)
844a3ef070eSClaudio Fontana {
845a3ef070eSClaudio Fontana     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
846a3ef070eSClaudio Fontana     uint8_t *pn = vpn, *pm = vpm;
847a3ef070eSClaudio Fontana     uint64_t *zda = vzda, *zn = vzn;
848a3ef070eSClaudio Fontana 
849a3ef070eSClaudio Fontana     for (row = 0; row < oprsz; ++row) {
850a3ef070eSClaudio Fontana         if (pn[H1(row)] & 1) {
851a3ef070eSClaudio Fontana             for (col = 0; col < oprsz; ++col) {
852a3ef070eSClaudio Fontana                 if (pm[H1(col)] & 1) {
853a3ef070eSClaudio Fontana                     zda[tile_vslice_index(row) + col] += zn[col];
854a3ef070eSClaudio Fontana                 }
855a3ef070eSClaudio Fontana             }
856a3ef070eSClaudio Fontana         }
857a3ef070eSClaudio Fontana     }
858a3ef070eSClaudio Fontana }
859a3ef070eSClaudio Fontana 
HELPER(sme_addva_s)860a3ef070eSClaudio Fontana void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
861a3ef070eSClaudio Fontana                          void *vpm, uint32_t desc)
862a3ef070eSClaudio Fontana {
863a3ef070eSClaudio Fontana     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
864a3ef070eSClaudio Fontana     uint64_t *pn = vpn, *pm = vpm;
865a3ef070eSClaudio Fontana     uint32_t *zda = vzda, *zn = vzn;
866a3ef070eSClaudio Fontana 
867a3ef070eSClaudio Fontana     for (row = 0; row < oprsz; ) {
868a3ef070eSClaudio Fontana         uint64_t pa = pn[row >> 4];
869a3ef070eSClaudio Fontana         do {
870a3ef070eSClaudio Fontana             if (pa & 1) {
871a3ef070eSClaudio Fontana                 uint32_t zn_row = zn[H4(row)];
872a3ef070eSClaudio Fontana                 for (col = 0; col < oprsz; ) {
873a3ef070eSClaudio Fontana                     uint64_t pb = pm[col >> 4];
874a3ef070eSClaudio Fontana                     do {
875a3ef070eSClaudio Fontana                         if (pb & 1) {
876a3ef070eSClaudio Fontana                             zda[tile_vslice_index(row) + H4(col)] += zn_row;
877a3ef070eSClaudio Fontana                         }
878a3ef070eSClaudio Fontana                         pb >>= 4;
879a3ef070eSClaudio Fontana                     } while (++col & 15);
880a3ef070eSClaudio Fontana                 }
881a3ef070eSClaudio Fontana             }
882a3ef070eSClaudio Fontana             pa >>= 4;
883a3ef070eSClaudio Fontana         } while (++row & 15);
884a3ef070eSClaudio Fontana     }
885a3ef070eSClaudio Fontana }
886a3ef070eSClaudio Fontana 
HELPER(sme_addva_d)887a3ef070eSClaudio Fontana void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
888a3ef070eSClaudio Fontana                          void *vpm, uint32_t desc)
889a3ef070eSClaudio Fontana {
890a3ef070eSClaudio Fontana     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
891a3ef070eSClaudio Fontana     uint8_t *pn = vpn, *pm = vpm;
892a3ef070eSClaudio Fontana     uint64_t *zda = vzda, *zn = vzn;
893a3ef070eSClaudio Fontana 
894a3ef070eSClaudio Fontana     for (row = 0; row < oprsz; ++row) {
895a3ef070eSClaudio Fontana         if (pn[H1(row)] & 1) {
896a3ef070eSClaudio Fontana             uint64_t zn_row = zn[row];
897a3ef070eSClaudio Fontana             for (col = 0; col < oprsz; ++col) {
898a3ef070eSClaudio Fontana                 if (pm[H1(col)] & 1) {
899a3ef070eSClaudio Fontana                     zda[tile_vslice_index(row) + col] += zn_row;
900a3ef070eSClaudio Fontana                 }
901a3ef070eSClaudio Fontana             }
902a3ef070eSClaudio Fontana         }
903a3ef070eSClaudio Fontana     }
904a3ef070eSClaudio Fontana }
905a3ef070eSClaudio Fontana 
HELPER(sme_fmopa_s)906a3ef070eSClaudio Fontana void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
907a3ef070eSClaudio Fontana                          void *vpm, void *vst, uint32_t desc)
908a3ef070eSClaudio Fontana {
909a3ef070eSClaudio Fontana     intptr_t row, col, oprsz = simd_maxsz(desc);
910a3ef070eSClaudio Fontana     uint32_t neg = simd_data(desc) << 31;
911a3ef070eSClaudio Fontana     uint16_t *pn = vpn, *pm = vpm;
912a3ef070eSClaudio Fontana     float_status fpst;
913a3ef070eSClaudio Fontana 
914a3ef070eSClaudio Fontana     /*
915a3ef070eSClaudio Fontana      * Make a copy of float_status because this operation does not
916a3ef070eSClaudio Fontana      * update the cumulative fp exception status.  It also produces
917a3ef070eSClaudio Fontana      * default nans.
918a3ef070eSClaudio Fontana      */
919a3ef070eSClaudio Fontana     fpst = *(float_status *)vst;
920a3ef070eSClaudio Fontana     set_default_nan_mode(true, &fpst);
921a3ef070eSClaudio Fontana 
922a3ef070eSClaudio Fontana     for (row = 0; row < oprsz; ) {
923a3ef070eSClaudio Fontana         uint16_t pa = pn[H2(row >> 4)];
924a3ef070eSClaudio Fontana         do {
925a3ef070eSClaudio Fontana             if (pa & 1) {
926a3ef070eSClaudio Fontana                 void *vza_row = vza + tile_vslice_offset(row);
927a3ef070eSClaudio Fontana                 uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
928a3ef070eSClaudio Fontana 
929a3ef070eSClaudio Fontana                 for (col = 0; col < oprsz; ) {
930a3ef070eSClaudio Fontana                     uint16_t pb = pm[H2(col >> 4)];
931a3ef070eSClaudio Fontana                     do {
932a3ef070eSClaudio Fontana                         if (pb & 1) {
933a3ef070eSClaudio Fontana                             uint32_t *a = vza_row + H1_4(col);
934a3ef070eSClaudio Fontana                             uint32_t *m = vzm + H1_4(col);
93531d93fedSDaniyal Khan                             *a = float32_muladd(n, *m, *a, 0, &fpst);
936a3ef070eSClaudio Fontana                         }
937a3ef070eSClaudio Fontana                         col += 4;
938a3ef070eSClaudio Fontana                         pb >>= 4;
939a3ef070eSClaudio Fontana                     } while (col & 15);
940a3ef070eSClaudio Fontana                 }
941a3ef070eSClaudio Fontana             }
942a3ef070eSClaudio Fontana             row += 4;
943a3ef070eSClaudio Fontana             pa >>= 4;
944a3ef070eSClaudio Fontana         } while (row & 15);
945a3ef070eSClaudio Fontana     }
946a3ef070eSClaudio Fontana }
947a3ef070eSClaudio Fontana 
HELPER(sme_fmopa_d)948a3ef070eSClaudio Fontana void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
949a3ef070eSClaudio Fontana                          void *vpm, void *vst, uint32_t desc)
950a3ef070eSClaudio Fontana {
951a3ef070eSClaudio Fontana     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
952a3ef070eSClaudio Fontana     uint64_t neg = (uint64_t)simd_data(desc) << 63;
953a3ef070eSClaudio Fontana     uint64_t *za = vza, *zn = vzn, *zm = vzm;
954a3ef070eSClaudio Fontana     uint8_t *pn = vpn, *pm = vpm;
955a3ef070eSClaudio Fontana     float_status fpst = *(float_status *)vst;
956a3ef070eSClaudio Fontana 
957a3ef070eSClaudio Fontana     set_default_nan_mode(true, &fpst);
958a3ef070eSClaudio Fontana 
959a3ef070eSClaudio Fontana     for (row = 0; row < oprsz; ++row) {
960a3ef070eSClaudio Fontana         if (pn[H1(row)] & 1) {
961a3ef070eSClaudio Fontana             uint64_t *za_row = &za[tile_vslice_index(row)];
962a3ef070eSClaudio Fontana             uint64_t n = zn[row] ^ neg;
963a3ef070eSClaudio Fontana 
964a3ef070eSClaudio Fontana             for (col = 0; col < oprsz; ++col) {
965a3ef070eSClaudio Fontana                 if (pm[H1(col)] & 1) {
966a3ef070eSClaudio Fontana                     uint64_t *a = &za_row[col];
967a3ef070eSClaudio Fontana                     *a = float64_muladd(n, zm[col], *a, 0, &fpst);
968a3ef070eSClaudio Fontana                 }
969a3ef070eSClaudio Fontana             }
970a3ef070eSClaudio Fontana         }
971a3ef070eSClaudio Fontana     }
972a3ef070eSClaudio Fontana }
973a3ef070eSClaudio Fontana 
/*
 * Prepare a packed pair of 16-bit values for the f16 outer product.
 *
 * @pair is XORed with @neg, then each half is kept only when its
 * controlling predicate bit is set: bit 0 of @pg governs the low
 * 16 bits and bit 2 governs the high 16 bits.  A half whose bit is
 * clear comes back as zero.
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    uint32_t keep = 0;

    if (pg & 1) {
        keep |= 0x0000ffffu;
    }
    if (pg & 4) {
        keep |= 0xffff0000u;
    }

    /*
     * The pseudocode negates conditionally after zeroing; negating
     * unconditionally first and masking afterwards gives the same
     * result and is simpler.
     */
    return (pair ^ neg) & keep;
}
993a3ef070eSClaudio Fontana 
f16_dotadd(float32 sum,uint32_t e1,uint32_t e2,float_status * s_f16,float_status * s_std,float_status * s_odd)994a3ef070eSClaudio Fontana static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
99555f9f4eeSPeter Maydell                           float_status *s_f16, float_status *s_std,
99655f9f4eeSPeter Maydell                           float_status *s_odd)
997a3ef070eSClaudio Fontana {
99855f9f4eeSPeter Maydell     /*
99955f9f4eeSPeter Maydell      * We need three different float_status for different parts of this
100055f9f4eeSPeter Maydell      * operation:
100155f9f4eeSPeter Maydell      *  - the input conversion of the float16 values must use the
100255f9f4eeSPeter Maydell      *    f16-specific float_status, so that the FPCR.FZ16 control is applied
100355f9f4eeSPeter Maydell      *  - operations on float32 including the final accumulation must use
100455f9f4eeSPeter Maydell      *    the normal float_status, so that FPCR.FZ is applied
100555f9f4eeSPeter Maydell      *  - we have pre-set-up copy of s_std which is set to round-to-odd,
100655f9f4eeSPeter Maydell      *    for the multiply (see below)
100755f9f4eeSPeter Maydell      */
100855f9f4eeSPeter Maydell     float64 e1r = float16_to_float64(e1 & 0xffff, true, s_f16);
100955f9f4eeSPeter Maydell     float64 e1c = float16_to_float64(e1 >> 16, true, s_f16);
101055f9f4eeSPeter Maydell     float64 e2r = float16_to_float64(e2 & 0xffff, true, s_f16);
101155f9f4eeSPeter Maydell     float64 e2c = float16_to_float64(e2 >> 16, true, s_f16);
1012a3ef070eSClaudio Fontana     float64 t64;
1013a3ef070eSClaudio Fontana     float32 t32;
1014a3ef070eSClaudio Fontana 
1015a3ef070eSClaudio Fontana     /*
1016a3ef070eSClaudio Fontana      * The ARM pseudocode function FPDot performs both multiplies
1017a3ef070eSClaudio Fontana      * and the add with a single rounding operation.  Emulate this
1018a3ef070eSClaudio Fontana      * by performing the first multiply in round-to-odd, then doing
1019a3ef070eSClaudio Fontana      * the second multiply as fused multiply-add, and rounding to
1020a3ef070eSClaudio Fontana      * float32 all in one step.
1021a3ef070eSClaudio Fontana      */
1022a3ef070eSClaudio Fontana     t64 = float64_mul(e1r, e2r, s_odd);
1023a3ef070eSClaudio Fontana     t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
1024a3ef070eSClaudio Fontana 
1025a3ef070eSClaudio Fontana     /* This conversion is exact, because we've already rounded. */
1026a3ef070eSClaudio Fontana     t32 = float64_to_float32(t64, s_std);
1027a3ef070eSClaudio Fontana 
1028a3ef070eSClaudio Fontana     /* The final accumulation step is not fused. */
1029a3ef070eSClaudio Fontana     return float32_add(sum, t32, s_std);
1030a3ef070eSClaudio Fontana }
1031a3ef070eSClaudio Fontana 
HELPER(sme_fmopa_h)1032a3ef070eSClaudio Fontana void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
103355f9f4eeSPeter Maydell                          void *vpm, CPUARMState *env, uint32_t desc)
1034a3ef070eSClaudio Fontana {
1035a3ef070eSClaudio Fontana     intptr_t row, col, oprsz = simd_maxsz(desc);
1036a3ef070eSClaudio Fontana     uint32_t neg = simd_data(desc) * 0x80008000u;
1037a3ef070eSClaudio Fontana     uint16_t *pn = vpn, *pm = vpm;
103855f9f4eeSPeter Maydell     float_status fpst_odd, fpst_std, fpst_f16;
1039a3ef070eSClaudio Fontana 
1040a3ef070eSClaudio Fontana     /*
104155f9f4eeSPeter Maydell      * Make copies of fp_status and fp_status_f16, because this operation
104255f9f4eeSPeter Maydell      * does not update the cumulative fp exception status.  It also
104355f9f4eeSPeter Maydell      * produces default NaNs. We also need a second copy of fp_status with
104455f9f4eeSPeter Maydell      * round-to-odd -- see above.
1045a3ef070eSClaudio Fontana      */
104655f9f4eeSPeter Maydell     fpst_f16 = env->vfp.fp_status_f16;
104755f9f4eeSPeter Maydell     fpst_std = env->vfp.fp_status;
1048a3ef070eSClaudio Fontana     set_default_nan_mode(true, &fpst_std);
104955f9f4eeSPeter Maydell     set_default_nan_mode(true, &fpst_f16);
1050a3ef070eSClaudio Fontana     fpst_odd = fpst_std;
1051a3ef070eSClaudio Fontana     set_float_rounding_mode(float_round_to_odd, &fpst_odd);
1052a3ef070eSClaudio Fontana 
1053a3ef070eSClaudio Fontana     for (row = 0; row < oprsz; ) {
1054a3ef070eSClaudio Fontana         uint16_t prow = pn[H2(row >> 4)];
1055a3ef070eSClaudio Fontana         do {
1056a3ef070eSClaudio Fontana             void *vza_row = vza + tile_vslice_offset(row);
1057a3ef070eSClaudio Fontana             uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1058a3ef070eSClaudio Fontana 
1059a3ef070eSClaudio Fontana             n = f16mop_adj_pair(n, prow, neg);
1060a3ef070eSClaudio Fontana 
1061a3ef070eSClaudio Fontana             for (col = 0; col < oprsz; ) {
1062a3ef070eSClaudio Fontana                 uint16_t pcol = pm[H2(col >> 4)];
1063a3ef070eSClaudio Fontana                 do {
1064a3ef070eSClaudio Fontana                     if (prow & pcol & 0b0101) {
1065a3ef070eSClaudio Fontana                         uint32_t *a = vza_row + H1_4(col);
1066a3ef070eSClaudio Fontana                         uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1067a3ef070eSClaudio Fontana 
1068a3ef070eSClaudio Fontana                         m = f16mop_adj_pair(m, pcol, 0);
106955f9f4eeSPeter Maydell                         *a = f16_dotadd(*a, n, m,
107055f9f4eeSPeter Maydell                                         &fpst_f16, &fpst_std, &fpst_odd);
10713efd8495SRichard Henderson                     }
1072a3ef070eSClaudio Fontana                     col += 4;
1073a3ef070eSClaudio Fontana                     pcol >>= 4;
1074a3ef070eSClaudio Fontana                 } while (col & 15);
1075a3ef070eSClaudio Fontana             }
1076a3ef070eSClaudio Fontana             row += 4;
1077a3ef070eSClaudio Fontana             prow >>= 4;
1078a3ef070eSClaudio Fontana         } while (row & 15);
1079a3ef070eSClaudio Fontana     }
1080a3ef070eSClaudio Fontana }
1081a3ef070eSClaudio Fontana 
HELPER(sme_bfmopa)1082ecabcfa4SPeter Maydell void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm,
1083ecabcfa4SPeter Maydell                         void *vpn, void *vpm, CPUARMState *env, uint32_t desc)
1084a3ef070eSClaudio Fontana {
1085a3ef070eSClaudio Fontana     intptr_t row, col, oprsz = simd_maxsz(desc);
1086a3ef070eSClaudio Fontana     uint32_t neg = simd_data(desc) * 0x80008000u;
1087a3ef070eSClaudio Fontana     uint16_t *pn = vpn, *pm = vpm;
1088*09b0d9e0SPeter Maydell     float_status fpst, fpst_odd;
1089a3ef070eSClaudio Fontana 
1090*09b0d9e0SPeter Maydell     if (is_ebf(env, &fpst, &fpst_odd)) {
1091a3ef070eSClaudio Fontana         for (row = 0; row < oprsz; ) {
1092a3ef070eSClaudio Fontana             uint16_t prow = pn[H2(row >> 4)];
1093a3ef070eSClaudio Fontana             do {
1094a3ef070eSClaudio Fontana                 void *vza_row = vza + tile_vslice_offset(row);
1095a3ef070eSClaudio Fontana                 uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1096a3ef070eSClaudio Fontana 
1097a3ef070eSClaudio Fontana                 n = f16mop_adj_pair(n, prow, neg);
1098a3ef070eSClaudio Fontana 
1099a3ef070eSClaudio Fontana                 for (col = 0; col < oprsz; ) {
1100a3ef070eSClaudio Fontana                     uint16_t pcol = pm[H2(col >> 4)];
1101a3ef070eSClaudio Fontana                     do {
1102a3ef070eSClaudio Fontana                         if (prow & pcol & 0b0101) {
1103a3ef070eSClaudio Fontana                             uint32_t *a = vza_row + H1_4(col);
1104a3ef070eSClaudio Fontana                             uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1105a3ef070eSClaudio Fontana 
1106a3ef070eSClaudio Fontana                             m = f16mop_adj_pair(m, pcol, 0);
1107*09b0d9e0SPeter Maydell                             *a = bfdotadd_ebf(*a, n, m, &fpst, &fpst_odd);
11083efd8495SRichard Henderson                         }
1109a3ef070eSClaudio Fontana                         col += 4;
1110a3ef070eSClaudio Fontana                         pcol >>= 4;
1111a3ef070eSClaudio Fontana                     } while (col & 15);
1112a3ef070eSClaudio Fontana                 }
1113a3ef070eSClaudio Fontana                 row += 4;
1114a3ef070eSClaudio Fontana                 prow >>= 4;
1115a3ef070eSClaudio Fontana             } while (row & 15);
1116a3ef070eSClaudio Fontana         }
1117*09b0d9e0SPeter Maydell     } else {
1118*09b0d9e0SPeter Maydell         for (row = 0; row < oprsz; ) {
1119*09b0d9e0SPeter Maydell             uint16_t prow = pn[H2(row >> 4)];
1120*09b0d9e0SPeter Maydell             do {
1121*09b0d9e0SPeter Maydell                 void *vza_row = vza + tile_vslice_offset(row);
1122*09b0d9e0SPeter Maydell                 uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1123*09b0d9e0SPeter Maydell 
1124*09b0d9e0SPeter Maydell                 n = f16mop_adj_pair(n, prow, neg);
1125*09b0d9e0SPeter Maydell 
1126*09b0d9e0SPeter Maydell                 for (col = 0; col < oprsz; ) {
1127*09b0d9e0SPeter Maydell                     uint16_t pcol = pm[H2(col >> 4)];
1128*09b0d9e0SPeter Maydell                     do {
1129*09b0d9e0SPeter Maydell                         if (prow & pcol & 0b0101) {
1130*09b0d9e0SPeter Maydell                             uint32_t *a = vza_row + H1_4(col);
1131*09b0d9e0SPeter Maydell                             uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1132*09b0d9e0SPeter Maydell 
1133*09b0d9e0SPeter Maydell                             m = f16mop_adj_pair(m, pcol, 0);
1134*09b0d9e0SPeter Maydell                             *a = bfdotadd(*a, n, m, &fpst);
1135*09b0d9e0SPeter Maydell                         }
1136*09b0d9e0SPeter Maydell                         col += 4;
1137*09b0d9e0SPeter Maydell                         pcol >>= 4;
1138*09b0d9e0SPeter Maydell                     } while (col & 15);
1139*09b0d9e0SPeter Maydell                 }
1140*09b0d9e0SPeter Maydell                 row += 4;
1141*09b0d9e0SPeter Maydell                 prow >>= 4;
1142*09b0d9e0SPeter Maydell             } while (row & 15);
1143*09b0d9e0SPeter Maydell         }
1144*09b0d9e0SPeter Maydell     }
1145a3ef070eSClaudio Fontana }
1146a3ef070eSClaudio Fontana 
/* Per-element kernel for the 32-bit integer outer products. */
typedef uint32_t IMOPFn32(uint32_t n, uint32_t m, uint32_t a,
                          uint8_t p, bool neg);

/*
 * Integer outer product into 32-bit ZA elements.
 *
 * Each predicate byte holds two 4-bit nibbles, one per 32-bit element:
 * even elements use the low nibble, odd elements the high nibble.  FN is
 * invoked for every (row, col) position and receives the AND of the row
 * and column nibbles, so any predication is applied by FN itself.
 */
static inline void do_imopa_s(uint32_t *za, uint32_t *zn, uint32_t *zm,
                              uint8_t *pn, uint8_t *pm,
                              uint32_t desc, IMOPFn32 *fn)
{
    const intptr_t nelem = simd_oprsz(desc) / 4;
    const bool neg = simd_data(desc);
    intptr_t r, c;

    for (r = 0; r < nelem; r++) {
        unsigned rshift = (r & 1) * 4;
        uint8_t pa = (pn[H1(r >> 1)] >> rshift) & 0xf;
        uint32_t *za_row = &za[tile_vslice_index(r)];
        uint32_t n = zn[H4(r)];

        for (c = 0; c < nelem; c++) {
            unsigned cshift = (c & 1) * 4;
            /* No mask needed: pa already limits the AND to 4 bits. */
            uint8_t pb = pm[H1(c >> 1)] >> cshift;
            uint32_t *acc = &za_row[H4(c)];

            *acc = fn(n, zm[H4(c)], *acc, pa & pb, neg);
        }
    }
}
1168d572bcb2SRichard Henderson 
/* Per-element kernel for the 64-bit integer outer products. */
typedef uint64_t IMOPFn64(uint64_t n, uint64_t m, uint64_t a,
                          uint8_t p, bool neg);

/*
 * Integer outer product into 64-bit ZA elements.
 *
 * One predicate byte corresponds to one 64-bit element.  FN is invoked
 * for every (row, col) position and receives the AND of the row and
 * column predicate bytes, so any predication is applied by FN itself.
 */
static inline void do_imopa_d(uint64_t *za, uint64_t *zn, uint64_t *zm,
                              uint8_t *pn, uint8_t *pm,
                              uint32_t desc, IMOPFn64 *fn)
{
    const intptr_t nelem = simd_oprsz(desc) / 8;
    const bool neg = simd_data(desc);
    intptr_t r, c;

    for (r = 0; r < nelem; r++) {
        uint8_t pa = pn[H1(r)];
        uint64_t *za_row = &za[tile_vslice_index(r)];
        uint64_t n = zn[r];

        for (c = 0; c < nelem; c++) {
            uint8_t pb = pm[H1(c)];
            uint64_t *acc = &za_row[c];

            *acc = fn(n, zm[c], *acc, pa & pb, neg);
        }
    }
}
1190a3ef070eSClaudio Fontana 
/*
 * Define NAME: a 4-way dot product of the 8-bit lanes of N and M, read
 * as NTYPE and MTYPE respectively, added to (or, when NEG, subtracted
 * from) the 32-bit accumulator A.  P is the predicate for the lanes.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t sum = 0;                                                       \
    unsigned shift;                                                         \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_b(p);                                                  \
    for (shift = 0; shift < 32; shift += 8) {                               \
        sum += (NTYPE)(n >> shift) * (MTYPE)(m >> shift);                   \
    }                                                                       \
    if (neg) {                                                              \
        return a - sum;                                                     \
    }                                                                       \
    return a + sum;                                                         \
}

DEF_IMOP_32(smopa_s, int8_t, int8_t)
DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
DEF_IMOP_32(usmopa_s, uint8_t, int8_t)

/*
 * Define NAME: a 4-way dot product of the 16-bit lanes of N and M, read
 * as NTYPE and MTYPE respectively, added to (or, when NEG, subtracted
 * from) the 64-bit accumulator A.  P is the predicate for the lanes.
 */
#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint64_t sum = 0;                                                       \
    unsigned shift;                                                         \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_h(p);                                                  \
    for (shift = 0; shift < 64; shift += 16) {                              \
        /* Widen to int64_t first so the multiply is done in 64 bits. */    \
        sum += (int64_t)(NTYPE)(n >> shift) * (MTYPE)(m >> shift);          \
    }                                                                       \
    if (neg) {                                                              \
        return a - sum;                                                     \
    }                                                                       \
    return a + sum;                                                         \
}

DEF_IMOP_64(smopa_d, int16_t, int16_t)
DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
1226a3ef070eSClaudio Fontana 
1227d572bcb2SRichard Henderson #define DEF_IMOPH(NAME, S) \
1228d572bcb2SRichard Henderson     void HELPER(sme_##NAME##_##S)(void *vza, void *vzn, void *vzm,          \
1229d572bcb2SRichard Henderson                                   void *vpn, void *vpm, uint32_t desc)      \
1230d572bcb2SRichard Henderson     { do_imopa_##S(vza, vzn, vzm, vpn, vpm, desc, NAME##_##S); }
1231a3ef070eSClaudio Fontana 
1232d572bcb2SRichard Henderson DEF_IMOPH(smopa, s)
1233d572bcb2SRichard Henderson DEF_IMOPH(umopa, s)
1234d572bcb2SRichard Henderson DEF_IMOPH(sumopa, s)
1235d572bcb2SRichard Henderson DEF_IMOPH(usmopa, s)
1236d572bcb2SRichard Henderson 
1237d572bcb2SRichard Henderson DEF_IMOPH(smopa, d)
1238d572bcb2SRichard Henderson DEF_IMOPH(umopa, d)
1239d572bcb2SRichard Henderson DEF_IMOPH(sumopa, d)
1240d572bcb2SRichard Henderson DEF_IMOPH(usmopa, d)
1241