1a3ef070eSClaudio Fontana /*
2a3ef070eSClaudio Fontana * ARM SME Operations
3a3ef070eSClaudio Fontana *
4a3ef070eSClaudio Fontana * Copyright (c) 2022 Linaro, Ltd.
5a3ef070eSClaudio Fontana *
6a3ef070eSClaudio Fontana * This library is free software; you can redistribute it and/or
7a3ef070eSClaudio Fontana * modify it under the terms of the GNU Lesser General Public
8a3ef070eSClaudio Fontana * License as published by the Free Software Foundation; either
9a3ef070eSClaudio Fontana * version 2.1 of the License, or (at your option) any later version.
10a3ef070eSClaudio Fontana *
11a3ef070eSClaudio Fontana * This library is distributed in the hope that it will be useful,
12a3ef070eSClaudio Fontana * but WITHOUT ANY WARRANTY; without even the implied warranty of
13a3ef070eSClaudio Fontana * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14a3ef070eSClaudio Fontana * Lesser General Public License for more details.
15a3ef070eSClaudio Fontana *
16a3ef070eSClaudio Fontana * You should have received a copy of the GNU Lesser General Public
17a3ef070eSClaudio Fontana * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18a3ef070eSClaudio Fontana */
19a3ef070eSClaudio Fontana
20a3ef070eSClaudio Fontana #include "qemu/osdep.h"
21a3ef070eSClaudio Fontana #include "cpu.h"
22a3ef070eSClaudio Fontana #include "internals.h"
23a3ef070eSClaudio Fontana #include "tcg/tcg-gvec-desc.h"
24a3ef070eSClaudio Fontana #include "exec/helper-proto.h"
25a3ef070eSClaudio Fontana #include "exec/cpu_ldst.h"
26a3ef070eSClaudio Fontana #include "exec/exec-all.h"
27a3ef070eSClaudio Fontana #include "qemu/int128.h"
28a3ef070eSClaudio Fontana #include "fpu/softfloat.h"
29a3ef070eSClaudio Fontana #include "vec_internal.h"
30a3ef070eSClaudio Fontana #include "sve_ldst_internal.h"
31a3ef070eSClaudio Fontana
/*
 * Thin wrapper: passes @env, @val and @mask straight through to
 * aarch64_set_svcr() and does nothing else.
 */
void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask)
{
    aarch64_set_svcr(env, val, mask);
}
36a3ef070eSClaudio Fontana
/*
 * Zero the rows of env->zarray selected by @imm.
 *
 * @imm: 8-bit selection mask; row i is cleared when bit (i % 8) is set.
 * @svl: number of rows visited and the number of bytes cleared per row.
 *
 * An @imm of 0xff clears the whole of env->zarray in one memset,
 * irrespective of @svl.
 */
void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
{
    uint32_t i;

    /*
     * Special case clearing the entire ZA space.
     * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
     * parts of the ZA storage outside of SVL.
     */
    if (imm == 0xff) {
        memset(env->zarray, 0, sizeof(env->zarray));
        return;
    }

    /*
     * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
     * so each row is discontiguous within ZA[].
     */
    for (i = 0; i < svl; i++) {
        if (imm & (1 << (i % 8))) {
            memset(&env->zarray[i], 0, svl);
        }
    }
}
61a3ef070eSClaudio Fontana
62a3ef070eSClaudio Fontana
/*
 * When considering the ZA storage as an array of elements of
 * type T, the index within that array of the Nth element of
 * a vertical slice of a tile can be calculated like this,
 * regardless of the size of type T. This is because the tiles
 * are interleaved, so if type T is size N bytes then row 1 of
 * the tile is N rows away from row 0. The division by N to
 * convert a byte offset into an array index and the multiplication
 * by N to convert from vslice-index-within-the-tile to
 * the index within the ZA storage cancel out.
 *
 * The macro simply scales its argument by sizeof(ARMVectorReg).
 */
#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
75a3ef070eSClaudio Fontana
/*
 * When doing byte arithmetic on the ZA storage, the element
 * byteoff bytes away in a tile vertical slice is always this
 * many bytes away in the ZA storage, regardless of the
 * size of the tile element, assuming that byteoff is a multiple
 * of the element size. Again this is because of the interleaving
 * of the tiles. For instance if we have 1 byte per element then
 * each row of the ZA storage has one byte of the vslice data,
 * and (counting from 0) byte 8 goes in row 8 of the storage
 * at offset (8 * row-size-in-bytes).
 * If we have 8 bytes per element then each row of the ZA storage
 * has 8 bytes of the data, but there are 8 interleaved tiles and
 * so byte 8 of the data goes into row 1 of the tile,
 * which is again row 8 of the storage, so the offset is still
 * (8 * row-size-in-bytes). Similarly for other element sizes.
 *
 * The macro simply scales its argument by sizeof(ARMVectorReg).
 */
#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
93a3ef070eSClaudio Fontana
94a3ef070eSClaudio Fontana
95a3ef070eSClaudio Fontana /*
96a3ef070eSClaudio Fontana * Move Zreg vector to ZArray column.
97a3ef070eSClaudio Fontana */
98a3ef070eSClaudio Fontana #define DO_MOVA_C(NAME, TYPE, H) \
99a3ef070eSClaudio Fontana void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc) \
100a3ef070eSClaudio Fontana { \
101a3ef070eSClaudio Fontana int i, oprsz = simd_oprsz(desc); \
102a3ef070eSClaudio Fontana for (i = 0; i < oprsz; ) { \
103a3ef070eSClaudio Fontana uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
104a3ef070eSClaudio Fontana do { \
105a3ef070eSClaudio Fontana if (pg & 1) { \
106a3ef070eSClaudio Fontana *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
107a3ef070eSClaudio Fontana } \
108a3ef070eSClaudio Fontana i += sizeof(TYPE); \
109a3ef070eSClaudio Fontana pg >>= sizeof(TYPE); \
110a3ef070eSClaudio Fontana } while (i & 15); \
111a3ef070eSClaudio Fontana } \
112a3ef070eSClaudio Fontana }
113a3ef070eSClaudio Fontana
DO_MOVA_C(sme_mova_cz_b,uint8_t,H1)114a3ef070eSClaudio Fontana DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
115a3ef070eSClaudio Fontana DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
116a3ef070eSClaudio Fontana DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
117a3ef070eSClaudio Fontana
118a3ef070eSClaudio Fontana void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
119a3ef070eSClaudio Fontana {
120a3ef070eSClaudio Fontana int i, oprsz = simd_oprsz(desc) / 8;
121a3ef070eSClaudio Fontana uint8_t *pg = vg;
122a3ef070eSClaudio Fontana uint64_t *n = vn;
123a3ef070eSClaudio Fontana uint64_t *a = za;
124a3ef070eSClaudio Fontana
125a3ef070eSClaudio Fontana for (i = 0; i < oprsz; i++) {
126a3ef070eSClaudio Fontana if (pg[H1(i)] & 1) {
127a3ef070eSClaudio Fontana a[tile_vslice_index(i)] = n[i];
128a3ef070eSClaudio Fontana }
129a3ef070eSClaudio Fontana }
130a3ef070eSClaudio Fontana }
131a3ef070eSClaudio Fontana
HELPER(sme_mova_cz_q)132a3ef070eSClaudio Fontana void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
133a3ef070eSClaudio Fontana {
134a3ef070eSClaudio Fontana int i, oprsz = simd_oprsz(desc) / 16;
135a3ef070eSClaudio Fontana uint16_t *pg = vg;
136a3ef070eSClaudio Fontana Int128 *n = vn;
137a3ef070eSClaudio Fontana Int128 *a = za;
138a3ef070eSClaudio Fontana
139a3ef070eSClaudio Fontana /*
140a3ef070eSClaudio Fontana * Int128 is used here simply to copy 16 bytes, and to simplify
141a3ef070eSClaudio Fontana * the address arithmetic.
142a3ef070eSClaudio Fontana */
143a3ef070eSClaudio Fontana for (i = 0; i < oprsz; i++) {
144a3ef070eSClaudio Fontana if (pg[H2(i)] & 1) {
145a3ef070eSClaudio Fontana a[tile_vslice_index(i)] = n[i];
146a3ef070eSClaudio Fontana }
147a3ef070eSClaudio Fontana }
148a3ef070eSClaudio Fontana }
149a3ef070eSClaudio Fontana
150a3ef070eSClaudio Fontana #undef DO_MOVA_C
151a3ef070eSClaudio Fontana
152a3ef070eSClaudio Fontana /*
153a3ef070eSClaudio Fontana * Move ZArray column to Zreg vector.
154a3ef070eSClaudio Fontana */
155a3ef070eSClaudio Fontana #define DO_MOVA_Z(NAME, TYPE, H) \
156a3ef070eSClaudio Fontana void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc) \
157a3ef070eSClaudio Fontana { \
158a3ef070eSClaudio Fontana int i, oprsz = simd_oprsz(desc); \
159a3ef070eSClaudio Fontana for (i = 0; i < oprsz; ) { \
160a3ef070eSClaudio Fontana uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
161a3ef070eSClaudio Fontana do { \
162a3ef070eSClaudio Fontana if (pg & 1) { \
163a3ef070eSClaudio Fontana *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
164a3ef070eSClaudio Fontana } \
165a3ef070eSClaudio Fontana i += sizeof(TYPE); \
166a3ef070eSClaudio Fontana pg >>= sizeof(TYPE); \
167a3ef070eSClaudio Fontana } while (i & 15); \
168a3ef070eSClaudio Fontana } \
169a3ef070eSClaudio Fontana }
170a3ef070eSClaudio Fontana
DO_MOVA_Z(sme_mova_zc_b,uint8_t,H1)171a3ef070eSClaudio Fontana DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
172a3ef070eSClaudio Fontana DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
173a3ef070eSClaudio Fontana DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
174a3ef070eSClaudio Fontana
175a3ef070eSClaudio Fontana void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
176a3ef070eSClaudio Fontana {
177a3ef070eSClaudio Fontana int i, oprsz = simd_oprsz(desc) / 8;
178a3ef070eSClaudio Fontana uint8_t *pg = vg;
179a3ef070eSClaudio Fontana uint64_t *d = vd;
180a3ef070eSClaudio Fontana uint64_t *a = za;
181a3ef070eSClaudio Fontana
182a3ef070eSClaudio Fontana for (i = 0; i < oprsz; i++) {
183a3ef070eSClaudio Fontana if (pg[H1(i)] & 1) {
184a3ef070eSClaudio Fontana d[i] = a[tile_vslice_index(i)];
185a3ef070eSClaudio Fontana }
186a3ef070eSClaudio Fontana }
187a3ef070eSClaudio Fontana }
188a3ef070eSClaudio Fontana
HELPER(sme_mova_zc_q)189a3ef070eSClaudio Fontana void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
190a3ef070eSClaudio Fontana {
191a3ef070eSClaudio Fontana int i, oprsz = simd_oprsz(desc) / 16;
192a3ef070eSClaudio Fontana uint16_t *pg = vg;
193a3ef070eSClaudio Fontana Int128 *d = vd;
194a3ef070eSClaudio Fontana Int128 *a = za;
195a3ef070eSClaudio Fontana
196a3ef070eSClaudio Fontana /*
197a3ef070eSClaudio Fontana * Int128 is used here simply to copy 16 bytes, and to simplify
198a3ef070eSClaudio Fontana * the address arithmetic.
199a3ef070eSClaudio Fontana */
200a3ef070eSClaudio Fontana for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
201a3ef070eSClaudio Fontana if (pg[H2(i)] & 1) {
202a3ef070eSClaudio Fontana d[i] = a[tile_vslice_index(i)];
203a3ef070eSClaudio Fontana }
204a3ef070eSClaudio Fontana }
205a3ef070eSClaudio Fontana }
206a3ef070eSClaudio Fontana
207a3ef070eSClaudio Fontana #undef DO_MOVA_Z
208a3ef070eSClaudio Fontana
/*
 * Clear elements in a tile slice comprising len bytes.
 *
 * ClearFn is the common signature: @ptr is the base of the slice,
 * @off a byte offset into it and @len the number of bytes to clear.
 */

typedef void ClearFn(void *ptr, size_t off, size_t len);

/* Horizontal slices are contiguous, so a single memset suffices. */
static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    memset(ptr + off, 0, len);
}
219a3ef070eSClaudio Fontana
/*
 * Clear @len bytes of a vertical slice, one byte element at a time,
 * starting @off bytes into the slice.  Each element lives at
 * tile_vslice_offset(off + i) from @vptr.
 */
static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; ++i) {
        *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}
226a3ef070eSClaudio Fontana
/*
 * Clear @len bytes of a vertical slice as 16-bit elements, starting
 * @off bytes into the slice.  The loop advances 2 bytes per element and
 * each element lives at tile_vslice_offset(off + i) from @vptr.
 */
static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 2) {
        *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}
233a3ef070eSClaudio Fontana
/*
 * Clear @len bytes of a vertical slice as 32-bit elements, starting
 * @off bytes into the slice.  The loop advances 4 bytes per element and
 * each element lives at tile_vslice_offset(off + i) from @vptr.
 */
static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 4) {
        *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}
240a3ef070eSClaudio Fontana
/*
 * Clear @len bytes of a vertical slice as 64-bit elements, starting
 * @off bytes into the slice.  The loop advances 8 bytes per element and
 * each element lives at tile_vslice_offset(off + i) from @vptr.
 */
static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 8) {
        *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}
247a3ef070eSClaudio Fontana
/*
 * Clear @len bytes of a vertical slice as 128-bit elements, starting
 * @off bytes into the slice.  Each 16-byte element is zeroed with
 * memset at tile_vslice_offset(off + i) from @vptr.
 */
static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memset(vptr + tile_vslice_offset(i + off), 0, 16);
    }
}
254a3ef070eSClaudio Fontana
/*
 * Copy elements from an array into a tile slice comprising len bytes.
 *
 * CopyFn is the common signature: @dst is the base of the slice, @src
 * the contiguous source array and @len the number of bytes to copy.
 */

typedef void CopyFn(void *dst, const void *src, size_t len);

/* Horizontal slices are contiguous, so a single memcpy suffices. */
static void copy_horizontal(void *dst, const void *src, size_t len)
{
    memcpy(dst, src, len);
}
265a3ef070eSClaudio Fontana
/*
 * Copy @len bytes from the contiguous array @vsrc into a vertical
 * slice based at @vdst, as byte elements: src[i] goes to
 * dst[tile_vslice_index(i)].
 */
static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    const uint8_t *src = vsrc;
    uint8_t *dst = vdst;
    size_t i;

    for (i = 0; i < len; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}
276a3ef070eSClaudio Fontana
/*
 * Copy @len bytes (len / 2 elements) from the contiguous array @vsrc
 * into a vertical slice based at @vdst, as 16-bit elements: src[i] goes
 * to dst[tile_vslice_index(i)].
 */
static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    const uint16_t *src = vsrc;
    uint16_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 2; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}
287a3ef070eSClaudio Fontana
/*
 * Copy @len bytes (len / 4 elements) from the contiguous array @vsrc
 * into a vertical slice based at @vdst, as 32-bit elements: src[i] goes
 * to dst[tile_vslice_index(i)].
 */
static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    const uint32_t *src = vsrc;
    uint32_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 4; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}
298a3ef070eSClaudio Fontana
/*
 * Copy @len bytes (len / 8 elements) from the contiguous array @vsrc
 * into a vertical slice based at @vdst, as 64-bit elements: src[i] goes
 * to dst[tile_vslice_index(i)].
 */
static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    const uint64_t *src = vsrc;
    uint64_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 8; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}
309a3ef070eSClaudio Fontana
/*
 * Copy @len bytes from the contiguous array @vsrc into a vertical slice
 * based at @vdst, 16 bytes at a time: source bytes [i, i + 16) go to
 * byte offset tile_vslice_offset(i) from @vdst.
 */
static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
    }
}
316a3ef070eSClaudio Fontana
/*
 * Host and TLB primitives for vertical tile slice addressing.
 */

/*
 * DO_LD generates two load primitives for element type TYPE:
 *   sme_NAME_v_host: read one element with HOST() from a host pointer;
 *   sme_NAME_v_tlb:  read one element with TLB() from guest address
 *                    useronly_clean_ptr(addr), using @ra as return
 *                    address.
 * Both store the value at za + tile_vslice_offset(off).
 */
#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = HOST(host);                                                  \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}
333a3ef070eSClaudio Fontana
/*
 * DO_ST generates two store primitives for element type TYPE:
 *   sme_NAME_v_host: write one element with HOST() to a host pointer;
 *   sme_NAME_v_tlb:  write one element with TLB() to guest address
 *                    useronly_clean_ptr(addr), using @ra as return
 *                    address.
 * Both fetch the value from za + tile_vslice_offset(off).
 */
#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    HOST(host, val);                                                        \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    TLB(env, useronly_clean_ptr(addr), val, ra);                            \
}
346a3ef070eSClaudio Fontana
/*
 * The ARMVectorReg elements are stored in host-endian 64-bit units.
 * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
 * corresponds to storing the two 64-bit pieces in little-endian order.
 *
 * DO_LDQ generates four 128-bit load primitives:
 *   HNAME_host / HNAME_tlb:     store at za + off (byte offset as given);
 *   VNAME_v_host / VNAME_v_tlb: same, with off passed through
 *                               tile_vslice_offset() first.
 * Two 64-bit halves are read from the source at +0 and +8; when BE is
 * non-zero they are swapped before being written to ptr[0] and ptr[1].
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0;                                              \
    ptr[1] = BE ? val0 : val1;                                              \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
    uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0;                                              \
    ptr[1] = BE ? val0 : val1;                                              \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
376a3ef070eSClaudio Fontana
/*
 * DO_STQ generates four 128-bit store primitives:
 *   HNAME_host / HNAME_tlb:     read from za + off (byte offset as given);
 *   VNAME_v_host / VNAME_v_tlb: same, with off passed through
 *                               tile_vslice_offset() first.
 * The 64-bit unit ptr[BE] is written to the destination at +0 and
 * ptr[!BE] at +8, so BE selects which half goes out first.
 */
#define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    HOST(host, ptr[BE]);                                                    \
    HOST(host + 8, ptr[!BE]);                                               \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
400a3ef070eSClaudio Fontana
/* Vertical-slice load primitives for 8/16/32/64-bit elements. */
DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)

/* 128-bit loads: defines both the sve_ld1qq_* and sme_ld1q_*_v_* forms. */
DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)

/* Vertical-slice store primitives for 8/16/32/64-bit elements. */
DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)

/* 128-bit stores: defines both the sve_st1qq_* and sme_st1q_*_v_* forms. */
DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)

#undef DO_LD
#undef DO_ST
#undef DO_LDQ
#undef DO_STQ
427a3ef070eSClaudio Fontana
428a3ef070eSClaudio Fontana /*
429a3ef070eSClaudio Fontana * Common helper for all contiguous predicated loads.
430a3ef070eSClaudio Fontana */
431a3ef070eSClaudio Fontana
432a3ef070eSClaudio Fontana static inline QEMU_ALWAYS_INLINE
433a3ef070eSClaudio Fontana void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
434a3ef070eSClaudio Fontana const target_ulong addr, uint32_t desc, const uintptr_t ra,
435a3ef070eSClaudio Fontana const int esz, uint32_t mtedesc, bool vertical,
436a3ef070eSClaudio Fontana sve_ldst1_host_fn *host_fn,
437a3ef070eSClaudio Fontana sve_ldst1_tlb_fn *tlb_fn,
438a3ef070eSClaudio Fontana ClearFn *clr_fn,
439a3ef070eSClaudio Fontana CopyFn *cpy_fn)
440a3ef070eSClaudio Fontana {
441a3ef070eSClaudio Fontana const intptr_t reg_max = simd_oprsz(desc);
442a3ef070eSClaudio Fontana const intptr_t esize = 1 << esz;
443a3ef070eSClaudio Fontana intptr_t reg_off, reg_last;
444a3ef070eSClaudio Fontana SVEContLdSt info;
445a3ef070eSClaudio Fontana void *host;
446a3ef070eSClaudio Fontana int flags;
447a3ef070eSClaudio Fontana
448a3ef070eSClaudio Fontana /* Find the active elements. */
449a3ef070eSClaudio Fontana if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
450a3ef070eSClaudio Fontana /* The entire predicate was false; no load occurs. */
451a3ef070eSClaudio Fontana clr_fn(za, 0, reg_max);
452a3ef070eSClaudio Fontana return;
453a3ef070eSClaudio Fontana }
454a3ef070eSClaudio Fontana
455a3ef070eSClaudio Fontana /* Probe the page(s). Exit with exception for any invalid page. */
456a3ef070eSClaudio Fontana sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
457a3ef070eSClaudio Fontana
458a3ef070eSClaudio Fontana /* Handle watchpoints for all active elements. */
459a3ef070eSClaudio Fontana sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
460a3ef070eSClaudio Fontana BP_MEM_READ, ra);
461a3ef070eSClaudio Fontana
462a3ef070eSClaudio Fontana /*
463a3ef070eSClaudio Fontana * Handle mte checks for all active elements.
464a3ef070eSClaudio Fontana * Since TBI must be set for MTE, !mtedesc => !mte_active.
465a3ef070eSClaudio Fontana */
466a3ef070eSClaudio Fontana if (mtedesc) {
467a3ef070eSClaudio Fontana sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
468a3ef070eSClaudio Fontana mtedesc, ra);
469a3ef070eSClaudio Fontana }
470a3ef070eSClaudio Fontana
471a3ef070eSClaudio Fontana flags = info.page[0].flags | info.page[1].flags;
472a3ef070eSClaudio Fontana if (unlikely(flags != 0)) {
473a3ef070eSClaudio Fontana #ifdef CONFIG_USER_ONLY
474a3ef070eSClaudio Fontana g_assert_not_reached();
475a3ef070eSClaudio Fontana #else
476a3ef070eSClaudio Fontana /*
477a3ef070eSClaudio Fontana * At least one page includes MMIO.
478a3ef070eSClaudio Fontana * Any bus operation can fail with cpu_transaction_failed,
479a3ef070eSClaudio Fontana * which for ARM will raise SyncExternal. Perform the load
480a3ef070eSClaudio Fontana * into scratch memory to preserve register state until the end.
481a3ef070eSClaudio Fontana */
482a3ef070eSClaudio Fontana ARMVectorReg scratch = { };
483a3ef070eSClaudio Fontana
484a3ef070eSClaudio Fontana reg_off = info.reg_off_first[0];
485a3ef070eSClaudio Fontana reg_last = info.reg_off_last[1];
486a3ef070eSClaudio Fontana if (reg_last < 0) {
487a3ef070eSClaudio Fontana reg_last = info.reg_off_split;
488a3ef070eSClaudio Fontana if (reg_last < 0) {
489a3ef070eSClaudio Fontana reg_last = info.reg_off_last[0];
490a3ef070eSClaudio Fontana }
491a3ef070eSClaudio Fontana }
492a3ef070eSClaudio Fontana
493a3ef070eSClaudio Fontana do {
494a3ef070eSClaudio Fontana uint64_t pg = vg[reg_off >> 6];
495a3ef070eSClaudio Fontana do {
496a3ef070eSClaudio Fontana if ((pg >> (reg_off & 63)) & 1) {
497a3ef070eSClaudio Fontana tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
498a3ef070eSClaudio Fontana }
499a3ef070eSClaudio Fontana reg_off += esize;
500a3ef070eSClaudio Fontana } while (reg_off & 63);
501a3ef070eSClaudio Fontana } while (reg_off <= reg_last);
502a3ef070eSClaudio Fontana
503a3ef070eSClaudio Fontana cpy_fn(za, &scratch, reg_max);
504a3ef070eSClaudio Fontana return;
505a3ef070eSClaudio Fontana #endif
506a3ef070eSClaudio Fontana }
507a3ef070eSClaudio Fontana
508a3ef070eSClaudio Fontana /* The entire operation is in RAM, on valid pages. */
509a3ef070eSClaudio Fontana
510a3ef070eSClaudio Fontana reg_off = info.reg_off_first[0];
511a3ef070eSClaudio Fontana reg_last = info.reg_off_last[0];
512a3ef070eSClaudio Fontana host = info.page[0].host;
513a3ef070eSClaudio Fontana
514a3ef070eSClaudio Fontana if (!vertical) {
515a3ef070eSClaudio Fontana memset(za, 0, reg_max);
516a3ef070eSClaudio Fontana } else if (reg_off) {
517a3ef070eSClaudio Fontana clr_fn(za, 0, reg_off);
518a3ef070eSClaudio Fontana }
519a3ef070eSClaudio Fontana
5203b9991e3SRichard Henderson set_helper_retaddr(ra);
5213b9991e3SRichard Henderson
522a3ef070eSClaudio Fontana while (reg_off <= reg_last) {
523a3ef070eSClaudio Fontana uint64_t pg = vg[reg_off >> 6];
524a3ef070eSClaudio Fontana do {
525a3ef070eSClaudio Fontana if ((pg >> (reg_off & 63)) & 1) {
526a3ef070eSClaudio Fontana host_fn(za, reg_off, host + reg_off);
527a3ef070eSClaudio Fontana } else if (vertical) {
528a3ef070eSClaudio Fontana clr_fn(za, reg_off, esize);
529a3ef070eSClaudio Fontana }
530a3ef070eSClaudio Fontana reg_off += esize;
531a3ef070eSClaudio Fontana } while (reg_off <= reg_last && (reg_off & 63));
532a3ef070eSClaudio Fontana }
533a3ef070eSClaudio Fontana
5343b9991e3SRichard Henderson clear_helper_retaddr();
5353b9991e3SRichard Henderson
536a3ef070eSClaudio Fontana /*
537a3ef070eSClaudio Fontana * Use the slow path to manage the cross-page misalignment.
538a3ef070eSClaudio Fontana * But we know this is RAM and cannot trap.
539a3ef070eSClaudio Fontana */
540a3ef070eSClaudio Fontana reg_off = info.reg_off_split;
541a3ef070eSClaudio Fontana if (unlikely(reg_off >= 0)) {
542a3ef070eSClaudio Fontana tlb_fn(env, za, reg_off, addr + reg_off, ra);
543a3ef070eSClaudio Fontana }
544a3ef070eSClaudio Fontana
545a3ef070eSClaudio Fontana reg_off = info.reg_off_first[1];
546a3ef070eSClaudio Fontana if (unlikely(reg_off >= 0)) {
547a3ef070eSClaudio Fontana reg_last = info.reg_off_last[1];
548a3ef070eSClaudio Fontana host = info.page[1].host;
549a3ef070eSClaudio Fontana
5503b9991e3SRichard Henderson set_helper_retaddr(ra);
5513b9991e3SRichard Henderson
552a3ef070eSClaudio Fontana do {
553a3ef070eSClaudio Fontana uint64_t pg = vg[reg_off >> 6];
554a3ef070eSClaudio Fontana do {
555a3ef070eSClaudio Fontana if ((pg >> (reg_off & 63)) & 1) {
556a3ef070eSClaudio Fontana host_fn(za, reg_off, host + reg_off);
557a3ef070eSClaudio Fontana } else if (vertical) {
558a3ef070eSClaudio Fontana clr_fn(za, reg_off, esize);
559a3ef070eSClaudio Fontana }
560a3ef070eSClaudio Fontana reg_off += esize;
561a3ef070eSClaudio Fontana } while (reg_off & 63);
562a3ef070eSClaudio Fontana } while (reg_off <= reg_last);
5633b9991e3SRichard Henderson
5643b9991e3SRichard Henderson clear_helper_retaddr();
565a3ef070eSClaudio Fontana }
566a3ef070eSClaudio Fontana }
567a3ef070eSClaudio Fontana
568a3ef070eSClaudio Fontana static inline QEMU_ALWAYS_INLINE
sme_ld1_mte(CPUARMState * env,void * za,uint64_t * vg,target_ulong addr,uint32_t desc,uintptr_t ra,const int esz,bool vertical,sve_ldst1_host_fn * host_fn,sve_ldst1_tlb_fn * tlb_fn,ClearFn * clr_fn,CopyFn * cpy_fn)569a3ef070eSClaudio Fontana void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
570a3ef070eSClaudio Fontana target_ulong addr, uint32_t desc, uintptr_t ra,
571a3ef070eSClaudio Fontana const int esz, bool vertical,
572a3ef070eSClaudio Fontana sve_ldst1_host_fn *host_fn,
573a3ef070eSClaudio Fontana sve_ldst1_tlb_fn *tlb_fn,
574a3ef070eSClaudio Fontana ClearFn *clr_fn,
575a3ef070eSClaudio Fontana CopyFn *cpy_fn)
576a3ef070eSClaudio Fontana {
577a3ef070eSClaudio Fontana uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
578a3ef070eSClaudio Fontana int bit55 = extract64(addr, 55, 1);
579a3ef070eSClaudio Fontana
580a3ef070eSClaudio Fontana /* Remove mtedesc from the normal sve descriptor. */
581a3ef070eSClaudio Fontana desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
582a3ef070eSClaudio Fontana
583a3ef070eSClaudio Fontana /* Perform gross MTE suppression early. */
584855f94ecSRichard Henderson if (!tbi_check(mtedesc, bit55) ||
585855f94ecSRichard Henderson tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
586a3ef070eSClaudio Fontana mtedesc = 0;
587a3ef070eSClaudio Fontana }
588a3ef070eSClaudio Fontana
589a3ef070eSClaudio Fontana sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
590a3ef070eSClaudio Fontana host_fn, tlb_fn, clr_fn, cpy_fn);
591a3ef070eSClaudio Fontana }
592a3ef070eSClaudio Fontana
593a3ef070eSClaudio Fontana #define DO_LD(L, END, ESZ) \
594a3ef070eSClaudio Fontana void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
595a3ef070eSClaudio Fontana target_ulong addr, uint32_t desc) \
596a3ef070eSClaudio Fontana { \
597a3ef070eSClaudio Fontana sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
598a3ef070eSClaudio Fontana sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
599a3ef070eSClaudio Fontana clear_horizontal, copy_horizontal); \
600a3ef070eSClaudio Fontana } \
601a3ef070eSClaudio Fontana void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
602a3ef070eSClaudio Fontana target_ulong addr, uint32_t desc) \
603a3ef070eSClaudio Fontana { \
604a3ef070eSClaudio Fontana sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
605a3ef070eSClaudio Fontana sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
606a3ef070eSClaudio Fontana clear_vertical_##L, copy_vertical_##L); \
607a3ef070eSClaudio Fontana } \
608a3ef070eSClaudio Fontana void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
609a3ef070eSClaudio Fontana target_ulong addr, uint32_t desc) \
610a3ef070eSClaudio Fontana { \
611a3ef070eSClaudio Fontana sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
612a3ef070eSClaudio Fontana sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
613a3ef070eSClaudio Fontana clear_horizontal, copy_horizontal); \
614a3ef070eSClaudio Fontana } \
615a3ef070eSClaudio Fontana void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
616a3ef070eSClaudio Fontana target_ulong addr, uint32_t desc) \
617a3ef070eSClaudio Fontana { \
618a3ef070eSClaudio Fontana sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
619a3ef070eSClaudio Fontana sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
620a3ef070eSClaudio Fontana clear_vertical_##L, copy_vertical_##L); \
621a3ef070eSClaudio Fontana }
622a3ef070eSClaudio Fontana
623a3ef070eSClaudio Fontana DO_LD(b, , MO_8)
DO_LD(h,_be,MO_16)624a3ef070eSClaudio Fontana DO_LD(h, _be, MO_16)
625a3ef070eSClaudio Fontana DO_LD(h, _le, MO_16)
626a3ef070eSClaudio Fontana DO_LD(s, _be, MO_32)
627a3ef070eSClaudio Fontana DO_LD(s, _le, MO_32)
628a3ef070eSClaudio Fontana DO_LD(d, _be, MO_64)
629a3ef070eSClaudio Fontana DO_LD(d, _le, MO_64)
630a3ef070eSClaudio Fontana DO_LD(q, _be, MO_128)
631a3ef070eSClaudio Fontana DO_LD(q, _le, MO_128)
632a3ef070eSClaudio Fontana
633a3ef070eSClaudio Fontana #undef DO_LD
634a3ef070eSClaudio Fontana
635a3ef070eSClaudio Fontana /*
636a3ef070eSClaudio Fontana * Common helper for all contiguous predicated stores.
637a3ef070eSClaudio Fontana */
638a3ef070eSClaudio Fontana
639a3ef070eSClaudio Fontana static inline QEMU_ALWAYS_INLINE
640a3ef070eSClaudio Fontana void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
641a3ef070eSClaudio Fontana const target_ulong addr, uint32_t desc, const uintptr_t ra,
642a3ef070eSClaudio Fontana const int esz, uint32_t mtedesc, bool vertical,
643a3ef070eSClaudio Fontana sve_ldst1_host_fn *host_fn,
644a3ef070eSClaudio Fontana sve_ldst1_tlb_fn *tlb_fn)
645a3ef070eSClaudio Fontana {
646a3ef070eSClaudio Fontana const intptr_t reg_max = simd_oprsz(desc);
647a3ef070eSClaudio Fontana const intptr_t esize = 1 << esz;
648a3ef070eSClaudio Fontana intptr_t reg_off, reg_last;
649a3ef070eSClaudio Fontana SVEContLdSt info;
650a3ef070eSClaudio Fontana void *host;
651a3ef070eSClaudio Fontana int flags;
652a3ef070eSClaudio Fontana
653a3ef070eSClaudio Fontana /* Find the active elements. */
654a3ef070eSClaudio Fontana if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
655a3ef070eSClaudio Fontana /* The entire predicate was false; no store occurs. */
656a3ef070eSClaudio Fontana return;
657a3ef070eSClaudio Fontana }
658a3ef070eSClaudio Fontana
659a3ef070eSClaudio Fontana /* Probe the page(s). Exit with exception for any invalid page. */
660a3ef070eSClaudio Fontana sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
661a3ef070eSClaudio Fontana
662a3ef070eSClaudio Fontana /* Handle watchpoints for all active elements. */
663a3ef070eSClaudio Fontana sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
664a3ef070eSClaudio Fontana BP_MEM_WRITE, ra);
665a3ef070eSClaudio Fontana
666a3ef070eSClaudio Fontana /*
667a3ef070eSClaudio Fontana * Handle mte checks for all active elements.
668a3ef070eSClaudio Fontana * Since TBI must be set for MTE, !mtedesc => !mte_active.
669a3ef070eSClaudio Fontana */
670a3ef070eSClaudio Fontana if (mtedesc) {
671a3ef070eSClaudio Fontana sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
672a3ef070eSClaudio Fontana mtedesc, ra);
673a3ef070eSClaudio Fontana }
674a3ef070eSClaudio Fontana
675a3ef070eSClaudio Fontana flags = info.page[0].flags | info.page[1].flags;
676a3ef070eSClaudio Fontana if (unlikely(flags != 0)) {
677a3ef070eSClaudio Fontana #ifdef CONFIG_USER_ONLY
678a3ef070eSClaudio Fontana g_assert_not_reached();
679a3ef070eSClaudio Fontana #else
680a3ef070eSClaudio Fontana /*
681a3ef070eSClaudio Fontana * At least one page includes MMIO.
682a3ef070eSClaudio Fontana * Any bus operation can fail with cpu_transaction_failed,
683a3ef070eSClaudio Fontana * which for ARM will raise SyncExternal. We cannot avoid
684a3ef070eSClaudio Fontana * this fault and will leave with the store incomplete.
685a3ef070eSClaudio Fontana */
686a3ef070eSClaudio Fontana reg_off = info.reg_off_first[0];
687a3ef070eSClaudio Fontana reg_last = info.reg_off_last[1];
688a3ef070eSClaudio Fontana if (reg_last < 0) {
689a3ef070eSClaudio Fontana reg_last = info.reg_off_split;
690a3ef070eSClaudio Fontana if (reg_last < 0) {
691a3ef070eSClaudio Fontana reg_last = info.reg_off_last[0];
692a3ef070eSClaudio Fontana }
693a3ef070eSClaudio Fontana }
694a3ef070eSClaudio Fontana
695a3ef070eSClaudio Fontana do {
696a3ef070eSClaudio Fontana uint64_t pg = vg[reg_off >> 6];
697a3ef070eSClaudio Fontana do {
698a3ef070eSClaudio Fontana if ((pg >> (reg_off & 63)) & 1) {
699a3ef070eSClaudio Fontana tlb_fn(env, za, reg_off, addr + reg_off, ra);
700a3ef070eSClaudio Fontana }
701a3ef070eSClaudio Fontana reg_off += esize;
702a3ef070eSClaudio Fontana } while (reg_off & 63);
703a3ef070eSClaudio Fontana } while (reg_off <= reg_last);
704a3ef070eSClaudio Fontana return;
705a3ef070eSClaudio Fontana #endif
706a3ef070eSClaudio Fontana }
707a3ef070eSClaudio Fontana
708a3ef070eSClaudio Fontana reg_off = info.reg_off_first[0];
709a3ef070eSClaudio Fontana reg_last = info.reg_off_last[0];
710a3ef070eSClaudio Fontana host = info.page[0].host;
711a3ef070eSClaudio Fontana
7123b9991e3SRichard Henderson set_helper_retaddr(ra);
7133b9991e3SRichard Henderson
714a3ef070eSClaudio Fontana while (reg_off <= reg_last) {
715a3ef070eSClaudio Fontana uint64_t pg = vg[reg_off >> 6];
716a3ef070eSClaudio Fontana do {
717a3ef070eSClaudio Fontana if ((pg >> (reg_off & 63)) & 1) {
718a3ef070eSClaudio Fontana host_fn(za, reg_off, host + reg_off);
719a3ef070eSClaudio Fontana }
720a3ef070eSClaudio Fontana reg_off += 1 << esz;
721a3ef070eSClaudio Fontana } while (reg_off <= reg_last && (reg_off & 63));
722a3ef070eSClaudio Fontana }
723a3ef070eSClaudio Fontana
7243b9991e3SRichard Henderson clear_helper_retaddr();
7253b9991e3SRichard Henderson
726a3ef070eSClaudio Fontana /*
727a3ef070eSClaudio Fontana * Use the slow path to manage the cross-page misalignment.
728a3ef070eSClaudio Fontana * But we know this is RAM and cannot trap.
729a3ef070eSClaudio Fontana */
730a3ef070eSClaudio Fontana reg_off = info.reg_off_split;
731a3ef070eSClaudio Fontana if (unlikely(reg_off >= 0)) {
732a3ef070eSClaudio Fontana tlb_fn(env, za, reg_off, addr + reg_off, ra);
733a3ef070eSClaudio Fontana }
734a3ef070eSClaudio Fontana
735a3ef070eSClaudio Fontana reg_off = info.reg_off_first[1];
736a3ef070eSClaudio Fontana if (unlikely(reg_off >= 0)) {
737a3ef070eSClaudio Fontana reg_last = info.reg_off_last[1];
738a3ef070eSClaudio Fontana host = info.page[1].host;
739a3ef070eSClaudio Fontana
7403b9991e3SRichard Henderson set_helper_retaddr(ra);
7413b9991e3SRichard Henderson
742a3ef070eSClaudio Fontana do {
743a3ef070eSClaudio Fontana uint64_t pg = vg[reg_off >> 6];
744a3ef070eSClaudio Fontana do {
745a3ef070eSClaudio Fontana if ((pg >> (reg_off & 63)) & 1) {
746a3ef070eSClaudio Fontana host_fn(za, reg_off, host + reg_off);
747a3ef070eSClaudio Fontana }
748a3ef070eSClaudio Fontana reg_off += 1 << esz;
749a3ef070eSClaudio Fontana } while (reg_off & 63);
750a3ef070eSClaudio Fontana } while (reg_off <= reg_last);
7513b9991e3SRichard Henderson
7523b9991e3SRichard Henderson clear_helper_retaddr();
753a3ef070eSClaudio Fontana }
754a3ef070eSClaudio Fontana }
755a3ef070eSClaudio Fontana
756a3ef070eSClaudio Fontana static inline QEMU_ALWAYS_INLINE
sme_st1_mte(CPUARMState * env,void * za,uint64_t * vg,target_ulong addr,uint32_t desc,uintptr_t ra,int esz,bool vertical,sve_ldst1_host_fn * host_fn,sve_ldst1_tlb_fn * tlb_fn)757a3ef070eSClaudio Fontana void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
758a3ef070eSClaudio Fontana uint32_t desc, uintptr_t ra, int esz, bool vertical,
759a3ef070eSClaudio Fontana sve_ldst1_host_fn *host_fn,
760a3ef070eSClaudio Fontana sve_ldst1_tlb_fn *tlb_fn)
761a3ef070eSClaudio Fontana {
762a3ef070eSClaudio Fontana uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
763a3ef070eSClaudio Fontana int bit55 = extract64(addr, 55, 1);
764a3ef070eSClaudio Fontana
765a3ef070eSClaudio Fontana /* Remove mtedesc from the normal sve descriptor. */
766a3ef070eSClaudio Fontana desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
767a3ef070eSClaudio Fontana
768a3ef070eSClaudio Fontana /* Perform gross MTE suppression early. */
769855f94ecSRichard Henderson if (!tbi_check(mtedesc, bit55) ||
770855f94ecSRichard Henderson tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
771a3ef070eSClaudio Fontana mtedesc = 0;
772a3ef070eSClaudio Fontana }
773a3ef070eSClaudio Fontana
774a3ef070eSClaudio Fontana sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
775a3ef070eSClaudio Fontana vertical, host_fn, tlb_fn);
776a3ef070eSClaudio Fontana }
777a3ef070eSClaudio Fontana
778a3ef070eSClaudio Fontana #define DO_ST(L, END, ESZ) \
779a3ef070eSClaudio Fontana void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
780a3ef070eSClaudio Fontana target_ulong addr, uint32_t desc) \
781a3ef070eSClaudio Fontana { \
782a3ef070eSClaudio Fontana sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
783a3ef070eSClaudio Fontana sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
784a3ef070eSClaudio Fontana } \
785a3ef070eSClaudio Fontana void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
786a3ef070eSClaudio Fontana target_ulong addr, uint32_t desc) \
787a3ef070eSClaudio Fontana { \
788a3ef070eSClaudio Fontana sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
789a3ef070eSClaudio Fontana sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
790a3ef070eSClaudio Fontana } \
791a3ef070eSClaudio Fontana void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
792a3ef070eSClaudio Fontana target_ulong addr, uint32_t desc) \
793a3ef070eSClaudio Fontana { \
794a3ef070eSClaudio Fontana sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
795a3ef070eSClaudio Fontana sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
796a3ef070eSClaudio Fontana } \
797a3ef070eSClaudio Fontana void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
798a3ef070eSClaudio Fontana target_ulong addr, uint32_t desc) \
799a3ef070eSClaudio Fontana { \
800a3ef070eSClaudio Fontana sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
801a3ef070eSClaudio Fontana sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
802a3ef070eSClaudio Fontana }
803a3ef070eSClaudio Fontana
804a3ef070eSClaudio Fontana DO_ST(b, , MO_8)
DO_ST(h,_be,MO_16)805a3ef070eSClaudio Fontana DO_ST(h, _be, MO_16)
806a3ef070eSClaudio Fontana DO_ST(h, _le, MO_16)
807a3ef070eSClaudio Fontana DO_ST(s, _be, MO_32)
808a3ef070eSClaudio Fontana DO_ST(s, _le, MO_32)
809a3ef070eSClaudio Fontana DO_ST(d, _be, MO_64)
810a3ef070eSClaudio Fontana DO_ST(d, _le, MO_64)
811a3ef070eSClaudio Fontana DO_ST(q, _be, MO_128)
812a3ef070eSClaudio Fontana DO_ST(q, _le, MO_128)
813a3ef070eSClaudio Fontana
814a3ef070eSClaudio Fontana #undef DO_ST
815a3ef070eSClaudio Fontana
816a3ef070eSClaudio Fontana void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
817a3ef070eSClaudio Fontana void *vpm, uint32_t desc)
818a3ef070eSClaudio Fontana {
819a3ef070eSClaudio Fontana intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
820a3ef070eSClaudio Fontana uint64_t *pn = vpn, *pm = vpm;
821a3ef070eSClaudio Fontana uint32_t *zda = vzda, *zn = vzn;
822a3ef070eSClaudio Fontana
823a3ef070eSClaudio Fontana for (row = 0; row < oprsz; ) {
824a3ef070eSClaudio Fontana uint64_t pa = pn[row >> 4];
825a3ef070eSClaudio Fontana do {
826a3ef070eSClaudio Fontana if (pa & 1) {
827a3ef070eSClaudio Fontana for (col = 0; col < oprsz; ) {
828a3ef070eSClaudio Fontana uint64_t pb = pm[col >> 4];
829a3ef070eSClaudio Fontana do {
830a3ef070eSClaudio Fontana if (pb & 1) {
831a3ef070eSClaudio Fontana zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
832a3ef070eSClaudio Fontana }
833a3ef070eSClaudio Fontana pb >>= 4;
834a3ef070eSClaudio Fontana } while (++col & 15);
835a3ef070eSClaudio Fontana }
836a3ef070eSClaudio Fontana }
837a3ef070eSClaudio Fontana pa >>= 4;
838a3ef070eSClaudio Fontana } while (++row & 15);
839a3ef070eSClaudio Fontana }
840a3ef070eSClaudio Fontana }
841a3ef070eSClaudio Fontana
HELPER(sme_addha_d)842a3ef070eSClaudio Fontana void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
843a3ef070eSClaudio Fontana void *vpm, uint32_t desc)
844a3ef070eSClaudio Fontana {
845a3ef070eSClaudio Fontana intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
846a3ef070eSClaudio Fontana uint8_t *pn = vpn, *pm = vpm;
847a3ef070eSClaudio Fontana uint64_t *zda = vzda, *zn = vzn;
848a3ef070eSClaudio Fontana
849a3ef070eSClaudio Fontana for (row = 0; row < oprsz; ++row) {
850a3ef070eSClaudio Fontana if (pn[H1(row)] & 1) {
851a3ef070eSClaudio Fontana for (col = 0; col < oprsz; ++col) {
852a3ef070eSClaudio Fontana if (pm[H1(col)] & 1) {
853a3ef070eSClaudio Fontana zda[tile_vslice_index(row) + col] += zn[col];
854a3ef070eSClaudio Fontana }
855a3ef070eSClaudio Fontana }
856a3ef070eSClaudio Fontana }
857a3ef070eSClaudio Fontana }
858a3ef070eSClaudio Fontana }
859a3ef070eSClaudio Fontana
HELPER(sme_addva_s)860a3ef070eSClaudio Fontana void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
861a3ef070eSClaudio Fontana void *vpm, uint32_t desc)
862a3ef070eSClaudio Fontana {
863a3ef070eSClaudio Fontana intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
864a3ef070eSClaudio Fontana uint64_t *pn = vpn, *pm = vpm;
865a3ef070eSClaudio Fontana uint32_t *zda = vzda, *zn = vzn;
866a3ef070eSClaudio Fontana
867a3ef070eSClaudio Fontana for (row = 0; row < oprsz; ) {
868a3ef070eSClaudio Fontana uint64_t pa = pn[row >> 4];
869a3ef070eSClaudio Fontana do {
870a3ef070eSClaudio Fontana if (pa & 1) {
871a3ef070eSClaudio Fontana uint32_t zn_row = zn[H4(row)];
872a3ef070eSClaudio Fontana for (col = 0; col < oprsz; ) {
873a3ef070eSClaudio Fontana uint64_t pb = pm[col >> 4];
874a3ef070eSClaudio Fontana do {
875a3ef070eSClaudio Fontana if (pb & 1) {
876a3ef070eSClaudio Fontana zda[tile_vslice_index(row) + H4(col)] += zn_row;
877a3ef070eSClaudio Fontana }
878a3ef070eSClaudio Fontana pb >>= 4;
879a3ef070eSClaudio Fontana } while (++col & 15);
880a3ef070eSClaudio Fontana }
881a3ef070eSClaudio Fontana }
882a3ef070eSClaudio Fontana pa >>= 4;
883a3ef070eSClaudio Fontana } while (++row & 15);
884a3ef070eSClaudio Fontana }
885a3ef070eSClaudio Fontana }
886a3ef070eSClaudio Fontana
HELPER(sme_addva_d)887a3ef070eSClaudio Fontana void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
888a3ef070eSClaudio Fontana void *vpm, uint32_t desc)
889a3ef070eSClaudio Fontana {
890a3ef070eSClaudio Fontana intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
891a3ef070eSClaudio Fontana uint8_t *pn = vpn, *pm = vpm;
892a3ef070eSClaudio Fontana uint64_t *zda = vzda, *zn = vzn;
893a3ef070eSClaudio Fontana
894a3ef070eSClaudio Fontana for (row = 0; row < oprsz; ++row) {
895a3ef070eSClaudio Fontana if (pn[H1(row)] & 1) {
896a3ef070eSClaudio Fontana uint64_t zn_row = zn[row];
897a3ef070eSClaudio Fontana for (col = 0; col < oprsz; ++col) {
898a3ef070eSClaudio Fontana if (pm[H1(col)] & 1) {
899a3ef070eSClaudio Fontana zda[tile_vslice_index(row) + col] += zn_row;
900a3ef070eSClaudio Fontana }
901a3ef070eSClaudio Fontana }
902a3ef070eSClaudio Fontana }
903a3ef070eSClaudio Fontana }
904a3ef070eSClaudio Fontana }
905a3ef070eSClaudio Fontana
HELPER(sme_fmopa_s)906a3ef070eSClaudio Fontana void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
907a3ef070eSClaudio Fontana void *vpm, void *vst, uint32_t desc)
908a3ef070eSClaudio Fontana {
909a3ef070eSClaudio Fontana intptr_t row, col, oprsz = simd_maxsz(desc);
910a3ef070eSClaudio Fontana uint32_t neg = simd_data(desc) << 31;
911a3ef070eSClaudio Fontana uint16_t *pn = vpn, *pm = vpm;
912a3ef070eSClaudio Fontana float_status fpst;
913a3ef070eSClaudio Fontana
914a3ef070eSClaudio Fontana /*
915a3ef070eSClaudio Fontana * Make a copy of float_status because this operation does not
916a3ef070eSClaudio Fontana * update the cumulative fp exception status. It also produces
917a3ef070eSClaudio Fontana * default nans.
918a3ef070eSClaudio Fontana */
919a3ef070eSClaudio Fontana fpst = *(float_status *)vst;
920a3ef070eSClaudio Fontana set_default_nan_mode(true, &fpst);
921a3ef070eSClaudio Fontana
922a3ef070eSClaudio Fontana for (row = 0; row < oprsz; ) {
923a3ef070eSClaudio Fontana uint16_t pa = pn[H2(row >> 4)];
924a3ef070eSClaudio Fontana do {
925a3ef070eSClaudio Fontana if (pa & 1) {
926a3ef070eSClaudio Fontana void *vza_row = vza + tile_vslice_offset(row);
927a3ef070eSClaudio Fontana uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
928a3ef070eSClaudio Fontana
929a3ef070eSClaudio Fontana for (col = 0; col < oprsz; ) {
930a3ef070eSClaudio Fontana uint16_t pb = pm[H2(col >> 4)];
931a3ef070eSClaudio Fontana do {
932a3ef070eSClaudio Fontana if (pb & 1) {
933a3ef070eSClaudio Fontana uint32_t *a = vza_row + H1_4(col);
934a3ef070eSClaudio Fontana uint32_t *m = vzm + H1_4(col);
93531d93fedSDaniyal Khan *a = float32_muladd(n, *m, *a, 0, &fpst);
936a3ef070eSClaudio Fontana }
937a3ef070eSClaudio Fontana col += 4;
938a3ef070eSClaudio Fontana pb >>= 4;
939a3ef070eSClaudio Fontana } while (col & 15);
940a3ef070eSClaudio Fontana }
941a3ef070eSClaudio Fontana }
942a3ef070eSClaudio Fontana row += 4;
943a3ef070eSClaudio Fontana pa >>= 4;
944a3ef070eSClaudio Fontana } while (row & 15);
945a3ef070eSClaudio Fontana }
946a3ef070eSClaudio Fontana }
947a3ef070eSClaudio Fontana
HELPER(sme_fmopa_d)948a3ef070eSClaudio Fontana void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
949a3ef070eSClaudio Fontana void *vpm, void *vst, uint32_t desc)
950a3ef070eSClaudio Fontana {
951a3ef070eSClaudio Fontana intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
952a3ef070eSClaudio Fontana uint64_t neg = (uint64_t)simd_data(desc) << 63;
953a3ef070eSClaudio Fontana uint64_t *za = vza, *zn = vzn, *zm = vzm;
954a3ef070eSClaudio Fontana uint8_t *pn = vpn, *pm = vpm;
955a3ef070eSClaudio Fontana float_status fpst = *(float_status *)vst;
956a3ef070eSClaudio Fontana
957a3ef070eSClaudio Fontana set_default_nan_mode(true, &fpst);
958a3ef070eSClaudio Fontana
959a3ef070eSClaudio Fontana for (row = 0; row < oprsz; ++row) {
960a3ef070eSClaudio Fontana if (pn[H1(row)] & 1) {
961a3ef070eSClaudio Fontana uint64_t *za_row = &za[tile_vslice_index(row)];
962a3ef070eSClaudio Fontana uint64_t n = zn[row] ^ neg;
963a3ef070eSClaudio Fontana
964a3ef070eSClaudio Fontana for (col = 0; col < oprsz; ++col) {
965a3ef070eSClaudio Fontana if (pm[H1(col)] & 1) {
966a3ef070eSClaudio Fontana uint64_t *a = &za_row[col];
967a3ef070eSClaudio Fontana *a = float64_muladd(n, zm[col], *a, 0, &fpst);
968a3ef070eSClaudio Fontana }
969a3ef070eSClaudio Fontana }
970a3ef070eSClaudio Fontana }
971a3ef070eSClaudio Fontana }
972a3ef070eSClaudio Fontana }
973a3ef070eSClaudio Fontana
/*
 * Prepare a pair of 16-bit values packed in PAIR for a widening outer
 * product: XOR in NEG, then zero each half whose controlling predicate
 * bit is clear.  Bit 0 of PG governs the low half, bit 2 the high half;
 * no other bit of PG is examined.
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    uint32_t keep = 0;

    if (pg & 1) {
        keep |= 0x0000ffffu;
    }
    if (pg & 4) {
        keep |= 0xffff0000u;
    }

    /*
     * The pseudocode negates conditionally after the conditional zero.
     * Negating unconditionally and masking afterwards gives the same
     * result and is simpler.
     */
    return (pair ^ neg) & keep;
}
993a3ef070eSClaudio Fontana
f16_dotadd(float32 sum,uint32_t e1,uint32_t e2,float_status * s_f16,float_status * s_std,float_status * s_odd)994a3ef070eSClaudio Fontana static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
99555f9f4eeSPeter Maydell float_status *s_f16, float_status *s_std,
99655f9f4eeSPeter Maydell float_status *s_odd)
997a3ef070eSClaudio Fontana {
99855f9f4eeSPeter Maydell /*
99955f9f4eeSPeter Maydell * We need three different float_status for different parts of this
100055f9f4eeSPeter Maydell * operation:
100155f9f4eeSPeter Maydell * - the input conversion of the float16 values must use the
100255f9f4eeSPeter Maydell * f16-specific float_status, so that the FPCR.FZ16 control is applied
100355f9f4eeSPeter Maydell * - operations on float32 including the final accumulation must use
100455f9f4eeSPeter Maydell * the normal float_status, so that FPCR.FZ is applied
100555f9f4eeSPeter Maydell * - we have pre-set-up copy of s_std which is set to round-to-odd,
100655f9f4eeSPeter Maydell * for the multiply (see below)
100755f9f4eeSPeter Maydell */
100855f9f4eeSPeter Maydell float64 e1r = float16_to_float64(e1 & 0xffff, true, s_f16);
100955f9f4eeSPeter Maydell float64 e1c = float16_to_float64(e1 >> 16, true, s_f16);
101055f9f4eeSPeter Maydell float64 e2r = float16_to_float64(e2 & 0xffff, true, s_f16);
101155f9f4eeSPeter Maydell float64 e2c = float16_to_float64(e2 >> 16, true, s_f16);
1012a3ef070eSClaudio Fontana float64 t64;
1013a3ef070eSClaudio Fontana float32 t32;
1014a3ef070eSClaudio Fontana
1015a3ef070eSClaudio Fontana /*
1016a3ef070eSClaudio Fontana * The ARM pseudocode function FPDot performs both multiplies
1017a3ef070eSClaudio Fontana * and the add with a single rounding operation. Emulate this
1018a3ef070eSClaudio Fontana * by performing the first multiply in round-to-odd, then doing
1019a3ef070eSClaudio Fontana * the second multiply as fused multiply-add, and rounding to
1020a3ef070eSClaudio Fontana * float32 all in one step.
1021a3ef070eSClaudio Fontana */
1022a3ef070eSClaudio Fontana t64 = float64_mul(e1r, e2r, s_odd);
1023a3ef070eSClaudio Fontana t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
1024a3ef070eSClaudio Fontana
1025a3ef070eSClaudio Fontana /* This conversion is exact, because we've already rounded. */
1026a3ef070eSClaudio Fontana t32 = float64_to_float32(t64, s_std);
1027a3ef070eSClaudio Fontana
1028a3ef070eSClaudio Fontana /* The final accumulation step is not fused. */
1029a3ef070eSClaudio Fontana return float32_add(sum, t32, s_std);
1030a3ef070eSClaudio Fontana }
1031a3ef070eSClaudio Fontana
HELPER(sme_fmopa_h)1032a3ef070eSClaudio Fontana void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
103355f9f4eeSPeter Maydell void *vpm, CPUARMState *env, uint32_t desc)
1034a3ef070eSClaudio Fontana {
1035a3ef070eSClaudio Fontana intptr_t row, col, oprsz = simd_maxsz(desc);
1036a3ef070eSClaudio Fontana uint32_t neg = simd_data(desc) * 0x80008000u;
1037a3ef070eSClaudio Fontana uint16_t *pn = vpn, *pm = vpm;
103855f9f4eeSPeter Maydell float_status fpst_odd, fpst_std, fpst_f16;
1039a3ef070eSClaudio Fontana
1040a3ef070eSClaudio Fontana /*
104155f9f4eeSPeter Maydell * Make copies of fp_status and fp_status_f16, because this operation
104255f9f4eeSPeter Maydell * does not update the cumulative fp exception status. It also
104355f9f4eeSPeter Maydell * produces default NaNs. We also need a second copy of fp_status with
104455f9f4eeSPeter Maydell * round-to-odd -- see above.
1045a3ef070eSClaudio Fontana */
104655f9f4eeSPeter Maydell fpst_f16 = env->vfp.fp_status_f16;
104755f9f4eeSPeter Maydell fpst_std = env->vfp.fp_status;
1048a3ef070eSClaudio Fontana set_default_nan_mode(true, &fpst_std);
104955f9f4eeSPeter Maydell set_default_nan_mode(true, &fpst_f16);
1050a3ef070eSClaudio Fontana fpst_odd = fpst_std;
1051a3ef070eSClaudio Fontana set_float_rounding_mode(float_round_to_odd, &fpst_odd);
1052a3ef070eSClaudio Fontana
1053a3ef070eSClaudio Fontana for (row = 0; row < oprsz; ) {
1054a3ef070eSClaudio Fontana uint16_t prow = pn[H2(row >> 4)];
1055a3ef070eSClaudio Fontana do {
1056a3ef070eSClaudio Fontana void *vza_row = vza + tile_vslice_offset(row);
1057a3ef070eSClaudio Fontana uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1058a3ef070eSClaudio Fontana
1059a3ef070eSClaudio Fontana n = f16mop_adj_pair(n, prow, neg);
1060a3ef070eSClaudio Fontana
1061a3ef070eSClaudio Fontana for (col = 0; col < oprsz; ) {
1062a3ef070eSClaudio Fontana uint16_t pcol = pm[H2(col >> 4)];
1063a3ef070eSClaudio Fontana do {
1064a3ef070eSClaudio Fontana if (prow & pcol & 0b0101) {
1065a3ef070eSClaudio Fontana uint32_t *a = vza_row + H1_4(col);
1066a3ef070eSClaudio Fontana uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1067a3ef070eSClaudio Fontana
1068a3ef070eSClaudio Fontana m = f16mop_adj_pair(m, pcol, 0);
106955f9f4eeSPeter Maydell *a = f16_dotadd(*a, n, m,
107055f9f4eeSPeter Maydell &fpst_f16, &fpst_std, &fpst_odd);
10713efd8495SRichard Henderson }
1072a3ef070eSClaudio Fontana col += 4;
1073a3ef070eSClaudio Fontana pcol >>= 4;
1074a3ef070eSClaudio Fontana } while (col & 15);
1075a3ef070eSClaudio Fontana }
1076a3ef070eSClaudio Fontana row += 4;
1077a3ef070eSClaudio Fontana prow >>= 4;
1078a3ef070eSClaudio Fontana } while (row & 15);
1079a3ef070eSClaudio Fontana }
1080a3ef070eSClaudio Fontana }
1081a3ef070eSClaudio Fontana
HELPER(sme_bfmopa)1082ecabcfa4SPeter Maydell void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm,
1083ecabcfa4SPeter Maydell void *vpn, void *vpm, CPUARMState *env, uint32_t desc)
1084a3ef070eSClaudio Fontana {
1085a3ef070eSClaudio Fontana intptr_t row, col, oprsz = simd_maxsz(desc);
1086a3ef070eSClaudio Fontana uint32_t neg = simd_data(desc) * 0x80008000u;
1087a3ef070eSClaudio Fontana uint16_t *pn = vpn, *pm = vpm;
1088*09b0d9e0SPeter Maydell float_status fpst, fpst_odd;
1089a3ef070eSClaudio Fontana
1090*09b0d9e0SPeter Maydell if (is_ebf(env, &fpst, &fpst_odd)) {
1091a3ef070eSClaudio Fontana for (row = 0; row < oprsz; ) {
1092a3ef070eSClaudio Fontana uint16_t prow = pn[H2(row >> 4)];
1093a3ef070eSClaudio Fontana do {
1094a3ef070eSClaudio Fontana void *vza_row = vza + tile_vslice_offset(row);
1095a3ef070eSClaudio Fontana uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1096a3ef070eSClaudio Fontana
1097a3ef070eSClaudio Fontana n = f16mop_adj_pair(n, prow, neg);
1098a3ef070eSClaudio Fontana
1099a3ef070eSClaudio Fontana for (col = 0; col < oprsz; ) {
1100a3ef070eSClaudio Fontana uint16_t pcol = pm[H2(col >> 4)];
1101a3ef070eSClaudio Fontana do {
1102a3ef070eSClaudio Fontana if (prow & pcol & 0b0101) {
1103a3ef070eSClaudio Fontana uint32_t *a = vza_row + H1_4(col);
1104a3ef070eSClaudio Fontana uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1105a3ef070eSClaudio Fontana
1106a3ef070eSClaudio Fontana m = f16mop_adj_pair(m, pcol, 0);
1107*09b0d9e0SPeter Maydell *a = bfdotadd_ebf(*a, n, m, &fpst, &fpst_odd);
11083efd8495SRichard Henderson }
1109a3ef070eSClaudio Fontana col += 4;
1110a3ef070eSClaudio Fontana pcol >>= 4;
1111a3ef070eSClaudio Fontana } while (col & 15);
1112a3ef070eSClaudio Fontana }
1113a3ef070eSClaudio Fontana row += 4;
1114a3ef070eSClaudio Fontana prow >>= 4;
1115a3ef070eSClaudio Fontana } while (row & 15);
1116a3ef070eSClaudio Fontana }
1117*09b0d9e0SPeter Maydell } else {
1118*09b0d9e0SPeter Maydell for (row = 0; row < oprsz; ) {
1119*09b0d9e0SPeter Maydell uint16_t prow = pn[H2(row >> 4)];
1120*09b0d9e0SPeter Maydell do {
1121*09b0d9e0SPeter Maydell void *vza_row = vza + tile_vslice_offset(row);
1122*09b0d9e0SPeter Maydell uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1123*09b0d9e0SPeter Maydell
1124*09b0d9e0SPeter Maydell n = f16mop_adj_pair(n, prow, neg);
1125*09b0d9e0SPeter Maydell
1126*09b0d9e0SPeter Maydell for (col = 0; col < oprsz; ) {
1127*09b0d9e0SPeter Maydell uint16_t pcol = pm[H2(col >> 4)];
1128*09b0d9e0SPeter Maydell do {
1129*09b0d9e0SPeter Maydell if (prow & pcol & 0b0101) {
1130*09b0d9e0SPeter Maydell uint32_t *a = vza_row + H1_4(col);
1131*09b0d9e0SPeter Maydell uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1132*09b0d9e0SPeter Maydell
1133*09b0d9e0SPeter Maydell m = f16mop_adj_pair(m, pcol, 0);
1134*09b0d9e0SPeter Maydell *a = bfdotadd(*a, n, m, &fpst);
1135*09b0d9e0SPeter Maydell }
1136*09b0d9e0SPeter Maydell col += 4;
1137*09b0d9e0SPeter Maydell pcol >>= 4;
1138*09b0d9e0SPeter Maydell } while (col & 15);
1139*09b0d9e0SPeter Maydell }
1140*09b0d9e0SPeter Maydell row += 4;
1141*09b0d9e0SPeter Maydell prow >>= 4;
1142*09b0d9e0SPeter Maydell } while (row & 15);
1143*09b0d9e0SPeter Maydell }
1144*09b0d9e0SPeter Maydell }
1145a3ef070eSClaudio Fontana }
1146a3ef070eSClaudio Fontana
/*
 * Per-element callback for the 32-bit integer outer products:
 * (n, m, accumulator, predicate bits, negate) -> new accumulator.
 */
typedef uint32_t IMOPFn32(uint32_t, uint32_t, uint32_t, uint8_t, bool);

/*
 * Apply FN to every (row, col) element of a tile of 32-bit elements.
 *
 * Nothing is skipped here: FN is called for every element and receives
 * the AND of the row and column predicate bits, so predication is left
 * to FN.  Each predicate byte holds the bits for two elements, the
 * even-indexed one in the low nibble and the odd-indexed one in the
 * high nibble.  The row nibble is masked to 4 bits, so the combined
 * value passed to FN never exceeds 4 bits.  simd_data(desc) is passed
 * to FN as the negate flag.
 */
static inline void do_imopa_s(uint32_t *za, uint32_t *zn, uint32_t *zm,
                              uint8_t *pn, uint8_t *pm,
                              uint32_t desc, IMOPFn32 *fn)
{
    const intptr_t nelem = simd_oprsz(desc) / 4;
    const bool neg = simd_data(desc);
    intptr_t row, col;

    for (row = 0; row < nelem; ++row) {
        const uint8_t prow_byte = pn[H1(row >> 1)];
        const uint8_t pa = ((row & 1) ? prow_byte >> 4 : prow_byte) & 0xf;
        uint32_t *za_row = &za[tile_vslice_index(row)];
        const uint32_t n = zn[H4(row)];

        for (col = 0; col < nelem; ++col) {
            const uint8_t pcol_byte = pm[H1(col >> 1)];
            /* Not masked: PA above already limits the AND to 4 bits. */
            const uint8_t pb = (col & 1) ? pcol_byte >> 4 : pcol_byte;
            uint32_t *acc = &za_row[H4(col)];

            *acc = fn(n, zm[H4(col)], *acc, pa & pb, neg);
        }
    }
}
1168d572bcb2SRichard Henderson
/*
 * Per-element worker for the 64-bit integer outer product.
 * Arguments, in order: the zn element, the zm element, the current
 * accumulator, the combined predicate bits, and the negate flag.
 * The return value is stored back as the new accumulator.
 */
typedef uint64_t IMOPFn64(uint64_t, uint64_t, uint64_t, uint8_t, bool);

/*
 * Visit every (row, col) pair and replace the 64-bit accumulator at
 * za[tile_vslice_index(row)][col] with
 *     fn(zn[row], zm[col], accumulator, pn[row] & pm[col], neg).
 *
 * The element count is simd_oprsz(desc) / 8 and the negate flag is
 * simd_data(desc) converted to bool.  Each 64-bit element has a whole
 * predicate byte of its own, so no nibble extraction is required.
 * NOTE(review): simd_oprsz() is presumably the vector size in bytes and
 * tile_vslice_index() presumably maps a row number to the start of that
 * row inside the ZA storage -- confirm against their definitions.
 */
static inline void do_imopa_d(uint64_t *za, uint64_t *zn, uint64_t *zm,
                              uint8_t *pn, uint8_t *pm,
                              uint32_t desc, IMOPFn64 *fn)
{
    const intptr_t elements = simd_oprsz(desc) / 8;
    const bool negate = simd_data(desc);
    intptr_t r, c;

    for (r = 0; r < elements; r++) {
        uint8_t row_pred = pn[H1(r)];
        uint64_t *acc_row = &za[tile_vslice_index(r)];
        uint64_t n_elt = zn[r];

        for (c = 0; c < elements; c++) {
            uint8_t col_pred = pm[H1(c)];
            uint64_t *acc = &acc_row[c];

            *acc = fn(n_elt, zm[c], *acc, row_pred & col_pred, negate);
        }
    }
}
1190a3ef070eSClaudio Fontana
/*
 * DEF_IMOP_32(NAME, NTYPE, MTYPE)
 *
 * Define NAME(n, m, a, p, neg): treat n and m as four packed 8-bit lanes,
 * reinterpret each lane of n as NTYPE and each lane of m as MTYPE, and
 * sum the four lane products.  The sum is subtracted from the
 * accumulator a when neg is set, otherwise added to it.
 *
 * Before multiplying, n is ANDed with expand_pred_b(p), which applies P
 * to N as a mask so that inactive lanes are zero and contribute nothing.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t dot = 0;                                                       \
    unsigned shift;                                                         \
                                                                            \
    n &= expand_pred_b(p);                                                  \
    for (shift = 0; shift < 32; shift += 8) {                               \
        dot += (NTYPE)(n >> shift) * (MTYPE)(m >> shift);                   \
    }                                                                       \
    if (neg) {                                                              \
        return a - dot;                                                     \
    }                                                                       \
    return a + dot;                                                         \
}

/* 8-bit lanes accumulated into 32 bits: N signedness, then M signedness. */
DEF_IMOP_32(smopa_s, int8_t, int8_t)
DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
DEF_IMOP_32(usmopa_s, uint8_t, int8_t)

/*
 * DEF_IMOP_64(NAME, NTYPE, MTYPE)
 *
 * 64-bit counterpart of DEF_IMOP_32: n and m hold four packed 16-bit
 * lanes, and n is masked with expand_pred_h(p) so inactive lanes are zero.
 *
 * The N lane is widened to int64_t before the multiply so that the
 * product is evaluated in 64 bits rather than in plain int.
 */
#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint64_t dot = 0;                                                       \
    unsigned shift;                                                         \
                                                                            \
    n &= expand_pred_h(p);                                                  \
    for (shift = 0; shift < 64; shift += 16) {                              \
        dot += (int64_t)(NTYPE)(n >> shift) * (MTYPE)(m >> shift);          \
    }                                                                       \
    if (neg) {                                                              \
        return a - dot;                                                     \
    }                                                                       \
    return a + dot;                                                         \
}

/* 16-bit lanes accumulated into 64 bits: N signedness, then M signedness. */
DEF_IMOP_64(smopa_d, int16_t, int16_t)
DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
1226a3ef070eSClaudio Fontana
1227d572bcb2SRichard Henderson #define DEF_IMOPH(NAME, S) \
1228d572bcb2SRichard Henderson void HELPER(sme_##NAME##_##S)(void *vza, void *vzn, void *vzm, \
1229d572bcb2SRichard Henderson void *vpn, void *vpm, uint32_t desc) \
1230d572bcb2SRichard Henderson { do_imopa_##S(vza, vzn, vzm, vpn, vpm, desc, NAME##_##S); }
1231a3ef070eSClaudio Fontana
1232d572bcb2SRichard Henderson DEF_IMOPH(smopa, s)
1233d572bcb2SRichard Henderson DEF_IMOPH(umopa, s)
1234d572bcb2SRichard Henderson DEF_IMOPH(sumopa, s)
1235d572bcb2SRichard Henderson DEF_IMOPH(usmopa, s)
1236d572bcb2SRichard Henderson
1237d572bcb2SRichard Henderson DEF_IMOPH(smopa, d)
1238d572bcb2SRichard Henderson DEF_IMOPH(umopa, d)
1239d572bcb2SRichard Henderson DEF_IMOPH(sumopa, d)
1240d572bcb2SRichard Henderson DEF_IMOPH(usmopa, d)
1241