1 /*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/page-protection.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
29 #include "vec_internal.h"
30 #include "sve_ldst_internal.h"
31 #include "hw/core/tcg-cpu-ops.h"
32
33
34 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
35 *
36 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
37 * and bit 0 set if C is set. Compare the definitions of these variables
38 * within CPUARMState.
39 */
40
41 /* For no G bits set, NZCV = C. */
42 #define PREDTEST_INIT 1
43
44 /* This is an iterative function, called for each Pd and Pg word
45 * moving forward.
46 */
47 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
48 {
49 if (likely(g)) {
50 /* Compute N from first D & G.
51 Use bit 2 to signal first G bit seen. */
52 if (!(flags & 4)) {
53 flags |= ((d & (g & -g)) != 0) << 31;
54 flags |= 4;
55 }
56
57 /* Accumulate Z from each D & G. */
58 flags |= ((d & g) != 0) << 1;
59
60 /* Compute C from last !(D & G). Replace previous. */
61 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
62 }
63 return flags;
64 }
65
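#ifdef SVE_HELPER_EXAMPLES
/*
 * Illustrative sketch only (the guard macro and function are hypothetical
 * and not part of the build): run one predicate word through
 * iter_predtest_fwd and check the NZCV layout described above.
 */
static void example_predtest_fwd(void)
{
    uint64_t pg = 0x0101010101010101ull; /* all byte elements active */
    uint64_t pd = 0x0000000000000001ull; /* only element 0 is true */
    uint32_t flags = iter_predtest_fwd(pd, pg, PREDTEST_INIT);

    assert(flags & (1u << 31)); /* N: first active element is true */
    assert(flags & 2);          /* !Z: some active element is true */
    assert(flags & 1);          /* C: last active element is false */
}
#endif
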
66 /* This is an iterative function, called for each Pd and Pg word
67 * moving backward.
68 */
69 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
70 {
71 if (likely(g)) {
72 /* Compute C from first (i.e. last) !(D & G).
73 Use bit 2 to signal first G bit seen. */
74 if (!(flags & 4)) {
75 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
76 flags |= (d & pow2floor(g)) == 0;
77 }
78
79 /* Accumulate Z from each D & G. */
80 flags |= ((d & g) != 0) << 1;
81
82 /* Compute N from last (i.e. first) D & G. Replace previous. */
83 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
84 }
85 return flags;
86 }
87
88 /* The same for a single word predicate. */
89 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
90 {
91 return iter_predtest_fwd(d, g, PREDTEST_INIT);
92 }
93
94 /* The same for a multi-word predicate. */
95 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
96 {
97 uint32_t flags = PREDTEST_INIT;
98 uint64_t *d = vd, *g = vg;
99 uintptr_t i = 0;
100
101 do {
102 flags = iter_predtest_fwd(d[i], g[i], flags);
103 } while (++i < words);
104
105 return flags;
106 }
107
108 /* Similarly for single word elements. */
109 static inline uint64_t expand_pred_s(uint8_t byte)
110 {
111 static const uint64_t word[] = {
112 [0x01] = 0x00000000ffffffffull,
113 [0x10] = 0xffffffff00000000ull,
114 [0x11] = 0xffffffffffffffffull,
115 };
116 return word[byte & 0x11];
117 }
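/*
 * For example (illustrative values): expand_pred_s(0x10) is
 * 0xffffffff00000000 and expand_pred_s(0x01) is 0x00000000ffffffff,
 * i.e. each predicate bit governing a word element is widened to a
 * 32-bit lane mask; bits other than 0 and 4 are ignored via the 0x11 mask.
 */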
118
119 #define LOGICAL_PPPP(NAME, FUNC) \
120 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
121 { \
122 uintptr_t opr_sz = simd_oprsz(desc); \
123 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
124 uintptr_t i; \
125 for (i = 0; i < opr_sz / 8; ++i) { \
126 d[i] = FUNC(n[i], m[i], g[i]); \
127 } \
128 }
129
130 #define DO_AND(N, M, G) (((N) & (M)) & (G))
131 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
132 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
133 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
134 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
135 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
136 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
137 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
138
139 LOGICAL_PPPP(sve_and_pppp, DO_AND)
140 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
141 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
142 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
143 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
144 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
145 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
146 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
147
148 #undef DO_AND
149 #undef DO_BIC
150 #undef DO_EOR
151 #undef DO_ORR
152 #undef DO_ORN
153 #undef DO_NOR
154 #undef DO_NAND
155 #undef DO_SEL
156 #undef LOGICAL_PPPP
157
158 /* Fully general three-operand expander, controlled by a predicate.
159 * This is complicated by the host-endian storage of the register file.
160 */
161 /* ??? I don't expect the compiler could ever vectorize this itself.
162 * With some tables we can convert bit masks to byte masks, and with
163 * extra care wrt byte/word ordering we could use gcc generic vectors
164 * and do 16 bytes at a time.
165 */
166 #define DO_ZPZZ(NAME, TYPE, H, OP) \
167 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
168 { \
169 intptr_t i, opr_sz = simd_oprsz(desc); \
170 for (i = 0; i < opr_sz; ) { \
171 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
172 do { \
173 if (pg & 1) { \
174 TYPE nn = *(TYPE *)(vn + H(i)); \
175 TYPE mm = *(TYPE *)(vm + H(i)); \
176 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
177 } \
178 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
179 } while (i & 15); \
180 } \
181 }
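/*
 * In the expansion above, vg holds one predicate bit per byte of the
 * vector, so the 16-bit load of pg covers one 16-byte segment; advancing
 * with "pg >>= sizeof(TYPE)" moves to the bit governing the next element,
 * and "i & 15" terminates the inner loop at the segment boundary.
 */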
182
183 /* Similarly, specialized for 64-bit operands. */
184 #define DO_ZPZZ_D(NAME, TYPE, OP) \
185 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
186 { \
187 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
188 TYPE *d = vd, *n = vn, *m = vm; \
189 uint8_t *pg = vg; \
190 for (i = 0; i < opr_sz; i += 1) { \
191 if (pg[H1(i)] & 1) { \
192 TYPE nn = n[i], mm = m[i]; \
193 d[i] = OP(nn, mm); \
194 } \
195 } \
196 }
197
198 #define DO_AND(N, M) (N & M)
199 #define DO_EOR(N, M) (N ^ M)
200 #define DO_ORR(N, M) (N | M)
201 #define DO_BIC(N, M) (N & ~M)
202 #define DO_ADD(N, M) (N + M)
203 #define DO_SUB(N, M) (N - M)
204 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
205 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
206 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
207 #define DO_MUL(N, M) (N * M)
208
209
210 /*
211 * We must avoid the C undefined behaviour cases: division by
212 * zero and signed division of INT_MIN by -1. Both of these
213 * have architecturally defined required results for Arm.
214 * We special case all signed divisions by -1 to avoid having
215 * to deduce the minimum integer for the type involved.
216 */
217 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
218 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
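
#ifdef SVE_HELPER_EXAMPLES
/*
 * Illustrative sketch only (hypothetical guard macro, not built): the
 * architecturally defined results for the cases the macros special-case.
 */
static void example_div_special_cases(void)
{
    int64_t min = INT64_MIN, zero = 0;
    uint64_t uzero = 0;

    /* Division by zero returns 0 rather than trapping. */
    assert(DO_SDIV(min, zero) == 0);
    assert(DO_UDIV((uint64_t)123, uzero) == 0);
    /*
     * INT64_MIN / -1 takes the dedicated -N branch, so N / M is never
     * evaluated for that operand pair and the quotient wraps to INT64_MIN.
     */
}
#endif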
219
220 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
221 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
222 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
223 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
224
225 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
226 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
227 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
228 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
229
230 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
231 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
232 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
233 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
234
235 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
236 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
237 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
238 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
239
240 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
241 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
242 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
243 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
244
245 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
246 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
247 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
248 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
249
250 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
251 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
252 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
253 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
254
255 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
256 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
257 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
258 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
259
260 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
261 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
262 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
263 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
264
265 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
266 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
267 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
268 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
269
270 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
271 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
272 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
273 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
274
275 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
276 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
277 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
278 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
279
280 /* Because the computation type is at least twice as large as required,
281 these work for both signed and unsigned source types. */
282 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
283 {
284 return (n * m) >> 8;
285 }
286
287 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
288 {
289 return (n * m) >> 16;
290 }
291
292 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
293 {
294 return (n * m) >> 32;
295 }
296
297 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
298 {
299 uint64_t lo, hi;
300 muls64(&lo, &hi, n, m);
301 return hi;
302 }
303
304 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
305 {
306 uint64_t lo, hi;
307 mulu64(&lo, &hi, n, m);
308 return hi;
309 }
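
#ifdef SVE_HELPER_EXAMPLES
/*
 * Illustrative sketch only (hypothetical guard macro, not built): because
 * do_mulh_b widens both operands to int32_t, the same helper produces the
 * correct high byte for signed and for unsigned inputs.
 */
static void example_mulh_b(void)
{
    /* Signed: (-1) * (-1) = 1, whose high byte is 0x00. */
    assert(do_mulh_b((int8_t)-1, (int8_t)-1) == 0x00);
    /* Unsigned: 0xff * 0xff = 0xfe01, whose high byte is 0xfe. */
    assert(do_mulh_b((uint8_t)0xff, (uint8_t)0xff) == 0xfe);
}
#endif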
310
311 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
312 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
313 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
314 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
315
316 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
317 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
318 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
319 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
320
321 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
322 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
323 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
324 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
325
326 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
327 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
328
329 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
330 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
331
332 /* Note that all bits of the shift are significant
333 and not modulo the element size. */
334 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
335 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
336 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
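
#ifdef SVE_HELPER_EXAMPLES
/* Illustrative sketch only (hypothetical guard macro, not built). */
static void example_wide_shift(void)
{
    int8_t sn = -1;
    uint8_t un = 0x80;

    /* A count >= the element width saturates or zeroes; it does not wrap. */
    assert(DO_ASR(sn, 100) == -1);
    assert(DO_LSR(un, 8) == 0);
    assert(DO_LSL(un, 8) == 0);
}
#endif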
337
338 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
339 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
340 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
341
342 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
343 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
344 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
345
346 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
347 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
348 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
349
350 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
351 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
352 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
353
354 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
355 {
356 int8_t n1 = n, n2 = n >> 8;
357 return m + n1 + n2;
358 }
359
360 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
361 {
362 int16_t n1 = n, n2 = n >> 16;
363 return m + n1 + n2;
364 }
365
366 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
367 {
368 int32_t n1 = n, n2 = n >> 32;
369 return m + n1 + n2;
370 }
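
/*
 * For example (illustrative values): do_sadalp_h(0x80ff, 100) treats the
 * halfword 0x80ff as the signed byte pair {-1, -128} and accumulates
 * their sum into the existing element: 100 + (-1) + (-128) = -29.
 */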
371
372 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
373 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
374 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
375
376 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
377 {
378 uint8_t n1 = n, n2 = n >> 8;
379 return m + n1 + n2;
380 }
381
382 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
383 {
384 uint16_t n1 = n, n2 = n >> 16;
385 return m + n1 + n2;
386 }
387
388 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
389 {
390 uint32_t n1 = n, n2 = n >> 32;
391 return m + n1 + n2;
392 }
393
394 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
395 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
396 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
397
398 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
399 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
400 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
401 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
402
403 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
404 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
405 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
406 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
407
408 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
409 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
410 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
411 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
412
413 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
414 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
415 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
416 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
417
418 /*
419 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
420 * We pass in a pointer to a dummy saturation field to trigger
421 * the saturating arithmetic but discard the information about
422 * whether it has occurred.
423 */
424 #define do_sqshl_b(n, m) \
425 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
426 #define do_sqshl_h(n, m) \
427 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
428 #define do_sqshl_s(n, m) \
429 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
430 #define do_sqshl_d(n, m) \
431 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
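
/*
 * For example (illustrative values): do_sqshl_b(100, 2) would be 400,
 * which does not fit in int8_t, so the result saturates to 127; the
 * "discard" variable soaks up the saturation flag that AdvSIMD would
 * have latched into FPSR.QC.
 */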
432
433 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
434 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
435 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
436 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
437
438 #define do_uqshl_b(n, m) \
439 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
440 #define do_uqshl_h(n, m) \
441 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
442 #define do_uqshl_s(n, m) \
443 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
444 #define do_uqshl_d(n, m) \
445 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
446
447 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
448 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
449 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
450 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
451
452 #define do_sqrshl_b(n, m) \
453 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
454 #define do_sqrshl_h(n, m) \
455 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
456 #define do_sqrshl_s(n, m) \
457 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
458 #define do_sqrshl_d(n, m) \
459 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
460
461 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
462 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
463 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
464 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
465
466 #undef do_sqrshl_d
467
468 #define do_uqrshl_b(n, m) \
469 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
470 #define do_uqrshl_h(n, m) \
471 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
472 #define do_uqrshl_s(n, m) \
473 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
474 #define do_uqrshl_d(n, m) \
475 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
476
477 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
478 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
479 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
480 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
481
482 #undef do_uqrshl_d
483
484 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
485 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
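
#ifdef SVE_HELPER_EXAMPLES
/*
 * Illustrative sketch only (hypothetical guard macro, not built): the
 * 64-bit form has no wider type to compute in, so it halves each operand
 * first and then restores the carry of the two discarded low bits.
 */
static void example_hadd_d(void)
{
    assert(DO_HADD_D(7ull, 5ull) == 6);
    assert(DO_HADD_D(UINT64_MAX, UINT64_MAX) == UINT64_MAX);
}
#endif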
486
487 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
488 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
489 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
490 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
491
492 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
493 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
494 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
495 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
496
497 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
498 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
499
500 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
501 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
502 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
503 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
504
505 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
506 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
507 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
508 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
509
510 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
511 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
512
513 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
514 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
515 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
516 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
517
518 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
519 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
520 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
521 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
522
523 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
524 {
525 return val >= max ? max : val <= min ? min : val;
526 }
527
528 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
529 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
530 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
531
532 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
533 {
534 int64_t r = n + m;
535 if (((r ^ n) & ~(n ^ m)) < 0) {
536 /* Signed overflow. */
537 return r < 0 ? INT64_MAX : INT64_MIN;
538 }
539 return r;
540 }
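
/*
 * The overflow test above reads as: the operands have the same sign
 * (~(n ^ m) has the sign bit set) while the result's sign differs from
 * n's ((r ^ n) is negative), which is exactly the signed-overflow
 * condition for addition.
 */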
541
542 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
543 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
544 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
545 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
546
547 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
548 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
549 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
550
551 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
552 {
553 uint64_t r = n + m;
554 return r < n ? UINT64_MAX : r;
555 }
556
557 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
558 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
559 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
560 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
561
562 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
563 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
564 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
565
566 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
567 {
568 int64_t r = n - m;
569 if (((r ^ n) & (n ^ m)) < 0) {
570 /* Signed overflow. */
571 return r < 0 ? INT64_MAX : INT64_MIN;
572 }
573 return r;
574 }
575
576 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
577 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
578 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
579 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
580
581 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
582 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
583 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
584
585 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
586 {
587 return n > m ? n - m : 0;
588 }
589
590 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
591 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
592 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
593 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
594
595 #define DO_SUQADD_B(n, m) \
596 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
597 #define DO_SUQADD_H(n, m) \
598 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
599 #define DO_SUQADD_S(n, m) \
600 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
601
602 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
603 {
604 uint64_t r = n + m;
605
606 if (n < 0) {
607 /* Note that m - abs(n) cannot underflow. */
608 if (r > INT64_MAX) {
609 /* Result is either very large positive or negative. */
610 if (m > -n) {
611 /* m > abs(n), so r is a very large positive. */
612 return INT64_MAX;
613 }
614 /* Result is negative. */
615 }
616 } else {
617 /* Both inputs are positive: check for overflow. */
618 if (r < m || r > INT64_MAX) {
619 return INT64_MAX;
620 }
621 }
622 return r;
623 }
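
/*
 * Two illustrative cases: do_suqadd_d(-2, 1) gives r = 0xffff...ffff,
 * which is > INT64_MAX, but m > -n is false, so the negative result -1
 * is returned as is; do_suqadd_d(-1, UINT64_MAX) also exceeds INT64_MAX
 * with m > -n true, so it saturates to INT64_MAX.
 */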
624
625 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
626 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
627 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
628 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
629
630 #define DO_USQADD_B(n, m) \
631 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
632 #define DO_USQADD_H(n, m) \
633 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
634 #define DO_USQADD_S(n, m) \
635 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
636
637 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
638 {
639 uint64_t r = n + m;
640
641 if (m < 0) {
642 return n < -m ? 0 : r;
643 }
644 return r < n ? UINT64_MAX : r;
645 }
646
647 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
648 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
649 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
650 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
651
652 #undef DO_ZPZZ
653 #undef DO_ZPZZ_D
654
655 /*
656 * Three operand expander, operating on element pairs.
657 * If the slot I is even, the elements come from VN {I, I+1}.
658 * If the slot I is odd, the elements come from VM {I-1, I}.
659 * Load all of the input elements in each pair before overwriting output.
660 */
661 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
662 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
663 { \
664 intptr_t i, opr_sz = simd_oprsz(desc); \
665 for (i = 0; i < opr_sz; ) { \
666 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
667 do { \
668 TYPE n0 = *(TYPE *)(vn + H(i)); \
669 TYPE m0 = *(TYPE *)(vm + H(i)); \
670 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
671 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
672 if (pg & 1) { \
673 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
674 } \
675 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
676 if (pg & 1) { \
677 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
678 } \
679 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
680 } while (i & 15); \
681 } \
682 }
683
684 /* Similarly, specialized for 64-bit operands. */
685 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
686 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
687 { \
688 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
689 TYPE *d = vd, *n = vn, *m = vm; \
690 uint8_t *pg = vg; \
691 for (i = 0; i < opr_sz; i += 2) { \
692 TYPE n0 = n[i], n1 = n[i + 1]; \
693 TYPE m0 = m[i], m1 = m[i + 1]; \
694 if (pg[H1(i)] & 1) { \
695 d[i] = OP(n0, n1); \
696 } \
697 if (pg[H1(i + 1)] & 1) { \
698 d[i + 1] = OP(m0, m1); \
699 } \
700 } \
701 }
702
703 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
704 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
705 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
706 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
707
708 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
709 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
710 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
711 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
712
713 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
714 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
715 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
716 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
717
718 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
719 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
720 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
721 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
722
723 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
724 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
725 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
726 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
727
728 #undef DO_ZPZZ_PAIR
729 #undef DO_ZPZZ_PAIR_D
730
731 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
732 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
733 void *status, uint32_t desc) \
734 { \
735 intptr_t i, opr_sz = simd_oprsz(desc); \
736 for (i = 0; i < opr_sz; ) { \
737 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
738 do { \
739 TYPE n0 = *(TYPE *)(vn + H(i)); \
740 TYPE m0 = *(TYPE *)(vm + H(i)); \
741 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
742 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
743 if (pg & 1) { \
744 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
745 } \
746 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
747 if (pg & 1) { \
748 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
749 } \
750 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
751 } while (i & 15); \
752 } \
753 }
754
755 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
756 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
757 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
758
759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
760 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
761 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
762
763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
764 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
765 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
766
767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
768 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
769 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
770
771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
772 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
773 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
774
775 #undef DO_ZPZZ_PAIR_FP
776
777 /* Three-operand expander, controlled by a predicate, in which the
778 * third operand is "wide". That is, for D = N op M, the same 64-bit
779 * value of M is used with all of the narrower values of N.
780 */
781 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
782 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
783 { \
784 intptr_t i, opr_sz = simd_oprsz(desc); \
785 for (i = 0; i < opr_sz; ) { \
786 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
787 TYPEW mm = *(TYPEW *)(vm + i); \
788 do { \
789 if (pg & 1) { \
790 TYPE nn = *(TYPE *)(vn + H(i)); \
791 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
792 } \
793 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
794 } while (i & 7); \
795 } \
796 }
797
798 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
799 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
800 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
801
802 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
803 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
804 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
805
806 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
807 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
808 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
809
810 #undef DO_ZPZW
811
812 /* Fully general two-operand expander, controlled by a predicate.
813 */
814 #define DO_ZPZ(NAME, TYPE, H, OP) \
815 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
816 { \
817 intptr_t i, opr_sz = simd_oprsz(desc); \
818 for (i = 0; i < opr_sz; ) { \
819 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
820 do { \
821 if (pg & 1) { \
822 TYPE nn = *(TYPE *)(vn + H(i)); \
823 *(TYPE *)(vd + H(i)) = OP(nn); \
824 } \
825 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
826 } while (i & 15); \
827 } \
828 }
829
830 /* Similarly, specialized for 64-bit operands. */
831 #define DO_ZPZ_D(NAME, TYPE, OP) \
832 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
833 { \
834 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
835 TYPE *d = vd, *n = vn; \
836 uint8_t *pg = vg; \
837 for (i = 0; i < opr_sz; i += 1) { \
838 if (pg[H1(i)] & 1) { \
839 TYPE nn = n[i]; \
840 d[i] = OP(nn); \
841 } \
842 } \
843 }
844
845 #define DO_CLS_B(N) (clrsb32(N) - 24)
846 #define DO_CLS_H(N) (clrsb32(N) - 16)
847
848 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
849 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
850 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
851 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
852
853 #define DO_CLZ_B(N) (clz32(N) - 24)
854 #define DO_CLZ_H(N) (clz32(N) - 16)
855
856 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
857 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
858 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
859 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
860
861 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
862 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
863 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
864 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
865
866 #define DO_CNOT(N) (N == 0)
867
868 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
869 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
870 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
871 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
872
873 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
874
875 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
876 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
877 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
878
879 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
880
881 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
882 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
883 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
884
885 #define DO_NOT(N) (~N)
886
887 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
888 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
889 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
890 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
891
892 #define DO_SXTB(N) ((int8_t)N)
893 #define DO_SXTH(N) ((int16_t)N)
894 #define DO_SXTS(N) ((int32_t)N)
895 #define DO_UXTB(N) ((uint8_t)N)
896 #define DO_UXTH(N) ((uint16_t)N)
897 #define DO_UXTS(N) ((uint32_t)N)
898
899 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
900 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
901 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
902 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
903 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
904 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
905
906 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
907 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
908 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
909 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
910 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
911 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
912
913 #define DO_ABS(N) (N < 0 ? -N : N)
914
915 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
916 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
917 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
918 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
919
920 #define DO_NEG(N) (-N)
921
922 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
923 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
924 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
925 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
926
927 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
928 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
929 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
930
931 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
932 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
933
934 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
935
936 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
937 {
938 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
939 uint64_t *d = vd, *n = vn;
940 uint8_t *pg = vg;
941
942 for (i = 0; i < opr_sz; i += 2) {
943 if (pg[H1(i)] & 1) {
944 uint64_t n0 = n[i + 0];
945 uint64_t n1 = n[i + 1];
946 d[i + 0] = n1;
947 d[i + 1] = n0;
948 }
949 }
950 }
951
952 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
953 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
954 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
955 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
956
957 #define DO_SQABS(X) \
958 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
959 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
960
961 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
962 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
963 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
964 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
965
966 #define DO_SQNEG(X) \
967 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
968 x_ == min_ ? -min_ - 1 : -x_; })
969
970 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
971 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
972 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
973 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
974
975 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
976 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
977
978 /* Three-operand expander, unpredicated, in which the third operand is "wide".
979 */
980 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
981 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
982 { \
983 intptr_t i, opr_sz = simd_oprsz(desc); \
984 for (i = 0; i < opr_sz; ) { \
985 TYPEW mm = *(TYPEW *)(vm + i); \
986 do { \
987 TYPE nn = *(TYPE *)(vn + H(i)); \
988 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
989 i += sizeof(TYPE); \
990 } while (i & 7); \
991 } \
992 }
993
994 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
995 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
996 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
997
998 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
999 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1000 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1001
1002 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1003 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1004 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1005
1006 #undef DO_ZZW
1007
1008 #undef DO_CLS_B
1009 #undef DO_CLS_H
1010 #undef DO_CLZ_B
1011 #undef DO_CLZ_H
1012 #undef DO_CNOT
1013 #undef DO_FABS
1014 #undef DO_FNEG
1015 #undef DO_ABS
1016 #undef DO_NEG
1017 #undef DO_ZPZ
1018 #undef DO_ZPZ_D
1019
1020 /*
1021 * Three-operand expander, unpredicated, in which the two inputs are
1022 * selected from the top or bottom half of the wide column.
1023 */
1024 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1025 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1026 { \
1027 intptr_t i, opr_sz = simd_oprsz(desc); \
1028 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1029 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1030 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1031 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1032 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1033 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1034 } \
1035 }
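
/*
 * In the expansion above, sel1 and sel2 are either 0 or sizeof(TYPEN),
 * i.e. they pick the even (bottom) or odd (top) narrow element inside
 * each wide column before it is widened to TYPEW.
 */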
1036
1037 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1038 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1039 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1040
1041 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1042 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1043 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1044
1045 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1046 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1047 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1048
1049 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1050 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1051 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1052
1053 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1054 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1055 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1056
1057 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1058 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1059 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1060
1061 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1062 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1063 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1064
1065 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1066 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1067 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1068
1069 /* Note that the multiply cannot overflow, but the doubling can. */
1070 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1071 {
1072 int16_t val = n * m;
1073 return DO_SQADD_H(val, val);
1074 }
1075
1076 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1077 {
1078 int32_t val = n * m;
1079 return DO_SQADD_S(val, val);
1080 }
1081
1082 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1083 {
1084 int64_t val = n * m;
1085 return do_sqadd_d(val, val);
1086 }
1087
1088 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1089 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1090 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1091
1092 #undef DO_ZZZ_TB
1093
1094 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1095 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1096 { \
1097 intptr_t i, opr_sz = simd_oprsz(desc); \
1098 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1099 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1100 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1101 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1102 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1103 } \
1104 }
1105
1106 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1107 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1108 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1109
1110 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1111 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1112 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1113
1114 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1115 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1116 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1117
1118 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1119 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1120 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1121
1122 #undef DO_ZZZ_WTB
1123
1124 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1125 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1126 { \
1127 intptr_t i, opr_sz = simd_oprsz(desc); \
1128 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1129 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1130 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1131 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1132 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1133 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1134 } \
1135 }
1136
1137 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1138 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1139 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1140 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1141
1142 #undef DO_ZZZ_NTB
1143
1144 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1145 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1146 { \
1147 intptr_t i, opr_sz = simd_oprsz(desc); \
1148 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1149 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1150 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1151 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1152 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1153 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1154 } \
1155 }
1156
1157 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1158 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1159 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1160
1161 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1162 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1163 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1164
1165 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1166 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1167 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1168
1169 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1170 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1171 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1172
1173 #define DO_NMUL(N, M) -(N * M)
1174
1175 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1176 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1177 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1178
1179 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1180 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1181 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1182
1183 #undef DO_ZZZW_ACC
1184
1185 #define DO_XTNB(NAME, TYPE, OP) \
1186 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1187 { \
1188 intptr_t i, opr_sz = simd_oprsz(desc); \
1189 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1190 TYPE nn = *(TYPE *)(vn + i); \
1191 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1192 *(TYPE *)(vd + i) = nn; \
1193 } \
1194 }
1195
1196 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1197 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1198 { \
1199 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1200 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1201 TYPE nn = *(TYPE *)(vn + i); \
1202 *(TYPEN *)(vd + i + odd) = OP(nn); \
1203 } \
1204 }
1205
1206 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1207 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1208 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1209
1210 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1211 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1212 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1213
1214 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1215 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1216 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1217
1218 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1219 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1220 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1221
1222 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1223 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1224 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1225
1226 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1227 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1228 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1229
1230 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1231 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1232 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1233
1234 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1235 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1236 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1237
1238 #undef DO_XTNB
1239 #undef DO_XTNT
1240
1241 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1242 {
1243 intptr_t i, opr_sz = simd_oprsz(desc);
1244 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1245 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1246 uint32_t *a = va, *n = vn;
1247 uint64_t *d = vd, *m = vm;
1248
1249 for (i = 0; i < opr_sz / 8; ++i) {
1250 uint32_t e1 = a[2 * i + H4(0)];
1251 uint32_t e2 = n[2 * i + sel] ^ inv;
1252 uint64_t c = extract64(m[i], 32, 1);
1253 /* Compute and store the entire 33-bit result at once. */
1254 d[i] = c + e1 + e2;
1255 }
1256 }
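
/*
 * Since d[] is written 64 bits at a time, the 32-bit sum lands in the
 * even element and the carry-out appears as bit 0 of the odd element,
 * which is exactly where the next ADCLB/ADCLT reads its carry-in
 * (extract64(m[i], 32, 1) above).
 */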
1257
1258 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1259 {
1260 intptr_t i, opr_sz = simd_oprsz(desc);
1261 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1262 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1263 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1264
1265 for (i = 0; i < opr_sz / 8; i += 2) {
1266 Int128 e1 = int128_make64(a[i]);
1267 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1268 Int128 c = int128_make64(m[i + 1] & 1);
1269 Int128 r = int128_add(int128_add(e1, e2), c);
1270 d[i + 0] = int128_getlo(r);
1271 d[i + 1] = int128_gethi(r);
1272 }
1273 }
1274
1275 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1276 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1277 { \
1278 intptr_t i, opr_sz = simd_oprsz(desc); \
1279 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1280 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1281 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1282 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1283 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1284 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1285 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1286 } \
1287 }
1288
1289 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1290 do_sqdmull_h, DO_SQADD_H)
1291 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1292 do_sqdmull_s, DO_SQADD_S)
1293 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1294 do_sqdmull_d, do_sqadd_d)
1295
1296 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1297 do_sqdmull_h, DO_SQSUB_H)
1298 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1299 do_sqdmull_s, DO_SQSUB_S)
1300 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1301 do_sqdmull_d, do_sqsub_d)
1302
1303 #undef DO_SQDMLAL
1304
1305 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1306 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1307 { \
1308 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1309 int rot = simd_data(desc); \
1310 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1311 bool sub_r = rot == 1 || rot == 2; \
1312 bool sub_i = rot >= 2; \
1313 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1314 for (i = 0; i < opr_sz; i += 2) { \
1315 TYPE elt1_a = n[H(i + sel_a)]; \
1316 TYPE elt2_a = m[H(i + sel_a)]; \
1317 TYPE elt2_b = m[H(i + sel_b)]; \
1318 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1319 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1320 } \
1321 }
1322
1323 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
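
/*
 * Here rot encodes the 0/90/180/270 degree rotations of the usual
 * complex multiply-add: sel_a selects the real (rot even) or imaginary
 * (rot odd) halves of the inputs, while sub_r and sub_i negate the
 * product accumulated into the real and imaginary lanes respectively.
 */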
1324
1325 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1326 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1327 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1328 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1329
1330 #define DO_SQRDMLAH_B(N, M, A, S) \
1331 do_sqrdmlah_b(N, M, A, S, true)
1332 #define DO_SQRDMLAH_H(N, M, A, S) \
1333 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1334 #define DO_SQRDMLAH_S(N, M, A, S) \
1335 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1336 #define DO_SQRDMLAH_D(N, M, A, S) \
1337 do_sqrdmlah_d(N, M, A, S, true)
1338
1339 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1340 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1341 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1342 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1343
1344 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1345 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1346 { \
1347 intptr_t i, j, oprsz = simd_oprsz(desc); \
1348 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1349 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1350 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1351 bool sub_r = rot == 1 || rot == 2; \
1352 bool sub_i = rot >= 2; \
1353 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1354 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1355 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1356 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1357 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1358 TYPE elt1_a = n[H(i + j + sel_a)]; \
1359 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1360 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1361 } \
1362 } \
1363 }
1364
1365 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1366 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1367
1368 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1369 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1370
1371 #undef DO_CMLA
1372 #undef DO_CMLA_FUNC
1373 #undef DO_CMLA_IDX_FUNC
1374 #undef DO_SQRDMLAH_B
1375 #undef DO_SQRDMLAH_H
1376 #undef DO_SQRDMLAH_S
1377 #undef DO_SQRDMLAH_D
1378
1379 /* Note N and M are 4 elements bundled into one unit. */
1380 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1381 int sel_a, int sel_b, int sub_i)
1382 {
1383 for (int i = 0; i <= 1; i++) {
1384 int32_t elt1_r = (int8_t)(n >> (16 * i));
1385 int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1386 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1387 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1388
1389 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1390 }
1391 return a;
1392 }
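
/*
 * For example (illustrative): with sel_a = 0, sel_b = 1 and sub_i = -1,
 * each 16-bit chunk of N is treated as the complex value (r, i) and each
 * chunk of M as (a, b), and the loop accumulates r*a - i*b, i.e. the real
 * part of the complex product, into the 32-bit sum.
 */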
1393
1394 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1395 int sel_a, int sel_b, int sub_i)
1396 {
1397 for (int i = 0; i <= 1; i++) {
1398 int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1399 int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1400 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1401 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1402
1403 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1404 }
1405 return a;
1406 }
1407
1408 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1409 void *va, uint32_t desc)
1410 {
1411 int opr_sz = simd_oprsz(desc);
1412 int rot = simd_data(desc);
1413 int sel_a = rot & 1;
1414 int sel_b = sel_a ^ 1;
1415 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1416 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1417
1418 for (int e = 0; e < opr_sz / 4; e++) {
1419 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1420 }
1421 }
1422
1423 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1424 void *va, uint32_t desc)
1425 {
1426 int opr_sz = simd_oprsz(desc);
1427 int rot = simd_data(desc);
1428 int sel_a = rot & 1;
1429 int sel_b = sel_a ^ 1;
1430 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1431 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1432
1433 for (int e = 0; e < opr_sz / 8; e++) {
1434 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1435 }
1436 }
1437
1438 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1439 void *va, uint32_t desc)
1440 {
1441 int opr_sz = simd_oprsz(desc);
1442 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1443 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1444 int sel_a = rot & 1;
1445 int sel_b = sel_a ^ 1;
1446 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1447 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1448
1449 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1450 uint32_t seg_m = m[seg + idx];
1451 for (int e = 0; e < 4; e++) {
1452 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1453 sel_a, sel_b, sub_i);
1454 }
1455 }
1456 }
1457
1458 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1459 void *va, uint32_t desc)
1460 {
1461 int seg, opr_sz = simd_oprsz(desc);
1462 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1463 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1464 int sel_a = rot & 1;
1465 int sel_b = sel_a ^ 1;
1466 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1467 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1468
1469 for (seg = 0; seg < opr_sz / 8; seg += 2) {
1470 uint64_t seg_m = m[seg + idx];
1471 for (int e = 0; e < 2; e++) {
1472 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1473 sel_a, sel_b, sub_i);
1474 }
1475 }
1476 }
1477
1478 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1479 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1480 { \
1481 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1482 intptr_t i, j, idx = simd_data(desc); \
1483 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1484 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1485 TYPE mm = m[i]; \
1486 for (j = 0; j < segment; j++) { \
1487 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1488 } \
1489 } \
1490 }
1491
1492 #define DO_SQRDMLAH_H(N, M, A) \
1493 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1494 #define DO_SQRDMLAH_S(N, M, A) \
1495 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1496 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1497
1498 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1499 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1500 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1501
1502 #define DO_SQRDMLSH_H(N, M, A) \
1503 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1504 #define DO_SQRDMLSH_S(N, M, A) \
1505 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1506 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1507
1508 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1509 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1510 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1511
1512 #undef DO_ZZXZ
1513
1514 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1515 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1516 { \
1517 intptr_t i, j, oprsz = simd_oprsz(desc); \
1518 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1519 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1520 for (i = 0; i < oprsz; i += 16) { \
1521 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1522 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1523 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1524 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1525 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1526 } \
1527 } \
1528 }
1529
1530 #define DO_MLA(N, M, A) (A + N * M)
1531
1532 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1533 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1534 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1535 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1536
1537 #define DO_MLS(N, M, A) (A - N * M)
1538
1539 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1540 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1541 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1542 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1543
1544 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1545 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1546
1547 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1548 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1549
1550 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1551 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1552
1553 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1554 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1555
1556 #undef DO_MLA
1557 #undef DO_MLS
1558 #undef DO_ZZXW
1559
1560 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1561 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1562 { \
1563 intptr_t i, j, oprsz = simd_oprsz(desc); \
1564 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1565 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1566 for (i = 0; i < oprsz; i += 16) { \
1567 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1568 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1569 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1570 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1571 } \
1572 } \
1573 }
1574
1575 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1576 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1577
1578 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1579 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1580
1581 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1582 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1583
1584 #undef DO_ZZX
1585
1586 #define DO_BITPERM(NAME, TYPE, OP) \
1587 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1588 { \
1589 intptr_t i, opr_sz = simd_oprsz(desc); \
1590 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1591 TYPE nn = *(TYPE *)(vn + i); \
1592 TYPE mm = *(TYPE *)(vm + i); \
1593 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1594 } \
1595 }
1596
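/*
 * BEXT: gather the data bits selected by the mask into contiguous low bits.
 * Example: bitextract(0xb2, 0x6c, 8) == 0x04 -- mask bits 2,3,5,6 select
 * data bit values 0,0,1,0, packed from bit 0 upward, giving 0b0100.
 */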
1597 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1598 {
1599 uint64_t res = 0;
1600 int db, rb = 0;
1601
1602 for (db = 0; db < n; ++db) {
1603 if ((mask >> db) & 1) {
1604 res |= ((data >> db) & 1) << rb;
1605 ++rb;
1606 }
1607 }
1608 return res;
1609 }
1610
1611 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1612 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1613 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1614 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1615
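/*
 * BDEP: scatter the low data bits to the bit positions selected by the mask.
 * Example: bitdeposit(0x04, 0x6c, 8) == 0x20 -- data bits 0,0,1,0 land in
 * mask positions 2,3,5,6, so only bit 5 is set.
 */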
1616 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1617 {
1618 uint64_t res = 0;
1619 int rb, db = 0;
1620
1621 for (rb = 0; rb < n; ++rb) {
1622 if ((mask >> rb) & 1) {
1623 res |= ((data >> db) & 1) << rb;
1624 ++db;
1625 }
1626 }
1627 return res;
1628 }
1629
1630 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1631 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1632 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1633 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1634
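/*
 * BGRP: pack the data bits selected by the mask into the low part of the
 * result and the remaining data bits above them.
 * Example: bitgroup(0xb2, 0x6c, 8) == 0xe4 -- the selected bits give 0b0100,
 * the unselected bits give 0b1110 placed above them.
 */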
1635 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1636 {
1637 uint64_t resm = 0, resu = 0;
1638 int db, rbm = 0, rbu = 0;
1639
1640 for (db = 0; db < n; ++db) {
1641 uint64_t val = (data >> db) & 1;
1642 if ((mask >> db) & 1) {
1643 resm |= val << rbm++;
1644 } else {
1645 resu |= val << rbu++;
1646 }
1647 }
1648
1649 return resm | (resu << rbm);
1650 }
1651
1652 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1653 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1654 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1655 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1656
1657 #undef DO_BITPERM
1658
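/*
 * Complex add: the second operand is rotated by 90 or 270 degrees in the
 * complex plane before the add.  With sub_r clear the rotation is 90
 * (d_r = n_r - m_i, d_i = n_i + m_r); with sub_r set it is 270
 * (d_r = n_r + m_i, d_i = n_i - m_r).
 */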
1659 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1660 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1661 { \
1662 intptr_t i, opr_sz = simd_oprsz(desc); \
1663 int sub_r = simd_data(desc); \
1664 if (sub_r) { \
1665 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1666 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1667 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1668 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1669 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1670 acc_r = ADD_OP(acc_r, el2_i); \
1671 acc_i = SUB_OP(acc_i, el2_r); \
1672 *(TYPE *)(vd + H(i)) = acc_r; \
1673 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1674 } \
1675 } else { \
1676 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1677 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1678 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1679 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1680 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1681 acc_r = SUB_OP(acc_r, el2_i); \
1682 acc_i = ADD_OP(acc_i, el2_r); \
1683 *(TYPE *)(vd + H(i)) = acc_r; \
1684 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1685 } \
1686 } \
1687 }
1688
1689 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1690 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1691 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1692 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1693
1694 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1695 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1696 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1697 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1698
1699 #undef DO_CADD
1700
1701 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1702 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1703 { \
1704 intptr_t i, opr_sz = simd_oprsz(desc); \
1705 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1706 int shift = simd_data(desc) >> 1; \
1707 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1708 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1709 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1710 } \
1711 }
1712
1713 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1714 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1715 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1716
1717 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1718 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1719 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1720
1721 #undef DO_ZZI_SHLL
1722
1723 /* Two-operand reduction expander, controlled by a predicate.
1724 * The difference between TYPERED and TYPERET has to do with
1725 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1726 * but TYPERET must be unsigned so that e.g. a 32-bit value
1727 * is not sign-extended to the ABI uint64_t return type.
1728 */
1729 /* ??? If we were to vectorize this by hand the reduction ordering
1730 * would change. For integer operands, this is perfectly fine.
1731 */
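/*
 * The governing predicate has one bit per byte of the vector, so the inner
 * loop consumes 16 predicate bits per 16-byte chunk, stepping the predicate
 * right by sizeof(TYPEELT) for each element.
 */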
1732 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1733 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1734 { \
1735 intptr_t i, opr_sz = simd_oprsz(desc); \
1736 TYPERED ret = INIT; \
1737 for (i = 0; i < opr_sz; ) { \
1738 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1739 do { \
1740 if (pg & 1) { \
1741 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1742 ret = OP(ret, nn); \
1743 } \
1744 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1745 } while (i & 15); \
1746 } \
1747 return (TYPERET)ret; \
1748 }
1749
1750 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1751 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1752 { \
1753 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1754 TYPEE *n = vn; \
1755 uint8_t *pg = vg; \
1756 TYPER ret = INIT; \
1757 for (i = 0; i < opr_sz; i += 1) { \
1758 if (pg[H1(i)] & 1) { \
1759 TYPEE nn = n[i]; \
1760 ret = OP(ret, nn); \
1761 } \
1762 } \
1763 return ret; \
1764 }
1765
1766 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1767 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1768 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1769 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1770
1771 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1772 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1773 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1774 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1775
1776 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1777 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1778 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1779 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1780
1781 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1782 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1783 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1784
1785 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1786 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1787 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1788 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1789
1790 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1791 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1792 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1793 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1794
1795 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1796 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1797 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1798 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1799
1800 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1801 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1802 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1803 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1804
1805 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1806 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1807 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1808 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1809
1810 #undef DO_VPZ
1811 #undef DO_VPZ_D
1812
1813 /* Two vector operand, one scalar operand, unpredicated. */
1814 #define DO_ZZI(NAME, TYPE, OP) \
1815 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1816 { \
1817 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1818 TYPE s = s64, *d = vd, *n = vn; \
1819 for (i = 0; i < opr_sz; ++i) { \
1820 d[i] = OP(n[i], s); \
1821 } \
1822 }
1823
1824 #define DO_SUBR(X, Y) (Y - X)
1825
1826 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1827 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1828 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1829 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1830
1831 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1832 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1833 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1834 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1835
1836 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1837 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1838 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1839 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1840
1841 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1842 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1843 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1844 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1845
1846 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1847 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1848 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1849 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1850
1851 #undef DO_ZZI
1852
1853 #undef DO_AND
1854 #undef DO_ORR
1855 #undef DO_EOR
1856 #undef DO_BIC
1857 #undef DO_ADD
1858 #undef DO_SUB
1859 #undef DO_MAX
1860 #undef DO_MIN
1861 #undef DO_ABD
1862 #undef DO_MUL
1863 #undef DO_DIV
1864 #undef DO_ASR
1865 #undef DO_LSR
1866 #undef DO_LSL
1867 #undef DO_SUBR
1868
1869 /* Similar to the ARM LastActiveElement pseudocode function, except the
1870 result is multiplied by the element size. This includes the not found
1871 indication; e.g. not found for esz=3 is -8. */
1872 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1873 {
1874 uint64_t mask = pred_esz_masks[esz];
1875 intptr_t i = words;
1876
1877 do {
1878 uint64_t this_g = g[--i] & mask;
1879 if (this_g) {
1880 return i * 64 + (63 - clz64(this_g));
1881 }
1882 } while (i > 0);
1883 return (intptr_t)-1 << esz;
1884 }
1885
1886 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1887 {
1888 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1889 uint32_t flags = PREDTEST_INIT;
1890 uint64_t *d = vd, *g = vg;
1891 intptr_t i = 0;
1892
1893 do {
1894 uint64_t this_d = d[i];
1895 uint64_t this_g = g[i];
1896
1897 if (this_g) {
1898 if (!(flags & 4)) {
1899 /* Set in D the first bit of G. */
1900 this_d |= this_g & -this_g;
1901 d[i] = this_d;
1902 }
1903 flags = iter_predtest_fwd(this_d, this_g, flags);
1904 }
1905 } while (++i < words);
1906
1907 return flags;
1908 }
1909
1910 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1911 {
1912 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1913 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1914 uint32_t flags = PREDTEST_INIT;
1915 uint64_t *d = vd, *g = vg, esz_mask;
1916 intptr_t i, next;
1917
1918 next = last_active_element(vd, words, esz) + (1 << esz);
1919 esz_mask = pred_esz_masks[esz];
1920
1921 /* Similar to the pseudocode for pnext, but scaled by ESZ
1922 so that we find the correct bit. */
1923 if (next < words * 64) {
1924 uint64_t mask = -1;
1925
1926 if (next & 63) {
1927 mask = ~((1ull << (next & 63)) - 1);
1928 next &= -64;
1929 }
1930 do {
1931 uint64_t this_g = g[next / 64] & esz_mask & mask;
1932 if (this_g != 0) {
1933 next = (next & -64) + ctz64(this_g);
1934 break;
1935 }
1936 next += 64;
1937 mask = -1;
1938 } while (next < words * 64);
1939 }
1940
1941 i = 0;
1942 do {
1943 uint64_t this_d = 0;
1944 if (i == next / 64) {
1945 this_d = 1ull << (next & 63);
1946 }
1947 d[i] = this_d;
1948 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1949 } while (++i < words);
1950
1951 return flags;
1952 }
1953
1954 /*
1955 * Copy Zn into Zd, and store zero into inactive elements.
1956 * If inv, store zeros into the active elements.
1957 */
1958 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1959 {
1960 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1961 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1962 uint64_t *d = vd, *n = vn;
1963 uint8_t *pg = vg;
1964
1965 for (i = 0; i < opr_sz; i += 1) {
1966 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1967 }
1968 }
1969
1970 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1971 {
1972 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1973 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1974 uint64_t *d = vd, *n = vn;
1975 uint8_t *pg = vg;
1976
1977 for (i = 0; i < opr_sz; i += 1) {
1978 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1979 }
1980 }
1981
1982 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1983 {
1984 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1985 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1986 uint64_t *d = vd, *n = vn;
1987 uint8_t *pg = vg;
1988
1989 for (i = 0; i < opr_sz; i += 1) {
1990 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1991 }
1992 }
1993
1994 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1995 {
1996 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1997 uint64_t *d = vd, *n = vn;
1998 uint8_t *pg = vg;
1999 uint8_t inv = simd_data(desc);
2000
2001 for (i = 0; i < opr_sz; i += 1) {
2002 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2003 }
2004 }
2005
2006 /* Three-operand expander, immediate operand, controlled by a predicate.
2007 */
2008 #define DO_ZPZI(NAME, TYPE, H, OP) \
2009 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2010 { \
2011 intptr_t i, opr_sz = simd_oprsz(desc); \
2012 TYPE imm = simd_data(desc); \
2013 for (i = 0; i < opr_sz; ) { \
2014 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2015 do { \
2016 if (pg & 1) { \
2017 TYPE nn = *(TYPE *)(vn + H(i)); \
2018 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
2019 } \
2020 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2021 } while (i & 15); \
2022 } \
2023 }
2024
2025 /* Similarly, specialized for 64-bit operands. */
2026 #define DO_ZPZI_D(NAME, TYPE, OP) \
2027 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2028 { \
2029 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2030 TYPE *d = vd, *n = vn; \
2031 TYPE imm = simd_data(desc); \
2032 uint8_t *pg = vg; \
2033 for (i = 0; i < opr_sz; i += 1) { \
2034 if (pg[H1(i)] & 1) { \
2035 TYPE nn = n[i]; \
2036 d[i] = OP(nn, imm); \
2037 } \
2038 } \
2039 }
2040
2041 #define DO_SHR(N, M) (N >> M)
2042 #define DO_SHL(N, M) (N << M)
2043
2044 /* Arithmetic shift right for division. This rounds negative numbers
2045 toward zero as per signed division. Therefore before shifting,
2046 when N is negative, add 2**M-1. */
2047 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
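/* Example: DO_ASRD(-7, 2) == (-7 + 3) >> 2 == -1, i.e. -7 / 4 truncated. */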
2048
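/*
 * Rounding right shift: (x >> sh) + ((x >> (sh - 1)) & 1) equals
 * (x + (1 << (sh - 1))) >> sh, but without overflowing the intermediate sum.
 */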
2049 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2050 {
2051 if (likely(sh < 64)) {
2052 return (x >> sh) + ((x >> (sh - 1)) & 1);
2053 } else if (sh == 64) {
2054 return x >> 63;
2055 } else {
2056 return 0;
2057 }
2058 }
2059
2060 static inline int64_t do_srshr(int64_t x, unsigned sh)
2061 {
2062 if (likely(sh < 64)) {
2063 return (x >> sh) + ((x >> (sh - 1)) & 1);
2064 } else {
2065 /* Rounding the sign bit always produces 0. */
2066 return 0;
2067 }
2068 }
2069
2070 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2071 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2072 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2073 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2074
2075 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2076 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2077 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2078 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2079
2080 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2081 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2082 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2083 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2084
2085 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2086 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2087 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2088 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2089
2090 /* SVE2 bitwise shift by immediate */
2091 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2092 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2093 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2094 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2095
2096 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2097 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2098 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2099 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2100
2101 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2102 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2103 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2104 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2105
2106 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2107 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2108 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2109 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2110
2111 #define do_suqrshl_b(n, m) \
2112 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2113 #define do_suqrshl_h(n, m) \
2114 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2115 #define do_suqrshl_s(n, m) \
2116 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2117 #define do_suqrshl_d(n, m) \
2118 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2119
2120 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2121 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2122 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2123 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2124
2125 #undef DO_ASRD
2126 #undef DO_ZPZI
2127 #undef DO_ZPZI_D
2128
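/*
 * Narrowing shifts.  The *NB forms write the narrowed result into the
 * bottom half of each wide destination element and zero the top half
 * (the store is TYPEW-wide with a zero-extended TYPEN value); the *NT
 * forms store only into the top half, preserving the bottom half.
 */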
2129 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2130 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2131 { \
2132 intptr_t i, opr_sz = simd_oprsz(desc); \
2133 int shift = simd_data(desc); \
2134 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2135 TYPEW nn = *(TYPEW *)(vn + i); \
2136 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2137 } \
2138 }
2139
2140 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2141 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2142 { \
2143 intptr_t i, opr_sz = simd_oprsz(desc); \
2144 int shift = simd_data(desc); \
2145 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2146 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2147 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2148 } \
2149 }
2150
2151 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2152 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2153 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2154
2155 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2156 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2157 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2158
2159 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2160 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2161 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2162
2163 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2164 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2165 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2166
2167 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2168 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2169 #define DO_SQSHRUN_D(x, sh) \
2170 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2171
2172 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2173 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2174 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2175
2176 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2177 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2178 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2179
2180 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2181 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2182 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2183
2184 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2185 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2186 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2187
2188 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2189 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2190 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2191
2192 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2193 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2194 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2195
2196 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2197 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2198 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2199
2200 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2201 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2202 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2203
2204 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2205 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2206 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2207
2208 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2209 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2210 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2211
2212 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2213 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2214 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2215
2216 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2217 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2218 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2219
2220 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2221 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2222 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2223
2224 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2225 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2226 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2227
2228 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2229 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2230 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2231
2232 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2233 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2234 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2235
2236 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2237 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2238 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2239
2240 #undef DO_SHRNB
2241 #undef DO_SHRNT
2242
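/*
 * Narrowing add/sub high-half.  ADDHN/SUBHN keep the high half of the wide
 * sum or difference; the rounding forms add 1 << (SH - 1) before taking the
 * high half.  As above, the *NB expander zeroes the top half of each wide
 * destination element and the *NT expander merges into it.
 */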
2243 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2244 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2245 { \
2246 intptr_t i, opr_sz = simd_oprsz(desc); \
2247 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2248 TYPEW nn = *(TYPEW *)(vn + i); \
2249 TYPEW mm = *(TYPEW *)(vm + i); \
2250 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2251 } \
2252 }
2253
2254 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2255 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2256 { \
2257 intptr_t i, opr_sz = simd_oprsz(desc); \
2258 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2259 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2260 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2261 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2262 } \
2263 }
2264
2265 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2266 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2267 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2268 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2269
2270 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2271 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2272 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2273
2274 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2275 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2276 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2277
2278 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2279 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2280 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2281
2282 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2283 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2284 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2285
2286 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2287 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2288 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2289
2290 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2291 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2292 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2293
2294 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2295 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2296 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2297
2298 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2299 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2300 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2301
2302 #undef DO_RSUBHN
2303 #undef DO_SUBHN
2304 #undef DO_RADDHN
2305 #undef DO_ADDHN
2306
2307 #undef DO_BINOPNB
2308
2309 /* Fully general four-operand expander, controlled by a predicate.
2310 */
2311 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2312 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2313 void *vg, uint32_t desc) \
2314 { \
2315 intptr_t i, opr_sz = simd_oprsz(desc); \
2316 for (i = 0; i < opr_sz; ) { \
2317 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2318 do { \
2319 if (pg & 1) { \
2320 TYPE nn = *(TYPE *)(vn + H(i)); \
2321 TYPE mm = *(TYPE *)(vm + H(i)); \
2322 TYPE aa = *(TYPE *)(va + H(i)); \
2323 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2324 } \
2325 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2326 } while (i & 15); \
2327 } \
2328 }
2329
2330 /* Similarly, specialized for 64-bit operands. */
2331 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2332 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2333 void *vg, uint32_t desc) \
2334 { \
2335 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2336 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2337 uint8_t *pg = vg; \
2338 for (i = 0; i < opr_sz; i += 1) { \
2339 if (pg[H1(i)] & 1) { \
2340 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2341 d[i] = OP(aa, nn, mm); \
2342 } \
2343 } \
2344 }
2345
2346 #define DO_MLA(A, N, M) (A + N * M)
2347 #define DO_MLS(A, N, M) (A - N * M)
2348
2349 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2350 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2351
2352 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2353 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2354
2355 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2356 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2357
2358 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2359 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2360
2361 #undef DO_MLA
2362 #undef DO_MLS
2363 #undef DO_ZPZZZ
2364 #undef DO_ZPZZZ_D
2365
2366 void HELPER(sve_index_b)(void *vd, uint32_t start,
2367 uint32_t incr, uint32_t desc)
2368 {
2369 intptr_t i, opr_sz = simd_oprsz(desc);
2370 uint8_t *d = vd;
2371 for (i = 0; i < opr_sz; i += 1) {
2372 d[H1(i)] = start + i * incr;
2373 }
2374 }
2375
2376 void HELPER(sve_index_h)(void *vd, uint32_t start,
2377 uint32_t incr, uint32_t desc)
2378 {
2379 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2380 uint16_t *d = vd;
2381 for (i = 0; i < opr_sz; i += 1) {
2382 d[H2(i)] = start + i * incr;
2383 }
2384 }
2385
2386 void HELPER(sve_index_s)(void *vd, uint32_t start,
2387 uint32_t incr, uint32_t desc)
2388 {
2389 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2390 uint32_t *d = vd;
2391 for (i = 0; i < opr_sz; i += 1) {
2392 d[H4(i)] = start + i * incr;
2393 }
2394 }
2395
2396 void HELPER(sve_index_d)(void *vd, uint64_t start,
2397 uint64_t incr, uint32_t desc)
2398 {
2399 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2400 uint64_t *d = vd;
2401 for (i = 0; i < opr_sz; i += 1) {
2402 d[i] = start + i * incr;
2403 }
2404 }
2405
2406 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2407 {
2408 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2409 uint32_t sh = simd_data(desc);
2410 uint32_t *d = vd, *n = vn, *m = vm;
2411 for (i = 0; i < opr_sz; i += 1) {
2412 d[i] = n[i] + (m[i] << sh);
2413 }
2414 }
2415
2416 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2417 {
2418 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2419 uint64_t sh = simd_data(desc);
2420 uint64_t *d = vd, *n = vn, *m = vm;
2421 for (i = 0; i < opr_sz; i += 1) {
2422 d[i] = n[i] + (m[i] << sh);
2423 }
2424 }
2425
2426 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2427 {
2428 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2429 uint64_t sh = simd_data(desc);
2430 uint64_t *d = vd, *n = vn, *m = vm;
2431 for (i = 0; i < opr_sz; i += 1) {
2432 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2433 }
2434 }
2435
2436 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2437 {
2438 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2439 uint64_t sh = simd_data(desc);
2440 uint64_t *d = vd, *n = vn, *m = vm;
2441 for (i = 0; i < opr_sz; i += 1) {
2442 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2443 }
2444 }
2445
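/*
 * FEXPA: construct a 2^x approximation.  The low bits of the input index a
 * table holding the fraction bits of 2^(i/32) (half) or 2^(i/64) (single,
 * double), and the remaining input bits are placed directly into the
 * exponent field of the result.
 */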
2446 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2447 {
2448 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2449 static const uint16_t coeff[] = {
2450 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2451 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2452 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2453 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2454 };
2455 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2456 uint16_t *d = vd, *n = vn;
2457
2458 for (i = 0; i < opr_sz; i++) {
2459 uint16_t nn = n[i];
2460 intptr_t idx = extract32(nn, 0, 5);
2461 uint16_t exp = extract32(nn, 5, 5);
2462 d[i] = coeff[idx] | (exp << 10);
2463 }
2464 }
2465
2466 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2467 {
2468 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2469 static const uint32_t coeff[] = {
2470 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2471 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2472 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2473 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2474 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2475 0x1ef532, 0x20b051, 0x227043, 0x243516,
2476 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2477 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2478 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2479 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2480 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2481 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2482 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2483 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2484 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2485 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2486 };
2487 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2488 uint32_t *d = vd, *n = vn;
2489
2490 for (i = 0; i < opr_sz; i++) {
2491 uint32_t nn = n[i];
2492 intptr_t idx = extract32(nn, 0, 6);
2493 uint32_t exp = extract32(nn, 6, 8);
2494 d[i] = coeff[idx] | (exp << 23);
2495 }
2496 }
2497
2498 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2499 {
2500 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2501 static const uint64_t coeff[] = {
2502 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2503 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2504 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2505 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2506 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2507 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2508 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2509 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2510 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2511 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2512 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2513 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2514 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2515 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2516 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2517 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2518 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2519 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2520 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2521 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2522 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2523 0xFA7C1819E90D8ull,
2524 };
2525 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2526 uint64_t *d = vd, *n = vn;
2527
2528 for (i = 0; i < opr_sz; i++) {
2529 uint64_t nn = n[i];
2530 intptr_t idx = extract32(nn, 0, 6);
2531 uint64_t exp = extract32(nn, 6, 11);
2532 d[i] = coeff[idx] | (exp << 52);
2533 }
2534 }
2535
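/*
 * FTSSEL: bit 0 of the second operand selects between the first operand
 * and 1.0; bit 1 optionally negates the result by flipping the sign bit.
 */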
2536 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2537 {
2538 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2539 uint16_t *d = vd, *n = vn, *m = vm;
2540 for (i = 0; i < opr_sz; i += 1) {
2541 uint16_t nn = n[i];
2542 uint16_t mm = m[i];
2543 if (mm & 1) {
2544 nn = float16_one;
2545 }
2546 d[i] = nn ^ (mm & 2) << 14;
2547 }
2548 }
2549
2550 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2551 {
2552 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2553 uint32_t *d = vd, *n = vn, *m = vm;
2554 for (i = 0; i < opr_sz; i += 1) {
2555 uint32_t nn = n[i];
2556 uint32_t mm = m[i];
2557 if (mm & 1) {
2558 nn = float32_one;
2559 }
2560 d[i] = nn ^ (mm & 2) << 30;
2561 }
2562 }
2563
2564 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2565 {
2566 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2567 uint64_t *d = vd, *n = vn, *m = vm;
2568 for (i = 0; i < opr_sz; i += 1) {
2569 uint64_t nn = n[i];
2570 uint64_t mm = m[i];
2571 if (mm & 1) {
2572 nn = float64_one;
2573 }
2574 d[i] = nn ^ (mm & 2) << 62;
2575 }
2576 }
2577
2578 /*
2579 * Signed saturating addition with scalar operand.
2580 */
2581
2582 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2583 {
2584 intptr_t i, oprsz = simd_oprsz(desc);
2585
2586 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2587 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2588 }
2589 }
2590
2591 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2592 {
2593 intptr_t i, oprsz = simd_oprsz(desc);
2594
2595 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2596 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2597 }
2598 }
2599
2600 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2601 {
2602 intptr_t i, oprsz = simd_oprsz(desc);
2603
2604 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2605 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2606 }
2607 }
2608
2609 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2610 {
2611 intptr_t i, oprsz = simd_oprsz(desc);
2612
2613 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2614 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2615 }
2616 }
2617
2618 /*
2619 * Unsigned saturating addition with scalar operand.
2620 */
2621
2622 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2623 {
2624 intptr_t i, oprsz = simd_oprsz(desc);
2625
2626 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2627 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2628 }
2629 }
2630
2631 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2632 {
2633 intptr_t i, oprsz = simd_oprsz(desc);
2634
2635 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2636 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2637 }
2638 }
2639
2640 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2641 {
2642 intptr_t i, oprsz = simd_oprsz(desc);
2643
2644 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2645 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2646 }
2647 }
2648
2649 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2650 {
2651 intptr_t i, oprsz = simd_oprsz(desc);
2652
2653 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2654 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2655 }
2656 }
2657
2658 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2659 {
2660 intptr_t i, oprsz = simd_oprsz(desc);
2661
2662 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2663 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2664 }
2665 }
2666
2667 /* Two operand predicated copy immediate with merge. All valid immediates
2668 * can fit within 17 signed bits in the simd_data field.
2669 */
2670 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2671 uint64_t mm, uint32_t desc)
2672 {
2673 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2674 uint64_t *d = vd, *n = vn;
2675 uint8_t *pg = vg;
2676
2677 mm = dup_const(MO_8, mm);
2678 for (i = 0; i < opr_sz; i += 1) {
2679 uint64_t nn = n[i];
2680 uint64_t pp = expand_pred_b(pg[H1(i)]);
2681 d[i] = (mm & pp) | (nn & ~pp);
2682 }
2683 }
2684
2685 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2686 uint64_t mm, uint32_t desc)
2687 {
2688 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2689 uint64_t *d = vd, *n = vn;
2690 uint8_t *pg = vg;
2691
2692 mm = dup_const(MO_16, mm);
2693 for (i = 0; i < opr_sz; i += 1) {
2694 uint64_t nn = n[i];
2695 uint64_t pp = expand_pred_h(pg[H1(i)]);
2696 d[i] = (mm & pp) | (nn & ~pp);
2697 }
2698 }
2699
2700 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2701 uint64_t mm, uint32_t desc)
2702 {
2703 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2704 uint64_t *d = vd, *n = vn;
2705 uint8_t *pg = vg;
2706
2707 mm = dup_const(MO_32, mm);
2708 for (i = 0; i < opr_sz; i += 1) {
2709 uint64_t nn = n[i];
2710 uint64_t pp = expand_pred_s(pg[H1(i)]);
2711 d[i] = (mm & pp) | (nn & ~pp);
2712 }
2713 }
2714
2715 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2716 uint64_t mm, uint32_t desc)
2717 {
2718 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2719 uint64_t *d = vd, *n = vn;
2720 uint8_t *pg = vg;
2721
2722 for (i = 0; i < opr_sz; i += 1) {
2723 uint64_t nn = n[i];
2724 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2725 }
2726 }
2727
2728 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2729 {
2730 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2731 uint64_t *d = vd;
2732 uint8_t *pg = vg;
2733
2734 val = dup_const(MO_8, val);
2735 for (i = 0; i < opr_sz; i += 1) {
2736 d[i] = val & expand_pred_b(pg[H1(i)]);
2737 }
2738 }
2739
2740 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2741 {
2742 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2743 uint64_t *d = vd;
2744 uint8_t *pg = vg;
2745
2746 val = dup_const(MO_16, val);
2747 for (i = 0; i < opr_sz; i += 1) {
2748 d[i] = val & expand_pred_h(pg[H1(i)]);
2749 }
2750 }
2751
2752 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2753 {
2754 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2755 uint64_t *d = vd;
2756 uint8_t *pg = vg;
2757
2758 val = dup_const(MO_32, val);
2759 for (i = 0; i < opr_sz; i += 1) {
2760 d[i] = val & expand_pred_s(pg[H1(i)]);
2761 }
2762 }
2763
2764 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2765 {
2766 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2767 uint64_t *d = vd;
2768 uint8_t *pg = vg;
2769
2770 for (i = 0; i < opr_sz; i += 1) {
2771 d[i] = (pg[H1(i)] & 1 ? val : 0);
2772 }
2773 }
2774
2775 /* Big-endian hosts need to frob the byte indices. If the copy
2776 * happens to be 8-byte aligned, then no frobbing necessary.
2777 */
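/* The combined low bits of dst, src and length select the widest unit
 * (8, 4, 2 or 1 bytes) that divides all three; sub-8-byte units use the
 * H macros to adjust addresses on big-endian hosts.
 */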
2778 static void swap_memmove(void *vd, void *vs, size_t n)
2779 {
2780 uintptr_t d = (uintptr_t)vd;
2781 uintptr_t s = (uintptr_t)vs;
2782 uintptr_t o = (d | s | n) & 7;
2783 size_t i;
2784
2785 #if !HOST_BIG_ENDIAN
2786 o = 0;
2787 #endif
2788 switch (o) {
2789 case 0:
2790 memmove(vd, vs, n);
2791 break;
2792
2793 case 4:
2794 if (d < s || d >= s + n) {
2795 for (i = 0; i < n; i += 4) {
2796 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2797 }
2798 } else {
2799 for (i = n; i > 0; ) {
2800 i -= 4;
2801 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2802 }
2803 }
2804 break;
2805
2806 case 2:
2807 case 6:
2808 if (d < s || d >= s + n) {
2809 for (i = 0; i < n; i += 2) {
2810 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2811 }
2812 } else {
2813 for (i = n; i > 0; ) {
2814 i -= 2;
2815 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2816 }
2817 }
2818 break;
2819
2820 default:
2821 if (d < s || d >= s + n) {
2822 for (i = 0; i < n; i++) {
2823 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2824 }
2825 } else {
2826 for (i = n; i > 0; ) {
2827 i -= 1;
2828 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2829 }
2830 }
2831 break;
2832 }
2833 }
2834
2835 /* Similarly for memset of 0. */
2836 static void swap_memzero(void *vd, size_t n)
2837 {
2838 uintptr_t d = (uintptr_t)vd;
2839 uintptr_t o = (d | n) & 7;
2840 size_t i;
2841
2842 /* Usually, the first bit of a predicate is set, so N is 0. */
2843 if (likely(n == 0)) {
2844 return;
2845 }
2846
2847 #if !HOST_BIG_ENDIAN
2848 o = 0;
2849 #endif
2850 switch (o) {
2851 case 0:
2852 memset(vd, 0, n);
2853 break;
2854
2855 case 4:
2856 for (i = 0; i < n; i += 4) {
2857 *(uint32_t *)H1_4(d + i) = 0;
2858 }
2859 break;
2860
2861 case 2:
2862 case 6:
2863 for (i = 0; i < n; i += 2) {
2864 *(uint16_t *)H1_2(d + i) = 0;
2865 }
2866 break;
2867
2868 default:
2869 for (i = 0; i < n; i++) {
2870 *(uint8_t *)H1(d + i) = 0;
2871 }
2872 break;
2873 }
2874 }
2875
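/*
 * EXT: treat Zn as the low half and Zm as the high half of a double-width
 * vector and copy opr_sz bytes starting at byte n_ofs into Zd.  The three
 * cases below handle possible overlap of Zd with the source registers.
 */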
2876 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2877 {
2878 intptr_t opr_sz = simd_oprsz(desc);
2879 size_t n_ofs = simd_data(desc);
2880 size_t n_siz = opr_sz - n_ofs;
2881
2882 if (vd != vm) {
2883 swap_memmove(vd, vn + n_ofs, n_siz);
2884 swap_memmove(vd + n_siz, vm, n_ofs);
2885 } else if (vd != vn) {
2886 swap_memmove(vd + n_siz, vd, n_ofs);
2887 swap_memmove(vd, vn + n_ofs, n_siz);
2888 } else {
2889 /* vd == vn == vm. Need temp space. */
2890 ARMVectorReg tmp;
2891 swap_memmove(&tmp, vm, n_ofs);
2892 swap_memmove(vd, vd + n_ofs, n_siz);
2893 memcpy(vd + n_siz, &tmp, n_ofs);
2894 }
2895 }
2896
2897 #define DO_INSR(NAME, TYPE, H) \
2898 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2899 { \
2900 intptr_t opr_sz = simd_oprsz(desc); \
2901 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2902 *(TYPE *)(vd + H(0)) = val; \
2903 }
2904
2905 DO_INSR(sve_insr_b, uint8_t, H1)
2906 DO_INSR(sve_insr_h, uint16_t, H1_2)
2907 DO_INSR(sve_insr_s, uint32_t, H1_4)
2908 DO_INSR(sve_insr_d, uint64_t, H1_8)
2909
2910 #undef DO_INSR
2911
2912 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2913 {
2914 intptr_t i, j, opr_sz = simd_oprsz(desc);
2915 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2916 uint64_t f = *(uint64_t *)(vn + i);
2917 uint64_t b = *(uint64_t *)(vn + j);
2918 *(uint64_t *)(vd + i) = bswap64(b);
2919 *(uint64_t *)(vd + j) = bswap64(f);
2920 }
2921 }
2922
2923 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2924 {
2925 intptr_t i, j, opr_sz = simd_oprsz(desc);
2926 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2927 uint64_t f = *(uint64_t *)(vn + i);
2928 uint64_t b = *(uint64_t *)(vn + j);
2929 *(uint64_t *)(vd + i) = hswap64(b);
2930 *(uint64_t *)(vd + j) = hswap64(f);
2931 }
2932 }
2933
2934 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2935 {
2936 intptr_t i, j, opr_sz = simd_oprsz(desc);
2937 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2938 uint64_t f = *(uint64_t *)(vn + i);
2939 uint64_t b = *(uint64_t *)(vn + j);
2940 *(uint64_t *)(vd + i) = rol64(b, 32);
2941 *(uint64_t *)(vd + j) = rol64(f, 32);
2942 }
2943 }
2944
2945 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2946 {
2947 intptr_t i, j, opr_sz = simd_oprsz(desc);
2948 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2949 uint64_t f = *(uint64_t *)(vn + i);
2950 uint64_t b = *(uint64_t *)(vn + j);
2951 *(uint64_t *)(vd + i) = b;
2952 *(uint64_t *)(vd + j) = f;
2953 }
2954 }
2955
2956 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2957
2958 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2959 bool is_tbx, tb_impl_fn *fn)
2960 {
2961 ARMVectorReg scratch;
2962 uintptr_t oprsz = simd_oprsz(desc);
2963
2964 if (unlikely(vd == vn)) {
2965 vn = memcpy(&scratch, vn, oprsz);
2966 }
2967
2968 fn(vd, vn, NULL, vm, oprsz, is_tbx);
2969 }
2970
2971 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2972 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2973 {
2974 ARMVectorReg scratch;
2975 uintptr_t oprsz = simd_oprsz(desc);
2976
2977 if (unlikely(vd == vn0)) {
2978 vn0 = memcpy(&scratch, vn0, oprsz);
2979 if (vd == vn1) {
2980 vn1 = vn0;
2981 }
2982 } else if (unlikely(vd == vn1)) {
2983 vn1 = memcpy(&scratch, vn1, oprsz);
2984 }
2985
2986 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2987 }
2988
2989 #define DO_TB(SUFF, TYPE, H) \
2990 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
2991 void *vm, uintptr_t oprsz, bool is_tbx) \
2992 { \
2993 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
2994 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
2995 for (i = 0; i < nelem; ++i) { \
2996 TYPE index = indexes[H1(i)], val = 0; \
2997 if (index < nelem) { \
2998 val = tbl0[H(index)]; \
2999 } else { \
3000 index -= nelem; \
3001 if (tbl1 && index < nelem) { \
3002 val = tbl1[H(index)]; \
3003 } else if (is_tbx) { \
3004 continue; \
3005 } \
3006 } \
3007 d[H(i)] = val; \
3008 } \
3009 } \
3010 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3011 { \
3012 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3013 } \
3014 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3015 void *vm, uint32_t desc) \
3016 { \
3017 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3018 } \
3019 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3020 { \
3021 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3022 }
3023
3024 DO_TB(b, uint8_t, H1)
3025 DO_TB(h, uint16_t, H2)
3026 DO_TB(s, uint32_t, H4)
3027 DO_TB(d, uint64_t, H8)
3028
3029 #undef DO_TB
3030
3031 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3032 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3033 { \
3034 intptr_t i, opr_sz = simd_oprsz(desc); \
3035 TYPED *d = vd; \
3036 TYPES *n = vn; \
3037 ARMVectorReg tmp; \
3038 if (unlikely(vn - vd < opr_sz)) { \
3039 n = memcpy(&tmp, n, opr_sz / 2); \
3040 } \
3041 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3042 d[HD(i)] = n[HS(i)]; \
3043 } \
3044 }
3045
3046 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3047 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3048 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3049
3050 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3051 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3052 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3053
3054 #undef DO_UNPK
3055
3056 /* Mask of bits included in the even numbered predicates of width esz.
3057 * We also use this for expand_bits/compress_bits, and so extend the
3058 * same pattern out to 16-bit units.
3059 */
3060 static const uint64_t even_bit_esz_masks[5] = {
3061 0x5555555555555555ull,
3062 0x3333333333333333ull,
3063 0x0f0f0f0f0f0f0f0full,
3064 0x00ff00ff00ff00ffull,
3065 0x0000ffff0000ffffull,
3066 };
3067
3068 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3069 * For N==0, this corresponds to the operation that in qemu/bitops.h
3070 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3071 * section 7-2 Shuffling Bits.
3072 */
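/* Example: expand_bits(0b1011, 0) == 0b01000101. */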
3073 static uint64_t expand_bits(uint64_t x, int n)
3074 {
3075 int i;
3076
3077 x &= 0xffffffffu;
3078 for (i = 4; i >= n; i--) {
3079 int sh = 1 << i;
3080 x = ((x << sh) | x) & even_bit_esz_masks[i];
3081 }
3082 return x;
3083 }
3084
3085 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3086 * For N==0, this corresponds to the operation that in qemu/bitops.h
3087 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3088 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3089 */
3090 static uint64_t compress_bits(uint64_t x, int n)
3091 {
3092 int i;
3093
3094 for (i = n; i <= 4; i++) {
3095 int sh = 1 << i;
3096 x &= even_bit_esz_masks[i];
3097 x = (x >> sh) | x;
3098 }
3099 return x & 0xffffffffu;
3100 }
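/* Example: compress_bits(0b01000101, 0) == 0b1011, inverting the above. */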
3101
3102 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3103 {
3104 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3105 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3106 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3107 int esize = 1 << esz;
3108 uint64_t *d = vd;
3109 intptr_t i;
3110
3111 if (oprsz <= 8) {
3112 uint64_t nn = *(uint64_t *)vn;
3113 uint64_t mm = *(uint64_t *)vm;
3114 int half = 4 * oprsz;
3115
3116 nn = extract64(nn, high * half, half);
3117 mm = extract64(mm, high * half, half);
3118 nn = expand_bits(nn, esz);
3119 mm = expand_bits(mm, esz);
3120 d[0] = nn | (mm << esize);
3121 } else {
3122 ARMPredicateReg tmp;
3123
3124 /* We produce output faster than we consume input.
3125 Therefore we must be mindful of possible overlap. */
3126 if (vd == vn) {
3127 vn = memcpy(&tmp, vn, oprsz);
3128 if (vd == vm) {
3129 vm = vn;
3130 }
3131 } else if (vd == vm) {
3132 vm = memcpy(&tmp, vm, oprsz);
3133 }
3134 if (high) {
3135 high = oprsz >> 1;
3136 }
3137
3138 if ((oprsz & 7) == 0) {
3139 uint32_t *n = vn, *m = vm;
3140 high >>= 2;
3141
3142 for (i = 0; i < oprsz / 8; i++) {
3143 uint64_t nn = n[H4(high + i)];
3144 uint64_t mm = m[H4(high + i)];
3145
3146 nn = expand_bits(nn, esz);
3147 mm = expand_bits(mm, esz);
3148 d[i] = nn | (mm << esize);
3149 }
3150 } else {
3151 uint8_t *n = vn, *m = vm;
3152 uint16_t *d16 = vd;
3153
3154 for (i = 0; i < oprsz / 2; i++) {
3155 uint16_t nn = n[H1(high + i)];
3156 uint16_t mm = m[H1(high + i)];
3157
3158 nn = expand_bits(nn, esz);
3159 mm = expand_bits(mm, esz);
3160 d16[H2(i)] = nn | (mm << esize);
3161 }
3162 }
3163 }
3164 }
3165
3166 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3167 {
3168 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3169 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3170 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3171 uint64_t *d = vd, *n = vn, *m = vm;
3172 uint64_t l, h;
3173 intptr_t i;
3174
3175 if (oprsz <= 8) {
3176 l = compress_bits(n[0] >> odd, esz);
3177 h = compress_bits(m[0] >> odd, esz);
3178 d[0] = l | (h << (4 * oprsz));
3179 } else {
3180 ARMPredicateReg tmp_m;
3181 intptr_t oprsz_16 = oprsz / 16;
3182
3183 if ((vm - vd) < (uintptr_t)oprsz) {
3184 m = memcpy(&tmp_m, vm, oprsz);
3185 }
3186
3187 for (i = 0; i < oprsz_16; i++) {
3188 l = n[2 * i + 0];
3189 h = n[2 * i + 1];
3190 l = compress_bits(l >> odd, esz);
3191 h = compress_bits(h >> odd, esz);
3192 d[i] = l | (h << 32);
3193 }
3194
3195 /*
3196 * For VL which is not a multiple of 512, the results from M do not
3197 * align nicely with the uint64_t for D. Put the aligned results
3198 * from M into TMP_M and then copy it into place afterward.
3199 */
3200 if (oprsz & 15) {
3201 int final_shift = (oprsz & 15) * 2;
3202
3203 l = n[2 * i + 0];
3204 h = n[2 * i + 1];
3205 l = compress_bits(l >> odd, esz);
3206 h = compress_bits(h >> odd, esz);
3207 d[i] = l | (h << final_shift);
3208
3209 for (i = 0; i < oprsz_16; i++) {
3210 l = m[2 * i + 0];
3211 h = m[2 * i + 1];
3212 l = compress_bits(l >> odd, esz);
3213 h = compress_bits(h >> odd, esz);
3214 tmp_m.p[i] = l | (h << 32);
3215 }
3216 l = m[2 * i + 0];
3217 h = m[2 * i + 1];
3218 l = compress_bits(l >> odd, esz);
3219 h = compress_bits(h >> odd, esz);
3220 tmp_m.p[i] = l | (h << final_shift);
3221
3222 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3223 } else {
3224 for (i = 0; i < oprsz_16; i++) {
3225 l = m[2 * i + 0];
3226 h = m[2 * i + 1];
3227 l = compress_bits(l >> odd, esz);
3228 h = compress_bits(h >> odd, esz);
3229 d[oprsz_16 + i] = l | (h << 32);
3230 }
3231 }
3232 }
3233 }
3234
3235 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3236 {
3237 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3238 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3239 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3240 uint64_t *d = vd, *n = vn, *m = vm;
3241 uint64_t mask;
3242 int shr, shl;
3243 intptr_t i;
3244
3245 shl = 1 << esz;
3246 shr = 0;
3247 mask = even_bit_esz_masks[esz];
3248 if (odd) {
3249 mask <<= shl;
3250 shr = shl;
3251 shl = 0;
3252 }
3253
3254 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3255 uint64_t nn = (n[i] & mask) >> shr;
3256 uint64_t mm = (m[i] & mask) << shl;
3257 d[i] = nn + mm;
3258 }
3259 }
3260
3261 /* Reverse units of 2**N bits. */
3262 static uint64_t reverse_bits_64(uint64_t x, int n)
3263 {
3264 int i, sh;
3265
3266 x = bswap64(x);
3267 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3268 uint64_t mask = even_bit_esz_masks[i];
3269 x = ((x & mask) << sh) | ((x >> sh) & mask);
3270 }
3271 return x;
3272 }
3273
3274 static uint8_t reverse_bits_8(uint8_t x, int n)
3275 {
3276 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3277 int i, sh;
3278
3279 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3280 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3281 }
3282 return x;
3283 }
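
/*
 * Worked example (illustrative only, not used by the helpers below):
 * N == 0 reverses individual bits, while N == 3 reverses whole bytes,
 * i.e. degenerates to bswap64().
 */
static void G_GNUC_UNUSED reverse_bits_example(void)
{
    g_assert(reverse_bits_8(0x01, 0) == 0x80);
    g_assert(reverse_bits_64(0x00000000000000ffull, 3) == 0xff00000000000000ull);
}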
3284
3285 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3286 {
3287 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3288 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3289 intptr_t i, oprsz_2 = oprsz / 2;
3290
3291 if (oprsz <= 8) {
3292 uint64_t l = *(uint64_t *)vn;
3293 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3294 *(uint64_t *)vd = l;
3295 } else if ((oprsz & 15) == 0) {
3296 for (i = 0; i < oprsz_2; i += 8) {
3297 intptr_t ih = oprsz - 8 - i;
3298 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3299 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3300 *(uint64_t *)(vd + i) = h;
3301 *(uint64_t *)(vd + ih) = l;
3302 }
3303 } else {
3304 for (i = 0; i < oprsz_2; i += 1) {
3305 intptr_t il = H1(i);
3306 intptr_t ih = H1(oprsz - 1 - i);
3307 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3308 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3309 *(uint8_t *)(vd + il) = h;
3310 *(uint8_t *)(vd + ih) = l;
3311 }
3312 }
3313 }
3314
3315 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3316 {
3317 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3318 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3319 uint64_t *d = vd;
3320 intptr_t i;
3321
3322 if (oprsz <= 8) {
3323 uint64_t nn = *(uint64_t *)vn;
3324 int half = 4 * oprsz;
3325
3326 nn = extract64(nn, high * half, half);
3327 nn = expand_bits(nn, 0);
3328 d[0] = nn;
3329 } else {
3330 ARMPredicateReg tmp_n;
3331
3332 /* We produce output faster than we consume input.
3333 Therefore we must be mindful of possible overlap. */
3334 if ((vn - vd) < (uintptr_t)oprsz) {
3335 vn = memcpy(&tmp_n, vn, oprsz);
3336 }
3337 if (high) {
3338 high = oprsz >> 1;
3339 }
3340
3341 if ((oprsz & 7) == 0) {
3342 uint32_t *n = vn;
3343 high >>= 2;
3344
3345 for (i = 0; i < oprsz / 8; i++) {
3346 uint64_t nn = n[H4(high + i)];
3347 d[i] = expand_bits(nn, 0);
3348 }
3349 } else {
3350 uint16_t *d16 = vd;
3351 uint8_t *n = vn;
3352
3353 for (i = 0; i < oprsz / 2; i++) {
3354 uint16_t nn = n[H1(high + i)];
3355 d16[H2(i)] = expand_bits(nn, 0);
3356 }
3357 }
3358 }
3359 }
3360
3361 #define DO_ZIP(NAME, TYPE, H) \
3362 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3363 { \
3364 intptr_t oprsz = simd_oprsz(desc); \
3365 intptr_t odd_ofs = simd_data(desc); \
3366 intptr_t i, oprsz_2 = oprsz / 2; \
3367 ARMVectorReg tmp_n, tmp_m; \
3368 /* We produce output faster than we consume input. \
3369 Therefore we must be mindful of possible overlap. */ \
3370 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3371 vn = memcpy(&tmp_n, vn, oprsz); \
3372 } \
3373 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3374 vm = memcpy(&tmp_m, vm, oprsz); \
3375 } \
3376 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3377 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3378 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
3379 *(TYPE *)(vm + odd_ofs + H(i)); \
3380 } \
3381 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3382 memset(vd + oprsz - 16, 0, 16); \
3383 } \
3384 }
3385
3386 DO_ZIP(sve_zip_b, uint8_t, H1)
3387 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3388 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3389 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3390 DO_ZIP(sve2_zip_q, Int128, )
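
/*
 * Usage sketch (illustrative only; assumes the usual simd_desc() encoding
 * for a 16-byte operation and odd_ofs == 0, i.e. the ZIP1 case): the byte
 * form interleaves the low halves of the two inputs element by element.
 */
static void G_GNUC_UNUSED zip_b_example(void)
{
    uint8_t n[16], m[16], d[16];
    uint32_t desc = simd_desc(16, 16, 0);
    int i;

    for (i = 0; i < 16; i++) {
        n[i] = i;           /* 0, 1, ..., 15 */
        m[i] = 16 + i;      /* 16, 17, ..., 31 */
    }
    helper_sve_zip_b(d, n, m, desc);
    /* Result is 0, 16, 1, 17, ..., 7, 23. */
    g_assert(d[0] == 0 && d[1] == 16 && d[14] == 7 && d[15] == 23);
}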
3391
3392 #define DO_UZP(NAME, TYPE, H) \
3393 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3394 { \
3395 intptr_t oprsz = simd_oprsz(desc); \
3396 intptr_t odd_ofs = simd_data(desc); \
3397 intptr_t i, p; \
3398 ARMVectorReg tmp_m; \
3399 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3400 vm = memcpy(&tmp_m, vm, oprsz); \
3401 } \
3402 i = 0, p = odd_ofs; \
3403 do { \
3404 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
3405 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3406 } while (p < oprsz); \
3407 p -= oprsz; \
3408 do { \
3409 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
3410 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3411 } while (p < oprsz); \
3412 tcg_debug_assert(i == oprsz); \
3413 }
3414
3415 DO_UZP(sve_uzp_b, uint8_t, H1)
3416 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3417 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3418 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3419 DO_UZP(sve2_uzp_q, Int128, )
3420
3421 #define DO_TRN(NAME, TYPE, H) \
3422 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3423 { \
3424 intptr_t oprsz = simd_oprsz(desc); \
3425 intptr_t odd_ofs = simd_data(desc); \
3426 intptr_t i; \
3427 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3428 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3429 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3430 *(TYPE *)(vd + H(i + 0)) = ae; \
3431 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3432 } \
3433 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3434 memset(vd + oprsz - 16, 0, 16); \
3435 } \
3436 }
3437
3438 DO_TRN(sve_trn_b, uint8_t, H1)
3439 DO_TRN(sve_trn_h, uint16_t, H1_2)
3440 DO_TRN(sve_trn_s, uint32_t, H1_4)
3441 DO_TRN(sve_trn_d, uint64_t, H1_8)
3442 DO_TRN(sve2_trn_q, Int128, )
3443
3444 #undef DO_ZIP
3445 #undef DO_UZP
3446 #undef DO_TRN
3447
3448 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3449 {
3450 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3451 uint32_t *d = vd, *n = vn;
3452 uint8_t *pg = vg;
3453
3454 for (i = j = 0; i < opr_sz; i++) {
3455 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3456 d[H4(j)] = n[H4(i)];
3457 j++;
3458 }
3459 }
3460 for (; j < opr_sz; j++) {
3461 d[H4(j)] = 0;
3462 }
3463 }
3464
3465 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3466 {
3467 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3468 uint64_t *d = vd, *n = vn;
3469 uint8_t *pg = vg;
3470
3471 for (i = j = 0; i < opr_sz; i++) {
3472 if (pg[H1(i)] & 1) {
3473 d[j] = n[i];
3474 j++;
3475 }
3476 }
3477 for (; j < opr_sz; j++) {
3478 d[j] = 0;
3479 }
3480 }
3481
3482 /* Similar to the ARM LastActiveElement pseudocode function, except the
3483 * result is multiplied by the element size. This includes the not found
3484 * indication; e.g. not found for esz=3 is -8.
3485 */
3486 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3487 {
3488 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3489 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3490
3491 return last_active_element(vg, words, esz);
3492 }
3493
3494 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3495 {
3496 intptr_t opr_sz = simd_oprsz(desc) / 8;
3497 int esz = simd_data(desc);
3498 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3499 intptr_t i, first_i, last_i;
3500 ARMVectorReg tmp;
3501
3502 first_i = last_i = 0;
3503 first_g = last_g = 0;
3504
3505 /* Find the extent of the active elements within VG. */
3506 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3507 pg = *(uint64_t *)(vg + i) & mask;
3508 if (pg) {
3509 if (last_g == 0) {
3510 last_g = pg;
3511 last_i = i;
3512 }
3513 first_g = pg;
3514 first_i = i;
3515 }
3516 }
3517
3518 len = 0;
3519 if (first_g != 0) {
3520 first_i = first_i * 8 + ctz64(first_g);
3521 last_i = last_i * 8 + 63 - clz64(last_g);
3522 len = last_i - first_i + (1 << esz);
3523 if (vd == vm) {
3524 vm = memcpy(&tmp, vm, opr_sz * 8);
3525 }
3526 swap_memmove(vd, vn + first_i, len);
3527 }
3528 swap_memmove(vd + len, vm, opr_sz * 8 - len);
3529 }
3530
3531 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3532 void *vg, uint32_t desc)
3533 {
3534 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3535 uint64_t *d = vd, *n = vn, *m = vm;
3536 uint8_t *pg = vg;
3537
3538 for (i = 0; i < opr_sz; i += 1) {
3539 uint64_t nn = n[i], mm = m[i];
3540 uint64_t pp = expand_pred_b(pg[H1(i)]);
3541 d[i] = (nn & pp) | (mm & ~pp);
3542 }
3543 }
3544
3545 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3546 void *vg, uint32_t desc)
3547 {
3548 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3549 uint64_t *d = vd, *n = vn, *m = vm;
3550 uint8_t *pg = vg;
3551
3552 for (i = 0; i < opr_sz; i += 1) {
3553 uint64_t nn = n[i], mm = m[i];
3554 uint64_t pp = expand_pred_h(pg[H1(i)]);
3555 d[i] = (nn & pp) | (mm & ~pp);
3556 }
3557 }
3558
3559 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3560 void *vg, uint32_t desc)
3561 {
3562 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3563 uint64_t *d = vd, *n = vn, *m = vm;
3564 uint8_t *pg = vg;
3565
3566 for (i = 0; i < opr_sz; i += 1) {
3567 uint64_t nn = n[i], mm = m[i];
3568 uint64_t pp = expand_pred_s(pg[H1(i)]);
3569 d[i] = (nn & pp) | (mm & ~pp);
3570 }
3571 }
3572
3573 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3574 void *vg, uint32_t desc)
3575 {
3576 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3577 uint64_t *d = vd, *n = vn, *m = vm;
3578 uint8_t *pg = vg;
3579
3580 for (i = 0; i < opr_sz; i += 1) {
3581 uint64_t nn = n[i], mm = m[i];
3582 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3583 }
3584 }
3585
3586 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3587 void *vg, uint32_t desc)
3588 {
3589 intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3590 Int128 *d = vd, *n = vn, *m = vm;
3591 uint16_t *pg = vg;
3592
3593 for (i = 0; i < opr_sz; i += 1) {
3594 d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3595 }
3596 }
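
/*
 * Worked example (illustrative only): sve_sel_zpzz_b() above relies on
 * expand_pred_b() from vec_internal.h turning each predicate bit into a
 * byte-wide select mask.
 */
static void G_GNUC_UNUSED sel_mask_example(void)
{
    /* Predicate bits 0 and 2 set: bytes 0 and 2 come from N, the rest from M. */
    g_assert(expand_pred_b(0x05) == 0x0000000000ff00ffull);
}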
3597
3598 /* Two operand comparison controlled by a predicate.
3599 * ??? It is very tempting to want to be able to expand this inline
3600 * with x86 instructions, e.g.
3601 *
3602 * vcmpeqw zm, zn, %ymm0
3603 * vpmovmskb %ymm0, %eax
3604 * and $0x5555, %eax
3605 * and pg, %eax
3606 *
3607 * or even aarch64, e.g.
3608 *
3609 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3610 * cmeq v0.8h, zn, zm
3611 * and v0.8h, v0.8h, mask
3612 * addv h0, v0.8h
3613 * and v0.8b, pg
3614 *
3615 * However, coming up with an abstraction that allows vector inputs and
3616 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3617 * scalar outputs, is tricky.
3618 */
3619 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3620 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3621 { \
3622 intptr_t opr_sz = simd_oprsz(desc); \
3623 uint32_t flags = PREDTEST_INIT; \
3624 intptr_t i = opr_sz; \
3625 do { \
3626 uint64_t out = 0, pg; \
3627 do { \
3628 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3629 TYPE nn = *(TYPE *)(vn + H(i)); \
3630 TYPE mm = *(TYPE *)(vm + H(i)); \
3631 out |= nn OP mm; \
3632 } while (i & 63); \
3633 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3634 out &= pg; \
3635 *(uint64_t *)(vd + (i >> 3)) = out; \
3636 flags = iter_predtest_bwd(out, pg, flags); \
3637 } while (i > 0); \
3638 return flags; \
3639 }
3640
3641 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3642 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3643 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3644 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3645 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3646 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3647 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3648 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3649
3650 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3651 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3652 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3653 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3654
3655 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3656 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3657 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3658 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3659
3660 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3661 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3662 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3663 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3664
3665 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3666 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3667 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3668 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3669
3670 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3671 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3672 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3673 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3674
3675 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3676 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3677 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3678 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3679
3680 #undef DO_CMP_PPZZ_B
3681 #undef DO_CMP_PPZZ_H
3682 #undef DO_CMP_PPZZ_S
3683 #undef DO_CMP_PPZZ_D
3684 #undef DO_CMP_PPZZ
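
/*
 * Usage sketch (illustrative only; assumes the simd_desc() encoding for a
 * 16-byte vector): for 32-bit elements each comparison contributes one
 * result bit at the element's starting byte, matching the 0x1111... mask.
 */
static void G_GNUC_UNUSED cmpeq_ppzz_s_example(void)
{
    uint64_t n[2] = { 0 }, m[2] = { 0 }, g[1] = { ~0ull }, d[1];
    uint32_t desc = simd_desc(16, 16, 0);

    /* All four elements compare equal and all are governed, so D = 0x1111. */
    helper_sve_cmpeq_ppzz_s(d, n, m, g, desc);
    g_assert(d[0] == 0x1111);
}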
3685
3686 /* Similar, but the second source is "wide". */
3687 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3688 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3689 { \
3690 intptr_t opr_sz = simd_oprsz(desc); \
3691 uint32_t flags = PREDTEST_INIT; \
3692 intptr_t i = opr_sz; \
3693 do { \
3694 uint64_t out = 0, pg; \
3695 do { \
3696 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3697 do { \
3698 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3699 TYPE nn = *(TYPE *)(vn + H(i)); \
3700 out |= nn OP mm; \
3701 } while (i & 7); \
3702 } while (i & 63); \
3703 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3704 out &= pg; \
3705 *(uint64_t *)(vd + (i >> 3)) = out; \
3706 flags = iter_predtest_bwd(out, pg, flags); \
3707 } while (i > 0); \
3708 return flags; \
3709 }
3710
3711 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3712 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3713 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3714 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3715 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3716 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3717
3718 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3719 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3720 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3721
3722 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3723 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3724 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3725
3726 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3727 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3728 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3729
3730 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3731 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3732 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3733
3734 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3735 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3736 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3737
3738 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3739 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3740 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3741
3742 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3743 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3744 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3745
3746 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3747 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3748 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3749
3750 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3751 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3752 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3753
3754 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3755 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3756 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3757
3758 #undef DO_CMP_PPZW_B
3759 #undef DO_CMP_PPZW_H
3760 #undef DO_CMP_PPZW_S
3761 #undef DO_CMP_PPZW
3762
3763 /* Similar, but the second source is immediate. */
3764 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3765 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3766 { \
3767 intptr_t opr_sz = simd_oprsz(desc); \
3768 uint32_t flags = PREDTEST_INIT; \
3769 TYPE mm = simd_data(desc); \
3770 intptr_t i = opr_sz; \
3771 do { \
3772 uint64_t out = 0, pg; \
3773 do { \
3774 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3775 TYPE nn = *(TYPE *)(vn + H(i)); \
3776 out |= nn OP mm; \
3777 } while (i & 63); \
3778 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3779 out &= pg; \
3780 *(uint64_t *)(vd + (i >> 3)) = out; \
3781 flags = iter_predtest_bwd(out, pg, flags); \
3782 } while (i > 0); \
3783 return flags; \
3784 }
3785
3786 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3787 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3788 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3789 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3790 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3791 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3792 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3793 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3794
3795 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3796 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3797 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3798 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3799
3800 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3801 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3802 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3803 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3804
3805 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3806 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3807 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3808 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3809
3810 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3811 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3812 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3813 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3814
3815 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3816 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3817 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3818 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3819
3820 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3821 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3822 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3823 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3824
3825 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3826 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3827 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3828 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3829
3830 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3831 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3832 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3833 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3834
3835 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3836 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3837 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3838 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3839
3840 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3841 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3842 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3843 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3844
3845 #undef DO_CMP_PPZI_B
3846 #undef DO_CMP_PPZI_H
3847 #undef DO_CMP_PPZI_S
3848 #undef DO_CMP_PPZI_D
3849 #undef DO_CMP_PPZI
3850
3851 /* Similar to the ARM LastActive pseudocode function. */
3852 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3853 {
3854 intptr_t i;
3855
3856 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3857 uint64_t pg = *(uint64_t *)(vg + i);
3858 if (pg) {
3859 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3860 }
3861 }
3862 return 0;
3863 }
3864
3865 /* Compute a mask into RETB that is true for all G, up to and including
3866 * (if after) or excluding (if !after) the first G & N.
3867 * Return true if BRK found.
3868 */
3869 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3870 bool brk, bool after)
3871 {
3872 uint64_t b;
3873
3874 if (brk) {
3875 b = 0;
3876 } else if ((g & n) == 0) {
3877 /* For all G, no N are set; break not found. */
3878 b = g;
3879 } else {
3880 /* Break somewhere in N. Locate it. */
3881 b = g & n; /* guard true, pred true */
3882 b = b & -b; /* first such */
3883 if (after) {
3884 b = b | (b - 1); /* break after same */
3885 } else {
3886 b = b - 1; /* break before same */
3887 }
3888 brk = true;
3889 }
3890
3891 *retb = b;
3892 return brk;
3893 }
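
/*
 * Worked example (illustrative only): with all guard bits set and the first
 * active N bit at position 4, the "after" form includes that element and
 * the "before" form stops just short of it.
 */
static void G_GNUC_UNUSED compute_brk_example(void)
{
    uint64_t b;

    g_assert(compute_brk(&b, 0x10, 0xff, false, true) && b == 0x1f);
    g_assert(compute_brk(&b, 0x10, 0xff, false, false) && b == 0x0f);
}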
3894
3895 /* Compute a zeroing BRK. */
3896 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3897 intptr_t oprsz, bool after)
3898 {
3899 bool brk = false;
3900 intptr_t i;
3901
3902 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3903 uint64_t this_b, this_g = g[i];
3904
3905 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3906 d[i] = this_b & this_g;
3907 }
3908 }
3909
3910 /* Likewise, but also compute flags. */
3911 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3912 intptr_t oprsz, bool after)
3913 {
3914 uint32_t flags = PREDTEST_INIT;
3915 bool brk = false;
3916 intptr_t i;
3917
3918 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3919 uint64_t this_b, this_d, this_g = g[i];
3920
3921 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3922 d[i] = this_d = this_b & this_g;
3923 flags = iter_predtest_fwd(this_d, this_g, flags);
3924 }
3925 return flags;
3926 }
3927
3928 /* Compute a merging BRK. */
3929 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3930 intptr_t oprsz, bool after)
3931 {
3932 bool brk = false;
3933 intptr_t i;
3934
3935 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3936 uint64_t this_b, this_g = g[i];
3937
3938 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3939 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3940 }
3941 }
3942
3943 /* Likewise, but also compute flags. */
3944 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3945 intptr_t oprsz, bool after)
3946 {
3947 uint32_t flags = PREDTEST_INIT;
3948 bool brk = false;
3949 intptr_t i;
3950
3951 for (i = 0; i < oprsz / 8; ++i) {
3952 uint64_t this_b, this_d = d[i], this_g = g[i];
3953
3954 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3955 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3956 flags = iter_predtest_fwd(this_d, this_g, flags);
3957 }
3958 return flags;
3959 }
3960
3961 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3962 {
3963 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3964 * The compiler should turn this into 4 64-bit integer stores.
3965 */
3966 memset(d, 0, sizeof(ARMPredicateReg));
3967 return PREDTEST_INIT;
3968 }
3969
3970 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3971 uint32_t pred_desc)
3972 {
3973 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3974 if (last_active_pred(vn, vg, oprsz)) {
3975 compute_brk_z(vd, vm, vg, oprsz, true);
3976 } else {
3977 do_zero(vd, oprsz);
3978 }
3979 }
3980
3981 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3982 uint32_t pred_desc)
3983 {
3984 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3985 if (last_active_pred(vn, vg, oprsz)) {
3986 return compute_brks_z(vd, vm, vg, oprsz, true);
3987 } else {
3988 return do_zero(vd, oprsz);
3989 }
3990 }
3991
3992 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3993 uint32_t pred_desc)
3994 {
3995 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3996 if (last_active_pred(vn, vg, oprsz)) {
3997 compute_brk_z(vd, vm, vg, oprsz, false);
3998 } else {
3999 do_zero(vd, oprsz);
4000 }
4001 }
4002
4003 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4004 uint32_t pred_desc)
4005 {
4006 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4007 if (last_active_pred(vn, vg, oprsz)) {
4008 return compute_brks_z(vd, vm, vg, oprsz, false);
4009 } else {
4010 return do_zero(vd, oprsz);
4011 }
4012 }
4013
4014 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4015 {
4016 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4017 compute_brk_z(vd, vn, vg, oprsz, true);
4018 }
4019
4020 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4021 {
4022 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4023 return compute_brks_z(vd, vn, vg, oprsz, true);
4024 }
4025
4026 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4027 {
4028 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4029 compute_brk_z(vd, vn, vg, oprsz, false);
4030 }
4031
4032 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4033 {
4034 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4035 return compute_brks_z(vd, vn, vg, oprsz, false);
4036 }
4037
4038 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4039 {
4040 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4041 compute_brk_m(vd, vn, vg, oprsz, true);
4042 }
4043
4044 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4045 {
4046 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4047 return compute_brks_m(vd, vn, vg, oprsz, true);
4048 }
4049
4050 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4051 {
4052 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4053 compute_brk_m(vd, vn, vg, oprsz, false);
4054 }
4055
4056 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4057 {
4058 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4059 return compute_brks_m(vd, vn, vg, oprsz, false);
4060 }
4061
4062 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4063 {
4064 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4065 if (!last_active_pred(vn, vg, oprsz)) {
4066 do_zero(vd, oprsz);
4067 }
4068 }
4069
4070 /* As if PredTest(Ones(PL), D, esz). */
4071 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4072 uint64_t esz_mask)
4073 {
4074 uint32_t flags = PREDTEST_INIT;
4075 intptr_t i;
4076
4077 for (i = 0; i < oprsz / 8; i++) {
4078 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4079 }
4080 if (oprsz & 7) {
4081 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4082 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4083 }
4084 return flags;
4085 }
4086
4087 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4088 {
4089 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4090 if (last_active_pred(vn, vg, oprsz)) {
4091 return predtest_ones(vd, oprsz, -1);
4092 } else {
4093 return do_zero(vd, oprsz);
4094 }
4095 }
4096
4097 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4098 {
4099 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4100 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4101 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4102 intptr_t i;
4103
4104 for (i = 0; i < words; ++i) {
4105 uint64_t t = n[i] & g[i] & mask;
4106 sum += ctpop64(t);
4107 }
4108 return sum;
4109 }
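
/*
 * Usage sketch (illustrative only; the descriptor is built with the same
 * PREDDESC fields the helper decodes): an 8-byte predicate at esz 0 with
 * four active, governed bits counts to 4.
 */
static void G_GNUC_UNUSED cntp_example(void)
{
    uint64_t n = 0x0f, g = 0xff;
    uint32_t desc = FIELD_DP32(0, PREDDESC, OPRSZ, 8);

    g_assert(helper_sve_cntp(&n, &g, desc) == 4);
}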
4110
4111 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4112 {
4113 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4114 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4115 uint64_t esz_mask = pred_esz_masks[esz];
4116 ARMPredicateReg *d = vd;
4117 uint32_t flags;
4118 intptr_t i;
4119
4120 /* Begin with a zero predicate register. */
4121 flags = do_zero(d, oprsz);
4122 if (count == 0) {
4123 return flags;
4124 }
4125
4126 /* Set all of the requested bits. */
4127 for (i = 0; i < count / 64; ++i) {
4128 d->p[i] = esz_mask;
4129 }
4130 if (count & 63) {
4131 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4132 }
4133
4134 return predtest_ones(d, oprsz, esz_mask);
4135 }
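
/*
 * Usage sketch (illustrative only): for a 16-byte predicate at esz 2
 * (32-bit elements), a count of 16 predicate bits leaves one bit set per
 * element in the low 16 bits, i.e. 0x1111.
 */
static void G_GNUC_UNUSED whilel_example(void)
{
    ARMPredicateReg p;
    uint32_t desc = FIELD_DP32(FIELD_DP32(0, PREDDESC, OPRSZ, 16),
                               PREDDESC, ESZ, 2);

    helper_sve_whilel(&p, 16, desc);
    g_assert(p.p[0] == 0x1111);
}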
4136
4137 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4138 {
4139 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4140 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4141 uint64_t esz_mask = pred_esz_masks[esz];
4142 ARMPredicateReg *d = vd;
4143 intptr_t i, invcount, oprbits;
4144 uint64_t bits;
4145
4146 if (count == 0) {
4147 return do_zero(d, oprsz);
4148 }
4149
4150 oprbits = oprsz * 8;
4151 tcg_debug_assert(count <= oprbits);
4152
4153 bits = esz_mask;
4154 if (oprbits & 63) {
4155 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4156 }
4157
4158 invcount = oprbits - count;
4159 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4160 d->p[i] = bits;
4161 bits = esz_mask;
4162 }
4163
4164 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4165
4166 while (--i >= 0) {
4167 d->p[i] = 0;
4168 }
4169
4170 return predtest_ones(d, oprsz, esz_mask);
4171 }
4172
4173 /* Recursive reduction on a function;
4174 * C.f. the ARM ARM function ReducePredicated.
4175 *
4176 * While it would be possible to write this without the DATA temporary,
4177 * it is much simpler to process the predicate register this way.
4178 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4179 * little to gain with a more complex non-recursive form.
4180 */
4181 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
4182 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4183 { \
4184 if (n == 1) { \
4185 return *data; \
4186 } else { \
4187 uintptr_t half = n / 2; \
4188 TYPE lo = NAME##_reduce(data, status, half); \
4189 TYPE hi = NAME##_reduce(data + half, status, half); \
4190 return TYPE##_##FUNC(lo, hi, status); \
4191 } \
4192 } \
4193 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
4194 { \
4195 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
4196 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
4197 for (i = 0; i < oprsz; ) { \
4198 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4199 do { \
4200 TYPE nn = *(TYPE *)(vn + H(i)); \
4201 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
4202 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4203 } while (i & 15); \
4204 } \
4205 for (; i < maxsz; i += sizeof(TYPE)) { \
4206 *(TYPE *)((void *)data + i) = IDENT; \
4207 } \
4208 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
4209 }
4210
4211 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4212 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4213 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4214
4215 /* Identity is floatN_default_nan, without the function call. */
4216 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4217 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4218 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4219
4220 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4221 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4222 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4223
4224 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4225 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4226 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4227
4228 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4229 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4230 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
4231
4232 #undef DO_REDUCE
4233
4234 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4235 void *status, uint32_t desc)
4236 {
4237 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4238 float16 result = nn;
4239
4240 do {
4241 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4242 do {
4243 if (pg & 1) {
4244 float16 mm = *(float16 *)(vm + H1_2(i));
4245 result = float16_add(result, mm, status);
4246 }
4247 i += sizeof(float16), pg >>= sizeof(float16);
4248 } while (i & 15);
4249 } while (i < opr_sz);
4250
4251 return result;
4252 }
4253
4254 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4255 void *status, uint32_t desc)
4256 {
4257 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4258 float32 result = nn;
4259
4260 do {
4261 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4262 do {
4263 if (pg & 1) {
4264 float32 mm = *(float32 *)(vm + H1_2(i));
4265 result = float32_add(result, mm, status);
4266 }
4267 i += sizeof(float32), pg >>= sizeof(float32);
4268 } while (i & 15);
4269 } while (i < opr_sz);
4270
4271 return result;
4272 }
4273
4274 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4275 void *status, uint32_t desc)
4276 {
4277 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4278 uint64_t *m = vm;
4279 uint8_t *pg = vg;
4280
4281 for (i = 0; i < opr_sz; i++) {
4282 if (pg[H1(i)] & 1) {
4283 nn = float64_add(nn, m[i], status);
4284 }
4285 }
4286
4287 return nn;
4288 }
4289
4290 /* Fully general three-operand expander, controlled by a predicate,
4291 * with the extra float_status parameter.
4292 */
4293 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4294 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4295 void *status, uint32_t desc) \
4296 { \
4297 intptr_t i = simd_oprsz(desc); \
4298 uint64_t *g = vg; \
4299 do { \
4300 uint64_t pg = g[(i - 1) >> 6]; \
4301 do { \
4302 i -= sizeof(TYPE); \
4303 if (likely((pg >> (i & 63)) & 1)) { \
4304 TYPE nn = *(TYPE *)(vn + H(i)); \
4305 TYPE mm = *(TYPE *)(vm + H(i)); \
4306 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4307 } \
4308 } while (i & 63); \
4309 } while (i != 0); \
4310 }
4311
4312 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4313 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4314 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4315
4316 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4317 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4318 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4319
4320 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4321 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4322 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4323
4324 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4325 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4326 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4327
4328 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4329 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4330 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4331
4332 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4333 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4334 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4335
4336 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4337 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4338 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4339
4340 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4341 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4342 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4343
4344 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4345 {
4346 return float16_abs(float16_sub(a, b, s));
4347 }
4348
4349 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4350 {
4351 return float32_abs(float32_sub(a, b, s));
4352 }
4353
4354 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4355 {
4356 return float64_abs(float64_sub(a, b, s));
4357 }
4358
4359 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4360 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4361 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4362
4363 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4364 {
4365 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4366 return float64_scalbn(a, b_int, s);
4367 }
4368
4369 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4370 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4371 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4372
4373 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4374 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4375 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4376
4377 #undef DO_ZPZZ_FP
4378
4379 /* Three-operand expander, with one scalar operand, controlled by
4380 * a predicate, with the extra float_status parameter.
4381 */
4382 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4383 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4384 void *status, uint32_t desc) \
4385 { \
4386 intptr_t i = simd_oprsz(desc); \
4387 uint64_t *g = vg; \
4388 TYPE mm = scalar; \
4389 do { \
4390 uint64_t pg = g[(i - 1) >> 6]; \
4391 do { \
4392 i -= sizeof(TYPE); \
4393 if (likely((pg >> (i & 63)) & 1)) { \
4394 TYPE nn = *(TYPE *)(vn + H(i)); \
4395 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4396 } \
4397 } while (i & 63); \
4398 } while (i != 0); \
4399 }
4400
4401 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4402 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4403 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4404
4405 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4406 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4407 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4408
4409 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4410 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4411 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4412
4413 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4414 {
4415 return float16_sub(b, a, s);
4416 }
4417
4418 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4419 {
4420 return float32_sub(b, a, s);
4421 }
4422
4423 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4424 {
4425 return float64_sub(b, a, s);
4426 }
4427
4428 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4429 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4430 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4431
4432 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4433 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4434 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4435
4436 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4437 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4438 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4439
4440 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4441 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4442 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4443
4444 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4445 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4446 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4447
4448 /* Fully general two-operand expander, controlled by a predicate,
4449 * with the extra float_status parameter.
4450 */
4451 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4452 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4453 { \
4454 intptr_t i = simd_oprsz(desc); \
4455 uint64_t *g = vg; \
4456 do { \
4457 uint64_t pg = g[(i - 1) >> 6]; \
4458 do { \
4459 i -= sizeof(TYPE); \
4460 if (likely((pg >> (i & 63)) & 1)) { \
4461 TYPE nn = *(TYPE *)(vn + H(i)); \
4462 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4463 } \
4464 } while (i & 63); \
4465 } while (i != 0); \
4466 }
4467
4468 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4469 * FZ16. When converting from fp16, this affects flushing input denormals;
4470 * when converting to fp16, this affects flushing output denormals.
4471 */
4472 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4473 {
4474 bool save = get_flush_inputs_to_zero(fpst);
4475 float32 ret;
4476
4477 set_flush_inputs_to_zero(false, fpst);
4478 ret = float16_to_float32(f, true, fpst);
4479 set_flush_inputs_to_zero(save, fpst);
4480 return ret;
4481 }
4482
4483 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4484 {
4485 bool save = get_flush_inputs_to_zero(fpst);
4486 float64 ret;
4487
4488 set_flush_inputs_to_zero(false, fpst);
4489 ret = float16_to_float64(f, true, fpst);
4490 set_flush_inputs_to_zero(save, fpst);
4491 return ret;
4492 }
4493
4494 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4495 {
4496 bool save = get_flush_to_zero(fpst);
4497 float16 ret;
4498
4499 set_flush_to_zero(false, fpst);
4500 ret = float32_to_float16(f, true, fpst);
4501 set_flush_to_zero(save, fpst);
4502 return ret;
4503 }
4504
4505 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4506 {
4507 bool save = get_flush_to_zero(fpst);
4508 float16 ret;
4509
4510 set_flush_to_zero(false, fpst);
4511 ret = float64_to_float16(f, true, fpst);
4512 set_flush_to_zero(save, fpst);
4513 return ret;
4514 }
4515
4516 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4517 {
4518 if (float16_is_any_nan(f)) {
4519 float_raise(float_flag_invalid, s);
4520 return 0;
4521 }
4522 return float16_to_int16_round_to_zero(f, s);
4523 }
4524
4525 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4526 {
4527 if (float16_is_any_nan(f)) {
4528 float_raise(float_flag_invalid, s);
4529 return 0;
4530 }
4531 return float16_to_int64_round_to_zero(f, s);
4532 }
4533
4534 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4535 {
4536 if (float32_is_any_nan(f)) {
4537 float_raise(float_flag_invalid, s);
4538 return 0;
4539 }
4540 return float32_to_int64_round_to_zero(f, s);
4541 }
4542
4543 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4544 {
4545 if (float64_is_any_nan(f)) {
4546 float_raise(float_flag_invalid, s);
4547 return 0;
4548 }
4549 return float64_to_int64_round_to_zero(f, s);
4550 }
4551
4552 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4553 {
4554 if (float16_is_any_nan(f)) {
4555 float_raise(float_flag_invalid, s);
4556 return 0;
4557 }
4558 return float16_to_uint16_round_to_zero(f, s);
4559 }
4560
4561 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4562 {
4563 if (float16_is_any_nan(f)) {
4564 float_raise(float_flag_invalid, s);
4565 return 0;
4566 }
4567 return float16_to_uint64_round_to_zero(f, s);
4568 }
4569
4570 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4571 {
4572 if (float32_is_any_nan(f)) {
4573 float_raise(float_flag_invalid, s);
4574 return 0;
4575 }
4576 return float32_to_uint64_round_to_zero(f, s);
4577 }
4578
4579 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4580 {
4581 if (float64_is_any_nan(f)) {
4582 float_raise(float_flag_invalid, s);
4583 return 0;
4584 }
4585 return float64_to_uint64_round_to_zero(f, s);
4586 }
4587
4588 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4589 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4590 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
4591 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4592 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4593 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4594 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4595
4596 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4597 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4598 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4599 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4600 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4601 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4602 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4603
4604 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4605 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4606 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4607 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4608 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4609 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4610 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4611
4612 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4613 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4614 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4615
4616 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4617 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4618 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4619
4620 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4621 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4622 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4623
4624 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4625 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4626 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4627
4628 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4629 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4630 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4631 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4632 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4633 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4634 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4635
4636 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4637 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4638 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4639 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4640 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4641 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4642 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4643
4644 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4645 {
4646 /* Extract frac to the top of the uint32_t. */
4647 uint32_t frac = (uint32_t)a << (16 + 6);
4648 int16_t exp = extract32(a, 10, 5);
4649
4650 if (unlikely(exp == 0)) {
4651 if (frac != 0) {
4652 if (!get_flush_inputs_to_zero(s)) {
4653 /* denormal: bias - fractional_zeros */
4654 return -15 - clz32(frac);
4655 }
4656 /* flush to zero */
4657 float_raise(float_flag_input_denormal, s);
4658 }
4659 } else if (unlikely(exp == 0x1f)) {
4660 if (frac == 0) {
4661 return INT16_MAX; /* infinity */
4662 }
4663 } else {
4664 /* normal: exp - bias */
4665 return exp - 15;
4666 }
4667 /* nan or zero */
4668 float_raise(float_flag_invalid, s);
4669 return INT16_MIN;
4670 }
4671
4672 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4673 {
4674 /* Extract frac to the top of the uint32_t. */
4675 uint32_t frac = a << 9;
4676 int32_t exp = extract32(a, 23, 8);
4677
4678 if (unlikely(exp == 0)) {
4679 if (frac != 0) {
4680 if (!get_flush_inputs_to_zero(s)) {
4681 /* denormal: bias - fractional_zeros */
4682 return -127 - clz32(frac);
4683 }
4684 /* flush to zero */
4685 float_raise(float_flag_input_denormal, s);
4686 }
4687 } else if (unlikely(exp == 0xff)) {
4688 if (frac == 0) {
4689 return INT32_MAX; /* infinity */
4690 }
4691 } else {
4692 /* normal: exp - bias */
4693 return exp - 127;
4694 }
4695 /* nan or zero */
4696 float_raise(float_flag_invalid, s);
4697 return INT32_MIN;
4698 }
4699
4700 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4701 {
4702 /* Extract frac to the top of the uint64_t. */
4703 uint64_t frac = a << 12;
4704 int64_t exp = extract64(a, 52, 11);
4705
4706 if (unlikely(exp == 0)) {
4707 if (frac != 0) {
4708 if (!get_flush_inputs_to_zero(s)) {
4709 /* denormal: bias - fractional_zeros */
4710 return -1023 - clz64(frac);
4711 }
4712 /* flush to zero */
4713 float_raise(float_flag_input_denormal, s);
4714 }
4715 } else if (unlikely(exp == 0x7ff)) {
4716 if (frac == 0) {
4717 return INT64_MAX; /* infinity */
4718 }
4719 } else {
4720 /* normal: exp - bias */
4721 return exp - 1023;
4722 }
4723 /* nan or zero */
4724 float_raise(float_flag_invalid, s);
4725 return INT64_MIN;
4726 }
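
/*
 * Worked example (illustrative only): FLOGB of a normal number is just the
 * unbiased exponent, while zero raises Invalid and saturates.
 */
static void G_GNUC_UNUSED flogb_example(void)
{
    float_status s = { 0 };

    /* 8.0f == 0x41000000 has biased exponent 130, so logb == 3. */
    g_assert(do_float32_logb_as_int(0x41000000, &s) == 3);
    /* +0.0 falls through to the "nan or zero" case. */
    g_assert(do_float32_logb_as_int(0x00000000, &s) == INT32_MIN);
}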
4727
4728 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4729 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4730 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4731
4732 #undef DO_ZPZ_FP
4733
4734 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4735 float_status *status, uint32_t desc,
4736 uint16_t neg1, uint16_t neg3)
4737 {
4738 intptr_t i = simd_oprsz(desc);
4739 uint64_t *g = vg;
4740
4741 do {
4742 uint64_t pg = g[(i - 1) >> 6];
4743 do {
4744 i -= 2;
4745 if (likely((pg >> (i & 63)) & 1)) {
4746 float16 e1, e2, e3, r;
4747
4748 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4749 e2 = *(uint16_t *)(vm + H1_2(i));
4750 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4751 r = float16_muladd(e1, e2, e3, 0, status);
4752 *(uint16_t *)(vd + H1_2(i)) = r;
4753 }
4754 } while (i & 63);
4755 } while (i != 0);
4756 }
4757
4758 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4759 void *vg, void *status, uint32_t desc)
4760 {
4761 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4762 }
4763
4764 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4765 void *vg, void *status, uint32_t desc)
4766 {
4767 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4768 }
4769
4770 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4771 void *vg, void *status, uint32_t desc)
4772 {
4773 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4774 }
4775
4776 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4777 void *vg, void *status, uint32_t desc)
4778 {
4779 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4780 }
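
/*
 * Worked example (illustrative only): the NEG1/NEG3 arguments above simply
 * flip the fp16 sign bit of the first and third operands before the fused
 * multiply-add, e.g. FMLS passes neg1 == 0x8000 to compute -n * m + a.
 */
static void G_GNUC_UNUSED fmla_sign_mask_example(void)
{
    float_status s = { 0 };
    float16 two = 0x4000, three = 0x4200, one = 0x3c00;

    /* (-2.0 * 3.0) + 1.0 == -5.0 == 0xc500 in fp16. */
    g_assert(float16_muladd(two ^ 0x8000, three, one, 0, &s) == 0xc500);
}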
4781
4782 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4783 float_status *status, uint32_t desc,
4784 uint32_t neg1, uint32_t neg3)
4785 {
4786 intptr_t i = simd_oprsz(desc);
4787 uint64_t *g = vg;
4788
4789 do {
4790 uint64_t pg = g[(i - 1) >> 6];
4791 do {
4792 i -= 4;
4793 if (likely((pg >> (i & 63)) & 1)) {
4794 float32 e1, e2, e3, r;
4795
4796 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4797 e2 = *(uint32_t *)(vm + H1_4(i));
4798 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4799 r = float32_muladd(e1, e2, e3, 0, status);
4800 *(uint32_t *)(vd + H1_4(i)) = r;
4801 }
4802 } while (i & 63);
4803 } while (i != 0);
4804 }
4805
4806 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4807 void *vg, void *status, uint32_t desc)
4808 {
4809 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4810 }
4811
4812 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4813 void *vg, void *status, uint32_t desc)
4814 {
4815 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4816 }
4817
4818 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4819 void *vg, void *status, uint32_t desc)
4820 {
4821 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4822 }
4823
4824 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4825 void *vg, void *status, uint32_t desc)
4826 {
4827 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4828 }
4829
4830 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4831 float_status *status, uint32_t desc,
4832 uint64_t neg1, uint64_t neg3)
4833 {
4834 intptr_t i = simd_oprsz(desc);
4835 uint64_t *g = vg;
4836
4837 do {
4838 uint64_t pg = g[(i - 1) >> 6];
4839 do {
4840 i -= 8;
4841 if (likely((pg >> (i & 63)) & 1)) {
4842 float64 e1, e2, e3, r;
4843
4844 e1 = *(uint64_t *)(vn + i) ^ neg1;
4845 e2 = *(uint64_t *)(vm + i);
4846 e3 = *(uint64_t *)(va + i) ^ neg3;
4847 r = float64_muladd(e1, e2, e3, 0, status);
4848 *(uint64_t *)(vd + i) = r;
4849 }
4850 } while (i & 63);
4851 } while (i != 0);
4852 }
4853
4854 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4855 void *vg, void *status, uint32_t desc)
4856 {
4857 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4858 }
4859
4860 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4861 void *vg, void *status, uint32_t desc)
4862 {
4863 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4864 }
4865
4866 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4867 void *vg, void *status, uint32_t desc)
4868 {
4869 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4870 }
4871
4872 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4873 void *vg, void *status, uint32_t desc)
4874 {
4875 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4876 }
4877
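/*
 * The helpers above all wrap the same do_fmla_zpzzz_* routine and differ
 * only in which sign bits are flipped before the fused multiply-add:
 * FMLA passes the operands through, FMLS flips the sign of the
 * multiplicand (neg1), FNMLA flips both the multiplicand and the addend,
 * and FNMLS flips only the addend (neg3).  XOR-ing the IEEE sign bit
 * implements the architectural FPNeg of the input operands.
 */
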
4878 /* Two operand floating-point comparison controlled by a predicate.
4879 * Unlike the integer version, we are not allowed to optimistically
4880 * compare operands, since the comparison may have side effects wrt
4881 * the FPSR.
4882 */
4883 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4884 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4885 void *status, uint32_t desc) \
4886 { \
4887 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4888 uint64_t *d = vd, *g = vg; \
4889 do { \
4890 uint64_t out = 0, pg = g[j]; \
4891 do { \
4892 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4893 if (likely((pg >> (i & 63)) & 1)) { \
4894 TYPE nn = *(TYPE *)(vn + H(i)); \
4895 TYPE mm = *(TYPE *)(vm + H(i)); \
4896 out |= OP(TYPE, nn, mm, status); \
4897 } \
4898 } while (i & 63); \
4899 d[j--] = out; \
4900 } while (i > 0); \
4901 }
4902
4903 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4904 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4905 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4906 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4907 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4908 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4909
4910 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4911 DO_FPCMP_PPZZ_H(NAME, OP) \
4912 DO_FPCMP_PPZZ_S(NAME, OP) \
4913 DO_FPCMP_PPZZ_D(NAME, OP)
4914
4915 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4916 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4917 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4918 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
4919 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4920 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4921 #define DO_FCMUO(TYPE, X, Y, ST) \
4922 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4923 #define DO_FACGE(TYPE, X, Y, ST) \
4924 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4925 #define DO_FACGT(TYPE, X, Y, ST) \
4926 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4927
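/*
 * As an illustration, DO_FPCMP_PPZZ_S(sve_fcmge, DO_FCMGE) expands to a
 * helper that walks float32 elements from the top of the vector down:
 * each iteration shifts the 64-bit accumulator left by sizeof(float32)
 * and ORs in a 0/1 result, so a true comparison sets only the least
 * significant of the four predicate bits belonging to that element,
 * which is the canonical form for SVE predicate results.
 */
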
4928 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4929 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4930 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4931 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4932 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4933 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4934 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4935
4936 #undef DO_FPCMP_PPZZ_ALL
4937 #undef DO_FPCMP_PPZZ_D
4938 #undef DO_FPCMP_PPZZ_S
4939 #undef DO_FPCMP_PPZZ_H
4940 #undef DO_FPCMP_PPZZ
4941
4942 /* One operand floating-point comparison against zero, controlled
4943 * by a predicate.
4944 */
4945 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4946 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4947 void *status, uint32_t desc) \
4948 { \
4949 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4950 uint64_t *d = vd, *g = vg; \
4951 do { \
4952 uint64_t out = 0, pg = g[j]; \
4953 do { \
4954 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4955 if ((pg >> (i & 63)) & 1) { \
4956 TYPE nn = *(TYPE *)(vn + H(i)); \
4957 out |= OP(TYPE, nn, 0, status); \
4958 } \
4959 } while (i & 63); \
4960 d[j--] = out; \
4961 } while (i > 0); \
4962 }
4963
4964 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4965 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4966 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4967 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4968 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4969 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4970
4971 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4972 DO_FPCMP_PPZ0_H(NAME, OP) \
4973 DO_FPCMP_PPZ0_S(NAME, OP) \
4974 DO_FPCMP_PPZ0_D(NAME, OP)
4975
4976 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4977 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4978 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4979 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4980 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4981 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4982
4983 /* FP Trig Multiply-Add. */
4984
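/*
 * FTMAD computes d[i] = n[i] * |m[i]| + coeff[imm + (m[i] < 0 ? 8 : 0)].
 * In each table below, entries 0-7 hold the (minimax-adjusted) series
 * coefficients used when building sin(x) and entries 8-15 those for
 * cos(x); unused entries are zero.  The immediate comes from
 * simd_data(desc) and is offset by 8 when the multiplicand is negative,
 * exactly as the loops implement.
 */
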
4985 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4986 {
4987 static const float16 coeff[16] = {
4988 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4989 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4990 };
4991 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4992 intptr_t x = simd_data(desc);
4993 float16 *d = vd, *n = vn, *m = vm;
4994 for (i = 0; i < opr_sz; i++) {
4995 float16 mm = m[i];
4996 intptr_t xx = x;
4997 if (float16_is_neg(mm)) {
4998 mm = float16_abs(mm);
4999 xx += 8;
5000 }
5001 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5002 }
5003 }
5004
5005 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5006 {
5007 static const float32 coeff[16] = {
5008 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5009 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5010 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5011 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5012 };
5013 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5014 intptr_t x = simd_data(desc);
5015 float32 *d = vd, *n = vn, *m = vm;
5016 for (i = 0; i < opr_sz; i++) {
5017 float32 mm = m[i];
5018 intptr_t xx = x;
5019 if (float32_is_neg(mm)) {
5020 mm = float32_abs(mm);
5021 xx += 8;
5022 }
5023 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5024 }
5025 }
5026
5027 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5028 {
5029 static const float64 coeff[16] = {
5030 0x3ff0000000000000ull, 0xbfc5555555555543ull,
5031 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5032 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5033 0x3de5d8408868552full, 0x0000000000000000ull,
5034 0x3ff0000000000000ull, 0xbfe0000000000000ull,
5035 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5036 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5037 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5038 };
5039 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5040 intptr_t x = simd_data(desc);
5041 float64 *d = vd, *n = vn, *m = vm;
5042 for (i = 0; i < opr_sz; i++) {
5043 float64 mm = m[i];
5044 intptr_t xx = x;
5045 if (float64_is_neg(mm)) {
5046 mm = float64_abs(mm);
5047 xx += 8;
5048 }
5049 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5050 }
5051 }
5052
5053 /*
5054 * FP Complex Add
5055 */
5056
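/*
 * FCADD adds the second source rotated by 90 or 270 degrees in the
 * complex plane.  With simd_data(desc) == 0 (the 90-degree form):
 *     d_real = n_real - m_imag,   d_imag = n_imag + m_real
 * and with simd_data(desc) == 1 (the 270-degree form):
 *     d_real = n_real + m_imag,   d_imag = n_imag - m_real
 * The subtraction is folded in by XOR-ing the sign bit (neg_real or
 * neg_imag) into the operand before an ordinary add.  H1_2 and H1_4 are
 * the host-endian byte-offset fixups for 2- and 4-byte accesses; 8-byte
 * accesses need none.
 */
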
5057 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5058 void *vs, uint32_t desc)
5059 {
5060 intptr_t j, i = simd_oprsz(desc);
5061 uint64_t *g = vg;
5062 float16 neg_imag = float16_set_sign(0, simd_data(desc));
5063 float16 neg_real = float16_chs(neg_imag);
5064
5065 do {
5066 uint64_t pg = g[(i - 1) >> 6];
5067 do {
5068 float16 e0, e1, e2, e3;
5069
5070 /* I holds the real index; J holds the imag index. */
5071 j = i - sizeof(float16);
5072 i -= 2 * sizeof(float16);
5073
5074 e0 = *(float16 *)(vn + H1_2(i));
5075 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5076 e2 = *(float16 *)(vn + H1_2(j));
5077 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5078
5079 if (likely((pg >> (i & 63)) & 1)) {
5080 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5081 }
5082 if (likely((pg >> (j & 63)) & 1)) {
5083 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5084 }
5085 } while (i & 63);
5086 } while (i != 0);
5087 }
5088
5089 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5090 void *vs, uint32_t desc)
5091 {
5092 intptr_t j, i = simd_oprsz(desc);
5093 uint64_t *g = vg;
5094 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5095 float32 neg_real = float32_chs(neg_imag);
5096
5097 do {
5098 uint64_t pg = g[(i - 1) >> 6];
5099 do {
5100 float32 e0, e1, e2, e3;
5101
5102 /* I holds the real index; J holds the imag index. */
5103 j = i - sizeof(float32);
5104 i -= 2 * sizeof(float32);
5105
5106 e0 = *(float32 *)(vn + H1_4(i));
5107 e1 = *(float32 *)(vm + H1_4(j)) ^ neg_real;
5108 e2 = *(float32 *)(vn + H1_4(j));
5109 e3 = *(float32 *)(vm + H1_4(i)) ^ neg_imag;
5110
5111 if (likely((pg >> (i & 63)) & 1)) {
5112 *(float32 *)(vd + H1_4(i)) = float32_add(e0, e1, vs);
5113 }
5114 if (likely((pg >> (j & 63)) & 1)) {
5115 *(float32 *)(vd + H1_4(j)) = float32_add(e2, e3, vs);
5116 }
5117 } while (i & 63);
5118 } while (i != 0);
5119 }
5120
5121 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5122 void *vs, uint32_t desc)
5123 {
5124 intptr_t j, i = simd_oprsz(desc);
5125 uint64_t *g = vg;
5126 float64 neg_imag = float64_set_sign(0, simd_data(desc));
5127 float64 neg_real = float64_chs(neg_imag);
5128
5129 do {
5130 uint64_t pg = g[(i - 1) >> 6];
5131 do {
5132 float64 e0, e1, e2, e3;
5133
5134 /* I holds the real index; J holds the imag index. */
5135 j = i - sizeof(float64);
5136 i -= 2 * sizeof(float64);
5137
5138 e0 = *(float64 *)(vn + i);
5139 e1 = *(float64 *)(vm + j) ^ neg_real;
5140 e2 = *(float64 *)(vn + j);
5141 e3 = *(float64 *)(vm + i) ^ neg_imag;
5142
5143 if (likely((pg >> (i & 63)) & 1)) {
5144 *(float64 *)(vd + i) = float64_add(e0, e1, vs);
5145 }
5146 if (likely((pg >> (j & 63)) & 1)) {
5147 *(float64 *)(vd + j) = float64_add(e2, e3, vs);
5148 }
5149 } while (i & 63);
5150 } while (i != 0);
5151 }
5152
5153 /*
5154 * FP Complex Multiply
5155 */
5156
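/*
 * Each FCMLA invocation accumulates one partial complex product, selected
 * by the rot value in simd_data(desc).  Writing the sources as
 * (nr + ni*i) and (mr + mi*i):
 *     rot 0:  d_r += nr * mr,   d_i += nr * mi
 *     rot 1:  d_r -= ni * mi,   d_i += ni * mr
 *     rot 2:  d_r -= nr * mr,   d_i -= nr * mi
 *     rot 3:  d_r += ni * mi,   d_i -= ni * mr
 * flip chooses between nr and ni, and neg_real/neg_imag fold the sign
 * changes into the multiplicand from m.  Rotations 0 and 1 together
 * (or 2 and 3) therefore form a full complex multiply-accumulate.
 */
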
5157 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5158 void *vg, void *status, uint32_t desc)
5159 {
5160 intptr_t j, i = simd_oprsz(desc);
5161 unsigned rot = simd_data(desc);
5162 bool flip = rot & 1;
5163 float16 neg_imag, neg_real;
5164 uint64_t *g = vg;
5165
5166 neg_imag = float16_set_sign(0, (rot & 2) != 0);
5167 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5168
5169 do {
5170 uint64_t pg = g[(i - 1) >> 6];
5171 do {
5172 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5173
5174 /* I holds the real index; J holds the imag index. */
5175 j = i - sizeof(float16);
5176 i -= 2 * sizeof(float16);
5177
5178 nr = *(float16 *)(vn + H1_2(i));
5179 ni = *(float16 *)(vn + H1_2(j));
5180 mr = *(float16 *)(vm + H1_2(i));
5181 mi = *(float16 *)(vm + H1_2(j));
5182
5183 e2 = (flip ? ni : nr);
5184 e1 = (flip ? mi : mr) ^ neg_real;
5185 e4 = e2;
5186 e3 = (flip ? mr : mi) ^ neg_imag;
5187
5188 if (likely((pg >> (i & 63)) & 1)) {
5189 d = *(float16 *)(va + H1_2(i));
5190 d = float16_muladd(e2, e1, d, 0, status);
5191 *(float16 *)(vd + H1_2(i)) = d;
5192 }
5193 if (likely((pg >> (j & 63)) & 1)) {
5194 d = *(float16 *)(va + H1_2(j));
5195 d = float16_muladd(e4, e3, d, 0, status);
5196 *(float16 *)(vd + H1_2(j)) = d;
5197 }
5198 } while (i & 63);
5199 } while (i != 0);
5200 }
5201
5202 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5203 void *vg, void *status, uint32_t desc)
5204 {
5205 intptr_t j, i = simd_oprsz(desc);
5206 unsigned rot = simd_data(desc);
5207 bool flip = rot & 1;
5208 float32 neg_imag, neg_real;
5209 uint64_t *g = vg;
5210
5211 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5212 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5213
5214 do {
5215 uint64_t pg = g[(i - 1) >> 6];
5216 do {
5217 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5218
5219 /* I holds the real index; J holds the imag index. */
5220 j = i - sizeof(float32);
5221 i -= 2 * sizeof(float32);
5222
5223 nr = *(float32 *)(vn + H1_4(i));
5224 ni = *(float32 *)(vn + H1_4(j));
5225 mr = *(float32 *)(vm + H1_4(i));
5226 mi = *(float32 *)(vm + H1_4(j));
5227
5228 e2 = (flip ? ni : nr);
5229 e1 = (flip ? mi : mr) ^ neg_real;
5230 e4 = e2;
5231 e3 = (flip ? mr : mi) ^ neg_imag;
5232
5233 if (likely((pg >> (i & 63)) & 1)) {
5234 d = *(float32 *)(va + H1_4(i));
5235 d = float32_muladd(e2, e1, d, 0, status);
5236 *(float32 *)(vd + H1_4(i)) = d;
5237 }
5238 if (likely((pg >> (j & 63)) & 1)) {
5239 d = *(float32 *)(va + H1_4(j));
5240 d = float32_muladd(e4, e3, d, 0, status);
5241 *(float32 *)(vd + H1_4(j)) = d;
5242 }
5243 } while (i & 63);
5244 } while (i != 0);
5245 }
5246
5247 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5248 void *vg, void *status, uint32_t desc)
5249 {
5250 intptr_t j, i = simd_oprsz(desc);
5251 unsigned rot = simd_data(desc);
5252 bool flip = rot & 1;
5253 float64 neg_imag, neg_real;
5254 uint64_t *g = vg;
5255
5256 neg_imag = float64_set_sign(0, (rot & 2) != 0);
5257 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5258
5259 do {
5260 uint64_t pg = g[(i - 1) >> 6];
5261 do {
5262 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5263
5264 /* I holds the real index; J holds the imag index. */
5265 j = i - sizeof(float64);
5266 i -= 2 * sizeof(float64);
5267
5268 nr = *(float64 *)(vn + i);
5269 ni = *(float64 *)(vn + j);
5270 mr = *(float64 *)(vm + i);
5271 mi = *(float64 *)(vm + j);
5272
5273 e2 = (flip ? ni : nr);
5274 e1 = (flip ? mi : mr) ^ neg_real;
5275 e4 = e2;
5276 e3 = (flip ? mr : mi) ^ neg_imag;
5277
5278 if (likely((pg >> (i & 63)) & 1)) {
5279 d = *(float64 *)(va + i);
5280 d = float64_muladd(e2, e1, d, 0, status);
5281 *(float64 *)(vd + i) = d;
5282 }
5283 if (likely((pg >> (j & 63)) & 1)) {
5284 d = *(float64 *)(va + j);
5285 d = float64_muladd(e4, e3, d, 0, status);
5286 *(float64 *)(vd + j) = d;
5287 }
5288 } while (i & 63);
5289 } while (i != 0);
5290 }
5291
5292 /*
5293 * Load contiguous data, protected by a governing predicate.
5294 */
5295
5296 /*
5297 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5298 * beginning at @reg_off bounded by @reg_max. Return the offset of the first
5299 * active element >= @reg_off, or @reg_max if there are no active elements at all.
5300 */
5301 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5302 intptr_t reg_max, int esz)
5303 {
5304 uint64_t pg_mask = pred_esz_masks[esz];
5305 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5306
5307 /* In normal usage, the first element is active. */
5308 if (likely(pg & 1)) {
5309 return reg_off;
5310 }
5311
5312 if (pg == 0) {
5313 reg_off &= -64;
5314 do {
5315 reg_off += 64;
5316 if (unlikely(reg_off >= reg_max)) {
5317 /* The entire predicate was false. */
5318 return reg_max;
5319 }
5320 pg = vg[reg_off >> 6] & pg_mask;
5321 } while (pg == 0);
5322 }
5323 reg_off += ctz64(pg);
5324
5325 /* We should never see an out of range predicate bit set. */
5326 tcg_debug_assert(reg_off < reg_max);
5327 return reg_off;
5328 }
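
/*
 * For example, with esz == 2 (4-byte elements) pred_esz_masks[2] keeps
 * every fourth predicate bit; if reg_off == 4 and the next active
 * element sits at byte offset 24, pg is non-zero with ctz64(pg) == 20
 * and the function returns 4 + 20 == 24.
 */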
5329
5330 /*
5331 * Resolve the guest virtual address to info->host and info->flags.
5332 * If @nofault, return false if the page is invalid, otherwise
5333 * exit via page fault exception.
5334 */
5335
5336 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5337 target_ulong addr, int mem_off, MMUAccessType access_type,
5338 int mmu_idx, uintptr_t retaddr)
5339 {
5340 int flags;
5341
5342 addr += mem_off;
5343
5344 /*
5345 * User-only currently always issues with TBI. See the comment
5346 * above useronly_clean_ptr. Usually we clean this top byte away
5347 * during translation, but we can't do that for e.g. vector + imm
5348 * addressing modes.
5349 *
5350 * We currently always enable TBI for user-only, and do not provide
5351 * a way to turn it off. So clean the pointer unconditionally here,
5352 * rather than look it up here, or pass it down from above.
5353 */
5354 addr = useronly_clean_ptr(addr);
5355
5356 #ifdef CONFIG_USER_ONLY
5357 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5358 &info->host, retaddr);
5359 #else
5360 CPUTLBEntryFull *full;
5361 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5362 &info->host, &full, retaddr);
5363 #endif
5364 info->flags = flags;
5365
5366 if (flags & TLB_INVALID_MASK) {
5367 g_assert(nofault);
5368 return false;
5369 }
5370
5371 #ifdef CONFIG_USER_ONLY
5372 memset(&info->attrs, 0, sizeof(info->attrs));
5373 /* Require both ANON and MTE; see allocation_tag_mem(). */
5374 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5375 #else
5376 info->attrs = full->attrs;
5377 info->tagged = full->extra.arm.pte_attrs == 0xf0;
5378 #endif
5379
5380 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5381 info->host -= mem_off;
5382 return true;
5383 }
5384
5385 /*
5386 * Find first active element on each page, and a loose bound for the
5387 * final element on each page. Identify any single element that spans
5388 * the page boundary. Return true if there are any active elements.
5389 */
5390 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5391 intptr_t reg_max, int esz, int msize)
5392 {
5393 const int esize = 1 << esz;
5394 const uint64_t pg_mask = pred_esz_masks[esz];
5395 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5396 intptr_t mem_off_last, mem_off_split;
5397 intptr_t page_split, elt_split;
5398 intptr_t i;
5399
5400 /* Set all of the element indices to -1, and the TLB data to 0. */
5401 memset(info, -1, offsetof(SVEContLdSt, page));
5402 memset(info->page, 0, sizeof(info->page));
5403
5404 /* Gross scan over the entire predicate to find bounds. */
5405 i = 0;
5406 do {
5407 uint64_t pg = vg[i] & pg_mask;
5408 if (pg) {
5409 reg_off_last = i * 64 + 63 - clz64(pg);
5410 if (reg_off_first < 0) {
5411 reg_off_first = i * 64 + ctz64(pg);
5412 }
5413 }
5414 } while (++i * 64 < reg_max);
5415
5416 if (unlikely(reg_off_first < 0)) {
5417 /* No active elements, no pages touched. */
5418 return false;
5419 }
5420 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5421
5422 info->reg_off_first[0] = reg_off_first;
5423 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5424 mem_off_last = (reg_off_last >> esz) * msize;
5425
5426 page_split = -(addr | TARGET_PAGE_MASK);
5427 if (likely(mem_off_last + msize <= page_split)) {
5428 /* The entire operation fits within a single page. */
5429 info->reg_off_last[0] = reg_off_last;
5430 return true;
5431 }
5432
5433 info->page_split = page_split;
5434 elt_split = page_split / msize;
5435 reg_off_split = elt_split << esz;
5436 mem_off_split = elt_split * msize;
5437
5438 /*
5439 * This is the last full element on the first page, but it is not
5440 * necessarily active. If there is no full element, i.e. the first
5441 * active element is the one that's split, this value remains -1.
5442 * It is still useful as an iteration bound.
5443 */
5444 if (elt_split != 0) {
5445 info->reg_off_last[0] = reg_off_split - esize;
5446 }
5447
5448 /* Determine if an unaligned element spans the pages. */
5449 if (page_split % msize != 0) {
5450 /* It is helpful to know if the split element is active. */
5451 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5452 info->reg_off_split = reg_off_split;
5453 info->mem_off_split = mem_off_split;
5454
5455 if (reg_off_split == reg_off_last) {
5456 /* The page crossing element is last. */
5457 return true;
5458 }
5459 }
5460 reg_off_split += esize;
5461 mem_off_split += msize;
5462 }
5463
5464 /*
5465 * We do want the first active element on the second page, because
5466 * this may affect the address reported in an exception.
5467 */
5468 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5469 tcg_debug_assert(reg_off_split <= reg_off_last);
5470 info->reg_off_first[1] = reg_off_split;
5471 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5472 info->reg_off_last[1] = reg_off_last;
5473 return true;
5474 }
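
/*
 * A concrete example, assuming a 32-byte vector of 4-byte elements
 * (reg_max == 32, esz == 2, msize == 4), all elements active, and only
 * 10 bytes left on the first page (page_split == 10): element 2 straddles
 * the boundary, so reg_off_first[0] == 0, reg_off_last[0] == 4,
 * reg_off_split == 8, and the second page covers reg_off_first[1] == 12
 * through reg_off_last[1] == 28.
 */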
5475
5476 /*
5477 * Resolve the guest virtual addresses to info->page[].
5478 * Control the generation of page faults with @fault. Return false if
5479 * there is no work to do, which can only happen with @fault == FAULT_NO.
5480 */
5481 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5482 CPUARMState *env, target_ulong addr,
5483 MMUAccessType access_type, uintptr_t retaddr)
5484 {
5485 int mmu_idx = arm_env_mmu_index(env);
5486 int mem_off = info->mem_off_first[0];
5487 bool nofault = fault == FAULT_NO;
5488 bool have_work = true;
5489
5490 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5491 access_type, mmu_idx, retaddr)) {
5492 /* No work to be done. */
5493 return false;
5494 }
5495
5496 if (likely(info->page_split < 0)) {
5497 /* The entire operation was on the one page. */
5498 return true;
5499 }
5500
5501 /*
5502 * If the second page is invalid, then we want the fault address to be
5503 * the first byte on that page which is accessed.
5504 */
5505 if (info->mem_off_split >= 0) {
5506 /*
5507 * There is an element split across the pages. The fault address
5508 * should be the first byte of the second page.
5509 */
5510 mem_off = info->page_split;
5511 /*
5512 * If the split element is also the first active element
5513 * of the vector, then: For first-fault we should continue
5514 * to generate faults for the second page. For no-fault,
5515 * we have work only if the second page is valid.
5516 */
5517 if (info->mem_off_first[0] < info->mem_off_split) {
5518 nofault = FAULT_FIRST;
5519 have_work = false;
5520 }
5521 } else {
5522 /*
5523 * There is no element split across the pages. The fault address
5524 * should be the first active element on the second page.
5525 */
5526 mem_off = info->mem_off_first[1];
5527 /*
5528 * There must have been one active element on the first page,
5529 * so we're out of first-fault territory.
5530 */
5531 nofault = fault != FAULT_ALL;
5532 }
5533
5534 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5535 access_type, mmu_idx, retaddr);
5536 return have_work;
5537 }
5538
5539 #ifndef CONFIG_USER_ONLY
5540 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5541 uint64_t *vg, target_ulong addr,
5542 int esize, int msize, int wp_access,
5543 uintptr_t retaddr)
5544 {
5545 intptr_t mem_off, reg_off, reg_last;
5546 int flags0 = info->page[0].flags;
5547 int flags1 = info->page[1].flags;
5548
5549 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5550 return;
5551 }
5552
5553 /* Indicate that watchpoints are handled. */
5554 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5555 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5556
5557 if (flags0 & TLB_WATCHPOINT) {
5558 mem_off = info->mem_off_first[0];
5559 reg_off = info->reg_off_first[0];
5560 reg_last = info->reg_off_last[0];
5561
5562 while (reg_off <= reg_last) {
5563 uint64_t pg = vg[reg_off >> 6];
5564 do {
5565 if ((pg >> (reg_off & 63)) & 1) {
5566 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5567 msize, info->page[0].attrs,
5568 wp_access, retaddr);
5569 }
5570 reg_off += esize;
5571 mem_off += msize;
5572 } while (reg_off <= reg_last && (reg_off & 63));
5573 }
5574 }
5575
5576 mem_off = info->mem_off_split;
5577 if (mem_off >= 0) {
5578 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5579 info->page[0].attrs, wp_access, retaddr);
5580 }
5581
5582 mem_off = info->mem_off_first[1];
5583 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5584 reg_off = info->reg_off_first[1];
5585 reg_last = info->reg_off_last[1];
5586
5587 do {
5588 uint64_t pg = vg[reg_off >> 6];
5589 do {
5590 if ((pg >> (reg_off & 63)) & 1) {
5591 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5592 msize, info->page[1].attrs,
5593 wp_access, retaddr);
5594 }
5595 reg_off += esize;
5596 mem_off += msize;
5597 } while (reg_off & 63);
5598 } while (reg_off <= reg_last);
5599 }
5600 }
5601 #endif
5602
5603 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5604 uint64_t *vg, target_ulong addr, int esize,
5605 int msize, uint32_t mtedesc, uintptr_t ra)
5606 {
5607 intptr_t mem_off, reg_off, reg_last;
5608
5609 /* Process the page only if MemAttr == Tagged. */
5610 if (info->page[0].tagged) {
5611 mem_off = info->mem_off_first[0];
5612 reg_off = info->reg_off_first[0];
5613 reg_last = info->reg_off_split;
5614 if (reg_last < 0) {
5615 reg_last = info->reg_off_last[0];
5616 }
5617
5618 do {
5619 uint64_t pg = vg[reg_off >> 6];
5620 do {
5621 if ((pg >> (reg_off & 63)) & 1) {
5622 mte_check(env, mtedesc, addr + mem_off, ra);
5623 }
5624 reg_off += esize;
5625 mem_off += msize;
5626 } while (reg_off <= reg_last && (reg_off & 63));
5627 } while (reg_off <= reg_last);
5628 }
5629
5630 mem_off = info->mem_off_first[1];
5631 if (mem_off >= 0 && info->page[1].tagged) {
5632 reg_off = info->reg_off_first[1];
5633 reg_last = info->reg_off_last[1];
5634
5635 do {
5636 uint64_t pg = vg[reg_off >> 6];
5637 do {
5638 if ((pg >> (reg_off & 63)) & 1) {
5639 mte_check(env, mtedesc, addr + mem_off, ra);
5640 }
5641 reg_off += esize;
5642 mem_off += msize;
5643 } while (reg_off & 63);
5644 } while (reg_off <= reg_last);
5645 }
5646 }
5647
5648 /*
5649 * Common helper for all contiguous 1,2,3,4-register predicated stores.
5650 */
5651 static inline QEMU_ALWAYS_INLINE
5652 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5653 uint32_t desc, const uintptr_t retaddr,
5654 const int esz, const int msz, const int N, uint32_t mtedesc,
5655 sve_ldst1_host_fn *host_fn,
5656 sve_ldst1_tlb_fn *tlb_fn)
5657 {
5658 const unsigned rd = simd_data(desc);
5659 const intptr_t reg_max = simd_oprsz(desc);
5660 intptr_t reg_off, reg_last, mem_off;
5661 SVEContLdSt info;
5662 void *host;
5663 int flags, i;
5664
5665 /* Find the active elements. */
5666 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5667 /* The entire predicate was false; no load occurs. */
5668 for (i = 0; i < N; ++i) {
5669 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5670 }
5671 return;
5672 }
5673
5674 /* Probe the page(s). Exit with exception for any invalid page. */
5675 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5676
5677 /* Handle watchpoints for all active elements. */
5678 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5679 BP_MEM_READ, retaddr);
5680
5681 /*
5682 * Handle mte checks for all active elements.
5683 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5684 */
5685 if (mtedesc) {
5686 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5687 mtedesc, retaddr);
5688 }
5689
5690 flags = info.page[0].flags | info.page[1].flags;
5691 if (unlikely(flags != 0)) {
5692 /*
5693 * At least one page includes MMIO.
5694 * Any bus operation can fail with cpu_transaction_failed,
5695 * which for ARM will raise SyncExternal. Perform the load
5696 * into scratch memory to preserve register state until the end.
5697 */
5698 ARMVectorReg scratch[4] = { };
5699
5700 mem_off = info.mem_off_first[0];
5701 reg_off = info.reg_off_first[0];
5702 reg_last = info.reg_off_last[1];
5703 if (reg_last < 0) {
5704 reg_last = info.reg_off_split;
5705 if (reg_last < 0) {
5706 reg_last = info.reg_off_last[0];
5707 }
5708 }
5709
5710 do {
5711 uint64_t pg = vg[reg_off >> 6];
5712 do {
5713 if ((pg >> (reg_off & 63)) & 1) {
5714 for (i = 0; i < N; ++i) {
5715 tlb_fn(env, &scratch[i], reg_off,
5716 addr + mem_off + (i << msz), retaddr);
5717 }
5718 }
5719 reg_off += 1 << esz;
5720 mem_off += N << msz;
5721 } while (reg_off & 63);
5722 } while (reg_off <= reg_last);
5723
5724 for (i = 0; i < N; ++i) {
5725 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5726 }
5727 return;
5728 }
5729
5730 /* The entire operation is in RAM, on valid pages. */
5731
5732 for (i = 0; i < N; ++i) {
5733 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5734 }
5735
5736 mem_off = info.mem_off_first[0];
5737 reg_off = info.reg_off_first[0];
5738 reg_last = info.reg_off_last[0];
5739 host = info.page[0].host;
5740
5741 set_helper_retaddr(retaddr);
5742
5743 while (reg_off <= reg_last) {
5744 uint64_t pg = vg[reg_off >> 6];
5745 do {
5746 if ((pg >> (reg_off & 63)) & 1) {
5747 for (i = 0; i < N; ++i) {
5748 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5749 host + mem_off + (i << msz));
5750 }
5751 }
5752 reg_off += 1 << esz;
5753 mem_off += N << msz;
5754 } while (reg_off <= reg_last && (reg_off & 63));
5755 }
5756
5757 clear_helper_retaddr();
5758
5759 /*
5760 * Use the slow path to manage the cross-page misalignment.
5761 * But we know this is RAM and cannot trap.
5762 */
5763 mem_off = info.mem_off_split;
5764 if (unlikely(mem_off >= 0)) {
5765 reg_off = info.reg_off_split;
5766 for (i = 0; i < N; ++i) {
5767 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5768 addr + mem_off + (i << msz), retaddr);
5769 }
5770 }
5771
5772 mem_off = info.mem_off_first[1];
5773 if (unlikely(mem_off >= 0)) {
5774 reg_off = info.reg_off_first[1];
5775 reg_last = info.reg_off_last[1];
5776 host = info.page[1].host;
5777
5778 set_helper_retaddr(retaddr);
5779
5780 do {
5781 uint64_t pg = vg[reg_off >> 6];
5782 do {
5783 if ((pg >> (reg_off & 63)) & 1) {
5784 for (i = 0; i < N; ++i) {
5785 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5786 host + mem_off + (i << msz));
5787 }
5788 }
5789 reg_off += 1 << esz;
5790 mem_off += N << msz;
5791 } while (reg_off & 63);
5792 } while (reg_off <= reg_last);
5793
5794 clear_helper_retaddr();
5795 }
5796 }
5797
5798 static inline QEMU_ALWAYS_INLINE
5799 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5800 uint32_t desc, const uintptr_t ra,
5801 const int esz, const int msz, const int N,
5802 sve_ldst1_host_fn *host_fn,
5803 sve_ldst1_tlb_fn *tlb_fn)
5804 {
5805 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5806 int bit55 = extract64(addr, 55, 1);
5807
5808 /* Remove mtedesc from the normal sve descriptor. */
5809 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5810
5811 /* Perform gross MTE suppression early. */
5812 if (!tbi_check(mtedesc, bit55) ||
5813 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
5814 mtedesc = 0;
5815 }
5816
5817 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5818 }
5819
5820 #define DO_LD1_1(NAME, ESZ) \
5821 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5822 target_ulong addr, uint32_t desc) \
5823 { \
5824 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
5825 sve_##NAME##_host, sve_##NAME##_tlb); \
5826 } \
5827 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5828 target_ulong addr, uint32_t desc) \
5829 { \
5830 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5831 sve_##NAME##_host, sve_##NAME##_tlb); \
5832 }
5833
5834 #define DO_LD1_2(NAME, ESZ, MSZ) \
5835 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5836 target_ulong addr, uint32_t desc) \
5837 { \
5838 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5839 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5840 } \
5841 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
5842 target_ulong addr, uint32_t desc) \
5843 { \
5844 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5845 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5846 } \
5847 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5848 target_ulong addr, uint32_t desc) \
5849 { \
5850 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5851 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5852 } \
5853 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5854 target_ulong addr, uint32_t desc) \
5855 { \
5856 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5857 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5858 }
5859
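/*
 * The helper names encode memory size and register element size:
 * ld1bb loads bytes into byte elements, ld1bhu/ld1bhs load bytes and
 * zero- or sign-extend them into halfword elements, ld1hsu/ld1hss load
 * halfwords into word elements, and so on, matching the (ESZ, MSZ)
 * pairs passed below.
 */
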
5860 DO_LD1_1(ld1bb, MO_8)
5861 DO_LD1_1(ld1bhu, MO_16)
5862 DO_LD1_1(ld1bhs, MO_16)
5863 DO_LD1_1(ld1bsu, MO_32)
5864 DO_LD1_1(ld1bss, MO_32)
5865 DO_LD1_1(ld1bdu, MO_64)
5866 DO_LD1_1(ld1bds, MO_64)
5867
5868 DO_LD1_2(ld1hh, MO_16, MO_16)
5869 DO_LD1_2(ld1hsu, MO_32, MO_16)
5870 DO_LD1_2(ld1hss, MO_32, MO_16)
5871 DO_LD1_2(ld1hdu, MO_64, MO_16)
5872 DO_LD1_2(ld1hds, MO_64, MO_16)
5873
5874 DO_LD1_2(ld1ss, MO_32, MO_32)
5875 DO_LD1_2(ld1sdu, MO_64, MO_32)
5876 DO_LD1_2(ld1sds, MO_64, MO_32)
5877
5878 DO_LD1_2(ld1dd, MO_64, MO_64)
5879
5880 #undef DO_LD1_1
5881 #undef DO_LD1_2
5882
5883 #define DO_LDN_1(N) \
5884 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
5885 target_ulong addr, uint32_t desc) \
5886 { \
5887 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
5888 sve_ld1bb_host, sve_ld1bb_tlb); \
5889 } \
5890 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
5891 target_ulong addr, uint32_t desc) \
5892 { \
5893 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
5894 sve_ld1bb_host, sve_ld1bb_tlb); \
5895 }
5896
5897 #define DO_LDN_2(N, SUFF, ESZ) \
5898 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
5899 target_ulong addr, uint32_t desc) \
5900 { \
5901 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5902 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5903 } \
5904 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
5905 target_ulong addr, uint32_t desc) \
5906 { \
5907 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5908 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5909 } \
5910 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
5911 target_ulong addr, uint32_t desc) \
5912 { \
5913 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5914 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5915 } \
5916 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
5917 target_ulong addr, uint32_t desc) \
5918 { \
5919 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5920 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5921 }
5922
5923 DO_LDN_1(2)
5924 DO_LDN_1(3)
5925 DO_LDN_1(4)
5926
5927 DO_LDN_2(2, hh, MO_16)
5928 DO_LDN_2(3, hh, MO_16)
5929 DO_LDN_2(4, hh, MO_16)
5930
5931 DO_LDN_2(2, ss, MO_32)
5932 DO_LDN_2(3, ss, MO_32)
5933 DO_LDN_2(4, ss, MO_32)
5934
5935 DO_LDN_2(2, dd, MO_64)
5936 DO_LDN_2(3, dd, MO_64)
5937 DO_LDN_2(4, dd, MO_64)
5938
5939 #undef DO_LDN_1
5940 #undef DO_LDN_2
5941
5942 /*
5943 * Load contiguous data, first-fault and no-fault.
5944 *
5945 * For user-only, we control the race between page_check_range and
5946 * another thread's munmap by using set/clear_helper_retaddr. Any
5947 * SEGV that occurs between those markers is assumed to be because
5948 * the guest page vanished. Keep that block as small as possible
5949 * so that unrelated QEMU bugs are not blamed on the guest.
5950 */
5951
5952 /* Fault on byte I. All bits in FFR from I are cleared. The vector
5953 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5954 * option, which leaves subsequent data unchanged.
5955 */
5956 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5957 {
5958 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5959
5960 if (i & 63) {
5961 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5962 i = ROUND_UP(i, 64);
5963 }
5964 for (; i < oprsz; i += 64) {
5965 ffr[i / 64] = 0;
5966 }
5967 }
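
/*
 * For example, record_fault(env, 72, 256) keeps FFR bits 0-71 intact
 * (bits 64-71 via MAKE_64BIT_MASK(0, 8) applied to word 1) and clears
 * everything from bit 72 upward, i.e. the rest of word 1 and all of
 * words 2 and 3.
 */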
5968
5969 /*
5970 * Common helper for all contiguous no-fault and first-fault loads.
5971 */
5972 static inline QEMU_ALWAYS_INLINE
5973 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5974 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5975 const int esz, const int msz, const SVEContFault fault,
5976 sve_ldst1_host_fn *host_fn,
5977 sve_ldst1_tlb_fn *tlb_fn)
5978 {
5979 const unsigned rd = simd_data(desc);
5980 void *vd = &env->vfp.zregs[rd];
5981 const intptr_t reg_max = simd_oprsz(desc);
5982 intptr_t reg_off, mem_off, reg_last;
5983 SVEContLdSt info;
5984 int flags;
5985 void *host;
5986
5987 /* Find the active elements. */
5988 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5989 /* The entire predicate was false; no load occurs. */
5990 memset(vd, 0, reg_max);
5991 return;
5992 }
5993 reg_off = info.reg_off_first[0];
5994
5995 /* Probe the page(s). */
5996 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5997 /* Fault on first element. */
5998 tcg_debug_assert(fault == FAULT_NO);
5999 memset(vd, 0, reg_max);
6000 goto do_fault;
6001 }
6002
6003 mem_off = info.mem_off_first[0];
6004 flags = info.page[0].flags;
6005
6006 /*
6007 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6008 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6009 */
6010 if (!info.page[0].tagged) {
6011 mtedesc = 0;
6012 }
6013
6014 if (fault == FAULT_FIRST) {
6015 /* Trapping mte check for the first-fault element. */
6016 if (mtedesc) {
6017 mte_check(env, mtedesc, addr + mem_off, retaddr);
6018 }
6019
6020 /*
6021 * Special handling of the first active element,
6022 * if it crosses a page boundary or is MMIO.
6023 */
6024 bool is_split = mem_off == info.mem_off_split;
6025 if (unlikely(flags != 0) || unlikely(is_split)) {
6026 /*
6027 * Use the slow path for cross-page handling.
6028 * Might trap for MMIO or watchpoints.
6029 */
6030 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6031
6032 /* After any fault, zero the other elements. */
6033 swap_memzero(vd, reg_off);
6034 reg_off += 1 << esz;
6035 mem_off += 1 << msz;
6036 swap_memzero(vd + reg_off, reg_max - reg_off);
6037
6038 if (is_split) {
6039 goto second_page;
6040 }
6041 } else {
6042 memset(vd, 0, reg_max);
6043 }
6044 } else {
6045 memset(vd, 0, reg_max);
6046 if (unlikely(mem_off == info.mem_off_split)) {
6047 /* The first active element crosses a page boundary. */
6048 flags |= info.page[1].flags;
6049 if (unlikely(flags & TLB_MMIO)) {
6050 /* Some page is MMIO, see below. */
6051 goto do_fault;
6052 }
6053 if (unlikely(flags & TLB_WATCHPOINT) &&
6054 (cpu_watchpoint_address_matches
6055 (env_cpu(env), addr + mem_off, 1 << msz)
6056 & BP_MEM_READ)) {
6057 /* Watchpoint hit, see below. */
6058 goto do_fault;
6059 }
6060 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6061 goto do_fault;
6062 }
6063 /*
6064 * Use the slow path for cross-page handling.
6065 * This is RAM, without a watchpoint, and will not trap.
6066 */
6067 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6068 goto second_page;
6069 }
6070 }
6071
6072 /*
6073 * From this point on, all memory operations are MemSingleNF.
6074 *
6075 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6076 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6077 *
6078 * Unfortunately we do not have access to the memory attributes from the
6079 * PTE to tell Device memory from Normal memory. So we make a mostly
6080 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6081 * This gives the right answer for the common cases of "Normal memory,
6082 * backed by host RAM" and "Device memory, backed by MMIO".
6083 * The architecture allows us to suppress an NF load and return
6084 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6085 * case of "Normal memory, backed by MMIO" is permitted. The case we
6086 * get wrong is "Device memory, backed by host RAM", for which we
6087 * should return (UNKNOWN, FAULT) but do not.
6088 *
6089 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6090 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6091 * architectural breakpoints the same.
6092 */
6093 if (unlikely(flags & TLB_MMIO)) {
6094 goto do_fault;
6095 }
6096
6097 reg_last = info.reg_off_last[0];
6098 host = info.page[0].host;
6099
6100 set_helper_retaddr(retaddr);
6101
6102 do {
6103 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6104 do {
6105 if ((pg >> (reg_off & 63)) & 1) {
6106 if (unlikely(flags & TLB_WATCHPOINT) &&
6107 (cpu_watchpoint_address_matches
6108 (env_cpu(env), addr + mem_off, 1 << msz)
6109 & BP_MEM_READ)) {
6110 clear_helper_retaddr();
6111 goto do_fault;
6112 }
6113 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6114 clear_helper_retaddr();
6115 goto do_fault;
6116 }
6117 host_fn(vd, reg_off, host + mem_off);
6118 }
6119 reg_off += 1 << esz;
6120 mem_off += 1 << msz;
6121 } while (reg_off <= reg_last && (reg_off & 63));
6122 } while (reg_off <= reg_last);
6123
6124 clear_helper_retaddr();
6125
6126 /*
6127 * MemSingleNF is allowed to fail for any reason. We have special
6128 * code above to handle the first element crossing a page boundary.
6129 * As an implementation choice, decline to handle a cross-page element
6130 * in any other position.
6131 */
6132 reg_off = info.reg_off_split;
6133 if (reg_off >= 0) {
6134 goto do_fault;
6135 }
6136
6137 second_page:
6138 reg_off = info.reg_off_first[1];
6139 if (likely(reg_off < 0)) {
6140 /* No active elements on the second page. All done. */
6141 return;
6142 }
6143
6144 /*
6145 * MemSingleNF is allowed to fail for any reason. As an implementation
6146 * choice, decline to handle elements on the second page. This should
6147 * be low frequency as the guest walks through memory -- the next
6148 * iteration of the guest's loop should be aligned on the page boundary,
6149 * and then all following iterations will stay aligned.
6150 */
6151
6152 do_fault:
6153 record_fault(env, reg_off, reg_max);
6154 }
6155
6156 static inline QEMU_ALWAYS_INLINE
6157 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6158 uint32_t desc, const uintptr_t retaddr,
6159 const int esz, const int msz, const SVEContFault fault,
6160 sve_ldst1_host_fn *host_fn,
6161 sve_ldst1_tlb_fn *tlb_fn)
6162 {
6163 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6164 int bit55 = extract64(addr, 55, 1);
6165
6166 /* Remove mtedesc from the normal sve descriptor. */
6167 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6168
6169 /* Perform gross MTE suppression early. */
6170 if (!tbi_check(mtedesc, bit55) ||
6171 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6172 mtedesc = 0;
6173 }
6174
6175 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6176 esz, msz, fault, host_fn, tlb_fn);
6177 }
6178
6179 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6180 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6181 target_ulong addr, uint32_t desc) \
6182 { \
6183 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6184 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6185 } \
6186 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6187 target_ulong addr, uint32_t desc) \
6188 { \
6189 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6190 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6191 } \
6192 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6193 target_ulong addr, uint32_t desc) \
6194 { \
6195 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6196 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6197 } \
6198 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6199 target_ulong addr, uint32_t desc) \
6200 { \
6201 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6202 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6203 }
6204
6205 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6206 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6207 target_ulong addr, uint32_t desc) \
6208 { \
6209 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6210 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6211 } \
6212 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6213 target_ulong addr, uint32_t desc) \
6214 { \
6215 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6216 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6217 } \
6218 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6219 target_ulong addr, uint32_t desc) \
6220 { \
6221 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6222 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6223 } \
6224 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6225 target_ulong addr, uint32_t desc) \
6226 { \
6227 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6228 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6229 } \
6230 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6231 target_ulong addr, uint32_t desc) \
6232 { \
6233 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6234 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6235 } \
6236 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6237 target_ulong addr, uint32_t desc) \
6238 { \
6239 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6240 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6241 } \
6242 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6243 target_ulong addr, uint32_t desc) \
6244 { \
6245 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6246 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6247 } \
6248 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6249 target_ulong addr, uint32_t desc) \
6250 { \
6251 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6252 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6253 }
6254
6255 DO_LDFF1_LDNF1_1(bb, MO_8)
6256 DO_LDFF1_LDNF1_1(bhu, MO_16)
6257 DO_LDFF1_LDNF1_1(bhs, MO_16)
6258 DO_LDFF1_LDNF1_1(bsu, MO_32)
6259 DO_LDFF1_LDNF1_1(bss, MO_32)
6260 DO_LDFF1_LDNF1_1(bdu, MO_64)
6261 DO_LDFF1_LDNF1_1(bds, MO_64)
6262
6263 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6264 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6265 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6266 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6267 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6268
6269 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6270 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6271 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6272
6273 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6274
6275 #undef DO_LDFF1_LDNF1_1
6276 #undef DO_LDFF1_LDNF1_2
6277
6278 /*
6279 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6280 */
6281
6282 static inline QEMU_ALWAYS_INLINE
6283 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6284 uint32_t desc, const uintptr_t retaddr,
6285 const int esz, const int msz, const int N, uint32_t mtedesc,
6286 sve_ldst1_host_fn *host_fn,
6287 sve_ldst1_tlb_fn *tlb_fn)
6288 {
6289 const unsigned rd = simd_data(desc);
6290 const intptr_t reg_max = simd_oprsz(desc);
6291 intptr_t reg_off, reg_last, mem_off;
6292 SVEContLdSt info;
6293 void *host;
6294 int i, flags;
6295
6296 /* Find the active elements. */
6297 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6298 /* The entire predicate was false; no store occurs. */
6299 return;
6300 }
6301
6302 /* Probe the page(s). Exit with exception for any invalid page. */
6303 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6304
6305 /* Handle watchpoints for all active elements. */
6306 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6307 BP_MEM_WRITE, retaddr);
6308
6309 /*
6310 * Handle mte checks for all active elements.
6311 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6312 */
6313 if (mtedesc) {
6314 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6315 mtedesc, retaddr);
6316 }
6317
6318 flags = info.page[0].flags | info.page[1].flags;
6319 if (unlikely(flags != 0)) {
6320 /*
6321 * At least one page includes MMIO.
6322 * Any bus operation can fail with cpu_transaction_failed,
6323 * which for ARM will raise SyncExternal. We cannot avoid
6324 * this fault and will leave with the store incomplete.
6325 */
6326 mem_off = info.mem_off_first[0];
6327 reg_off = info.reg_off_first[0];
6328 reg_last = info.reg_off_last[1];
6329 if (reg_last < 0) {
6330 reg_last = info.reg_off_split;
6331 if (reg_last < 0) {
6332 reg_last = info.reg_off_last[0];
6333 }
6334 }
6335
6336 do {
6337 uint64_t pg = vg[reg_off >> 6];
6338 do {
6339 if ((pg >> (reg_off & 63)) & 1) {
6340 for (i = 0; i < N; ++i) {
6341 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6342 addr + mem_off + (i << msz), retaddr);
6343 }
6344 }
6345 reg_off += 1 << esz;
6346 mem_off += N << msz;
6347 } while (reg_off & 63);
6348 } while (reg_off <= reg_last);
6349 return;
6350 }
6351
6352 mem_off = info.mem_off_first[0];
6353 reg_off = info.reg_off_first[0];
6354 reg_last = info.reg_off_last[0];
6355 host = info.page[0].host;
6356
6357 set_helper_retaddr(retaddr);
6358
6359 while (reg_off <= reg_last) {
6360 uint64_t pg = vg[reg_off >> 6];
6361 do {
6362 if ((pg >> (reg_off & 63)) & 1) {
6363 for (i = 0; i < N; ++i) {
6364 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6365 host + mem_off + (i << msz));
6366 }
6367 }
6368 reg_off += 1 << esz;
6369 mem_off += N << msz;
6370 } while (reg_off <= reg_last && (reg_off & 63));
6371 }
6372
6373 clear_helper_retaddr();
6374
6375 /*
6376 * Use the slow path to manage the cross-page misalignment.
6377 * But we know this is RAM and cannot trap.
6378 */
6379 mem_off = info.mem_off_split;
6380 if (unlikely(mem_off >= 0)) {
6381 reg_off = info.reg_off_split;
6382 for (i = 0; i < N; ++i) {
6383 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6384 addr + mem_off + (i << msz), retaddr);
6385 }
6386 }
6387
6388 mem_off = info.mem_off_first[1];
6389 if (unlikely(mem_off >= 0)) {
6390 reg_off = info.reg_off_first[1];
6391 reg_last = info.reg_off_last[1];
6392 host = info.page[1].host;
6393
6394 set_helper_retaddr(retaddr);
6395
6396 do {
6397 uint64_t pg = vg[reg_off >> 6];
6398 do {
6399 if ((pg >> (reg_off & 63)) & 1) {
6400 for (i = 0; i < N; ++i) {
6401 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6402 host + mem_off + (i << msz));
6403 }
6404 }
6405 reg_off += 1 << esz;
6406 mem_off += N << msz;
6407 } while (reg_off & 63);
6408 } while (reg_off <= reg_last);
6409
6410 clear_helper_retaddr();
6411 }
6412 }
6413
6414 static inline QEMU_ALWAYS_INLINE
6415 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6416 uint32_t desc, const uintptr_t ra,
6417 const int esz, const int msz, const int N,
6418 sve_ldst1_host_fn *host_fn,
6419 sve_ldst1_tlb_fn *tlb_fn)
6420 {
6421 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6422 int bit55 = extract64(addr, 55, 1);
6423
6424 /* Remove mtedesc from the normal sve descriptor. */
6425 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6426
6427 /* Perform gross MTE suppression early. */
6428 if (!tbi_check(mtedesc, bit55) ||
6429 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6430 mtedesc = 0;
6431 }
6432
6433 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6434 }
6435
6436 #define DO_STN_1(N, NAME, ESZ) \
6437 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6438 target_ulong addr, uint32_t desc) \
6439 { \
6440 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6441 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6442 } \
6443 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6444 target_ulong addr, uint32_t desc) \
6445 { \
6446 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6447 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6448 }
6449
6450 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6451 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6452 target_ulong addr, uint32_t desc) \
6453 { \
6454 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6455 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6456 } \
6457 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6458 target_ulong addr, uint32_t desc) \
6459 { \
6460 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6461 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6462 } \
6463 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6464 target_ulong addr, uint32_t desc) \
6465 { \
6466 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6467 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6468 } \
6469 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6470 target_ulong addr, uint32_t desc) \
6471 { \
6472 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6473 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6474 }
6475
6476 DO_STN_1(1, bb, MO_8)
6477 DO_STN_1(1, bh, MO_16)
6478 DO_STN_1(1, bs, MO_32)
6479 DO_STN_1(1, bd, MO_64)
6480 DO_STN_1(2, bb, MO_8)
6481 DO_STN_1(3, bb, MO_8)
6482 DO_STN_1(4, bb, MO_8)
6483
6484 DO_STN_2(1, hh, MO_16, MO_16)
6485 DO_STN_2(1, hs, MO_32, MO_16)
6486 DO_STN_2(1, hd, MO_64, MO_16)
6487 DO_STN_2(2, hh, MO_16, MO_16)
6488 DO_STN_2(3, hh, MO_16, MO_16)
6489 DO_STN_2(4, hh, MO_16, MO_16)
6490
6491 DO_STN_2(1, ss, MO_32, MO_32)
6492 DO_STN_2(1, sd, MO_64, MO_32)
6493 DO_STN_2(2, ss, MO_32, MO_32)
6494 DO_STN_2(3, ss, MO_32, MO_32)
6495 DO_STN_2(4, ss, MO_32, MO_32)
6496
6497 DO_STN_2(1, dd, MO_64, MO_64)
6498 DO_STN_2(2, dd, MO_64, MO_64)
6499 DO_STN_2(3, dd, MO_64, MO_64)
6500 DO_STN_2(4, dd, MO_64, MO_64)
6501
6502 #undef DO_STN_1
6503 #undef DO_STN_2
6504
6505 /*
6506 * Loads with a vector index.
6507 */
6508
6509 /*
6510 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6511 */
6512 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6513
6514 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6515 {
6516 return *(uint32_t *)(reg + H1_4(reg_ofs));
6517 }
6518
6519 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6520 {
6521 return *(int32_t *)(reg + H1_4(reg_ofs));
6522 }
6523
6524 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6525 {
6526 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6527 }
6528
6529 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6530 {
6531 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6532 }
6533
6534 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6535 {
6536 return *(uint64_t *)(reg + reg_ofs);
6537 }
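
/*
 * These cover the three SVE gather offset forms: off_zsu_s and off_zss_s
 * extract 32-bit unsigned/signed offsets from a .S index vector,
 * off_zsu_d and off_zss_d take the low 32 bits of a .D index vector with
 * zero or sign extension, and off_zd_d uses the full 64-bit offset.
 */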
6538
6539 static inline QEMU_ALWAYS_INLINE
6540 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6541 target_ulong base, uint32_t desc, uintptr_t retaddr,
6542 uint32_t mtedesc, int esize, int msize,
6543 zreg_off_fn *off_fn,
6544 sve_ldst1_host_fn *host_fn,
6545 sve_ldst1_tlb_fn *tlb_fn)
6546 {
6547 const int mmu_idx = arm_env_mmu_index(env);
6548 const intptr_t reg_max = simd_oprsz(desc);
6549 const int scale = simd_data(desc);
6550 ARMVectorReg scratch;
6551 intptr_t reg_off;
6552 SVEHostPage info, info2;
6553
6554 memset(&scratch, 0, reg_max);
6555 reg_off = 0;
6556 do {
6557 uint64_t pg = vg[reg_off >> 6];
6558 do {
6559 if (likely(pg & 1)) {
6560 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6561 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6562
6563 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6564 mmu_idx, retaddr);
6565
6566 if (likely(in_page >= msize)) {
6567 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6568 cpu_check_watchpoint(env_cpu(env), addr, msize,
6569 info.attrs, BP_MEM_READ, retaddr);
6570 }
6571 if (mtedesc && info.tagged) {
6572 mte_check(env, mtedesc, addr, retaddr);
6573 }
6574 if (unlikely(info.flags & TLB_MMIO)) {
6575 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6576 } else {
6577 set_helper_retaddr(retaddr);
6578 host_fn(&scratch, reg_off, info.host);
6579 clear_helper_retaddr();
6580 }
6581 } else {
6582 /* Element crosses the page boundary. */
6583 sve_probe_page(&info2, false, env, addr + in_page, 0,
6584 MMU_DATA_LOAD, mmu_idx, retaddr);
6585 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6586 cpu_check_watchpoint(env_cpu(env), addr,
6587 msize, info.attrs,
6588 BP_MEM_READ, retaddr);
6589 }
6590 if (mtedesc && info.tagged) {
6591 mte_check(env, mtedesc, addr, retaddr);
6592 }
6593 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6594 }
6595 }
6596 reg_off += esize;
6597 pg >>= esize;
6598 } while (reg_off & 63);
6599 } while (reg_off < reg_max);
6600
6601 /* Wait until all exceptions have been raised to write back. */
6602 memcpy(vd, &scratch, reg_max);
6603 }
6604
6605 static inline QEMU_ALWAYS_INLINE
6606 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6607 target_ulong base, uint32_t desc, uintptr_t retaddr,
6608 int esize, int msize, zreg_off_fn *off_fn,
6609 sve_ldst1_host_fn *host_fn,
6610 sve_ldst1_tlb_fn *tlb_fn)
6611 {
6612 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6613 /* Remove mtedesc from the normal sve descriptor. */
6614 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6615
6616 /*
6617 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6618 * offset base entirely over the address space hole to change the
6619 * pointer tag, or change the bit55 selector. So we could here
6620 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6621 */
6622 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6623 esize, msize, off_fn, host_fn, tlb_fn);
6624 }
6625
6626 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6627 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6628 void *vm, target_ulong base, uint32_t desc) \
6629 { \
6630 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6631 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6632 } \
6633 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6634 void *vm, target_ulong base, uint32_t desc) \
6635 { \
6636 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6637 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6638 }
6639
6640 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6641 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6642 void *vm, target_ulong base, uint32_t desc) \
6643 { \
6644 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6645 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6646 } \
6647 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6648 void *vm, target_ulong base, uint32_t desc) \
6649 { \
6650 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6651 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6652 }
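/*
 * For reference, each DO_LD1_ZPZ_S/D instantiation below defines a pair
 * of helpers.  DO_LD1_ZPZ_S(bsu, zsu, MO_8), for example, expands to
 * HELPER(sve_ldbsu_zsu) and HELPER(sve_ldbsu_zsu_mte), which forward to
 * sve_ld1_z()/sve_ld1_z_mte() with 4-byte vector elements, 1-byte memory
 * accesses, the unsigned 32-bit offset extractor off_zsu_s, and the
 * sve_ld1bsu_host/sve_ld1bsu_tlb access functions.
 */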
6653
6654 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6655 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6656 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6657 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6658 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6659
6660 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6661 DO_LD1_ZPZ_S(bss, zss, MO_8)
6662 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6663 DO_LD1_ZPZ_D(bds, zss, MO_8)
6664 DO_LD1_ZPZ_D(bds, zd, MO_8)
6665
6666 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6667 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6668 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6669 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6670 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6671
6672 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6673 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6674 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6675 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6676 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6677
6678 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6679 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6680 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6681 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6682 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6683
6684 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6685 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6686 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6687 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6688 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6689
6690 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6691 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6692 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6693 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6694 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6695
6696 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6697 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6698 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6699 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6700 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6701
6702 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6703 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6704 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6705
6706 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6707 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6708 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6709
6710 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6711 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6712 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6713
6714 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6715 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6716 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6717
6718 #undef DO_LD1_ZPZ_S
6719 #undef DO_LD1_ZPZ_D
6720
6721 /* First fault loads with a vector index. */
6722
6723 /*
6724 * Common helpers for all gather first-faulting loads.
6725 */
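/*
 * In outline: only the first active element is accessed with a normal,
 * faulting load.  Every later active element is probed without faulting;
 * an element that cannot be accessed cleanly (page not present, MMIO,
 * watchpoint hit, or a failing MTE probe) makes the helper bail out via
 * record_fault(), which updates the first-fault register instead of
 * raising an exception.
 */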
6726
6727 static inline QEMU_ALWAYS_INLINE
6728 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6729 target_ulong base, uint32_t desc, uintptr_t retaddr,
6730 uint32_t mtedesc, const int esz, const int msz,
6731 zreg_off_fn *off_fn,
6732 sve_ldst1_host_fn *host_fn,
6733 sve_ldst1_tlb_fn *tlb_fn)
6734 {
6735 const int mmu_idx = arm_env_mmu_index(env);
6736 const intptr_t reg_max = simd_oprsz(desc);
6737 const int scale = simd_data(desc);
6738 const int esize = 1 << esz;
6739 const int msize = 1 << msz;
6740 intptr_t reg_off;
6741 SVEHostPage info;
6742 target_ulong addr, in_page;
6743 ARMVectorReg scratch;
6744
6745 /* Skip to the first true predicate. */
6746 reg_off = find_next_active(vg, 0, reg_max, esz);
6747 if (unlikely(reg_off >= reg_max)) {
6748 /* The entire predicate was false; no load occurs. */
6749 memset(vd, 0, reg_max);
6750 return;
6751 }
6752
6753 /* Protect against overlap between vd and vm. */
6754 if (unlikely(vd == vm)) {
6755 vm = memcpy(&scratch, vm, reg_max);
6756 }
6757
6758 /*
6759 * Probe the first element, allowing faults.
6760 */
6761 addr = base + (off_fn(vm, reg_off) << scale);
6762 if (mtedesc) {
6763 mte_check(env, mtedesc, addr, retaddr);
6764 }
6765 tlb_fn(env, vd, reg_off, addr, retaddr);
6766
6767 /* After any fault, zero the other elements. */
6768 swap_memzero(vd, reg_off);
6769 reg_off += esize;
6770 swap_memzero(vd + reg_off, reg_max - reg_off);
6771
6772 /*
6773 * Probe the remaining elements, not allowing faults.
6774 */
6775 while (reg_off < reg_max) {
6776 uint64_t pg = vg[reg_off >> 6];
6777 do {
6778 if (likely((pg >> (reg_off & 63)) & 1)) {
6779 addr = base + (off_fn(vm, reg_off) << scale);
6780 in_page = -(addr | TARGET_PAGE_MASK);
6781
6782 if (unlikely(in_page < msize)) {
6783 /* Stop if the element crosses a page boundary. */
6784 goto fault;
6785 }
6786
6787 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6788 mmu_idx, retaddr);
6789 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6790 goto fault;
6791 }
6792 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6793 (cpu_watchpoint_address_matches
6794 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6795 goto fault;
6796 }
6797 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
6798 goto fault;
6799 }
6800
6801 set_helper_retaddr(retaddr);
6802 host_fn(vd, reg_off, info.host);
6803 clear_helper_retaddr();
6804 }
6805 reg_off += esize;
6806 } while (reg_off & 63);
6807 }
6808 return;
6809
6810 fault:
6811 record_fault(env, reg_off, reg_max);
6812 }
6813
6814 static inline QEMU_ALWAYS_INLINE
6815 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6816 target_ulong base, uint32_t desc, uintptr_t retaddr,
6817 const int esz, const int msz,
6818 zreg_off_fn *off_fn,
6819 sve_ldst1_host_fn *host_fn,
6820 sve_ldst1_tlb_fn *tlb_fn)
6821 {
6822 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6823 /* Remove mtedesc from the normal sve descriptor. */
6824 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6825
6826 /*
6827 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6828 * offset base entirely over the address space hole to change the
6829 * pointer tag, or change the bit55 selector. So we could here
6830 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6831 */
6832 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6833 esz, msz, off_fn, host_fn, tlb_fn);
6834 }
6835
6836 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6837 void HELPER(sve_ldff##MEM##_##OFS) \
6838 (CPUARMState *env, void *vd, void *vg, \
6839 void *vm, target_ulong base, uint32_t desc) \
6840 { \
6841 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6842 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6843 } \
6844 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6845 (CPUARMState *env, void *vd, void *vg, \
6846 void *vm, target_ulong base, uint32_t desc) \
6847 { \
6848 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
6849 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6850 }
6851
6852 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
6853 void HELPER(sve_ldff##MEM##_##OFS) \
6854 (CPUARMState *env, void *vd, void *vg, \
6855 void *vm, target_ulong base, uint32_t desc) \
6856 { \
6857 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
6858 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6859 } \
6860 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6861 (CPUARMState *env, void *vd, void *vg, \
6862 void *vm, target_ulong base, uint32_t desc) \
6863 { \
6864 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
6865 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6866 }
6867
6868 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6869 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6870 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6871 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6872 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6873
6874 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6875 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6876 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6877 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6878 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6879
6880 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6881 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6882 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6883 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6884 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6885
6886 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6887 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6888 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6889 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6890 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6891
6892 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6893 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6894 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6895 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6896 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6897
6898 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6899 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6900 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6901 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6902 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6903
6904 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
6905 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
6906 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6907 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6908 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6909
6910 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
6911 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
6912 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6913 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6914 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6915
6916 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6917 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6918 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6919
6920 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6921 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6922 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6923
6924 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6925 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6926 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6927
6928 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6929 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6930 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6931
6932 /* Stores with a vector index. */
6933
6934 static inline QEMU_ALWAYS_INLINE
6935 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6936 target_ulong base, uint32_t desc, uintptr_t retaddr,
6937 uint32_t mtedesc, int esize, int msize,
6938 zreg_off_fn *off_fn,
6939 sve_ldst1_host_fn *host_fn,
6940 sve_ldst1_tlb_fn *tlb_fn)
6941 {
6942 const int mmu_idx = arm_env_mmu_index(env);
6943 const intptr_t reg_max = simd_oprsz(desc);
6944 const int scale = simd_data(desc);
6945 void *host[ARM_MAX_VQ * 4];
6946 intptr_t reg_off, i;
6947 SVEHostPage info, info2;
6948
6949 /*
6950 * Probe all of the elements for host addresses and flags.
6951 */
6952 i = reg_off = 0;
6953 do {
6954 uint64_t pg = vg[reg_off >> 6];
6955 do {
6956 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6957 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6958
6959 host[i] = NULL;
6960 if (likely((pg >> (reg_off & 63)) & 1)) {
6961 if (likely(in_page >= msize)) {
6962 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6963 mmu_idx, retaddr);
6964 if (!(info.flags & TLB_MMIO)) {
6965 host[i] = info.host;
6966 }
6967 } else {
6968 /*
6969 * Element crosses the page boundary.
6970 * Probe both pages, but do not record the host address,
6971 * so that we use the slow path.
6972 */
6973 sve_probe_page(&info, false, env, addr, 0,
6974 MMU_DATA_STORE, mmu_idx, retaddr);
6975 sve_probe_page(&info2, false, env, addr + in_page, 0,
6976 MMU_DATA_STORE, mmu_idx, retaddr);
6977 info.flags |= info2.flags;
6978 }
6979
6980 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6981 cpu_check_watchpoint(env_cpu(env), addr, msize,
6982 info.attrs, BP_MEM_WRITE, retaddr);
6983 }
6984
6985 if (mtedesc && info.tagged) {
6986 mte_check(env, mtedesc, addr, retaddr);
6987 }
6988 }
6989 i += 1;
6990 reg_off += esize;
6991 } while (reg_off & 63);
6992 } while (reg_off < reg_max);
6993
6994 /*
6995 * Now that we have recognized all exceptions except SyncExternal
6996 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6997 *
6998 * Note for the common case of an element in RAM, not crossing a page
6999 * boundary, we have stored the host address in host[]. This doubles
7000 * as a first-level check against the predicate, since only enabled
7001 * elements have non-null host addresses.
7002 */
7003 i = reg_off = 0;
7004 do {
7005 void *h = host[i];
7006 if (likely(h != NULL)) {
7007 set_helper_retaddr(retaddr);
7008 host_fn(vd, reg_off, h);
7009 clear_helper_retaddr();
7010 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7011 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7012 tlb_fn(env, vd, reg_off, addr, retaddr);
7013 }
7014 i += 1;
7015 reg_off += esize;
7016 } while (reg_off < reg_max);
7017 }
7018
7019 static inline QEMU_ALWAYS_INLINE
7020 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7021 target_ulong base, uint32_t desc, uintptr_t retaddr,
7022 int esize, int msize, zreg_off_fn *off_fn,
7023 sve_ldst1_host_fn *host_fn,
7024 sve_ldst1_tlb_fn *tlb_fn)
7025 {
7026 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7027 /* Remove mtedesc from the normal sve descriptor. */
7028 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7029
7030 /*
7031 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7032 * offset base entirely over the address space hole to change the
7033 * pointer tag, or change the bit55 selector. So we could here
7034 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7035 */
7036 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7037 esize, msize, off_fn, host_fn, tlb_fn);
7038 }
7039
7040 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
7041 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7042 void *vm, target_ulong base, uint32_t desc) \
7043 { \
7044 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7045 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7046 } \
7047 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7048 void *vm, target_ulong base, uint32_t desc) \
7049 { \
7050 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7051 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7052 }
7053
7054 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
7055 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7056 void *vm, target_ulong base, uint32_t desc) \
7057 { \
7058 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7059 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7060 } \
7061 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7062 void *vm, target_ulong base, uint32_t desc) \
7063 { \
7064 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7065 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7066 }
7067
7068 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7069 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7070 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7071 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7072 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7073
7074 DO_ST1_ZPZ_S(bs, zss, MO_8)
7075 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7076 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7077 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7078 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7079
7080 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7081 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7082 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7083 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7084 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7085 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7086 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7087
7088 DO_ST1_ZPZ_D(bd, zss, MO_8)
7089 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7090 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7091 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7092 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7093 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7094 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7095
7096 DO_ST1_ZPZ_D(bd, zd, MO_8)
7097 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7098 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7099 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7100 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7101 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7102 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7103
7104 #undef DO_ST1_ZPZ_S
7105 #undef DO_ST1_ZPZ_D
7106
7107 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7108 {
7109 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7110 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7111
7112 for (i = 0; i < opr_sz; ++i) {
7113 d[i] = n[i] ^ m[i] ^ k[i];
7114 }
7115 }
7116
7117 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7118 {
7119 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7120 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7121
7122 for (i = 0; i < opr_sz; ++i) {
7123 d[i] = n[i] ^ (m[i] & ~k[i]);
7124 }
7125 }
7126
7127 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7128 {
7129 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7130 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7131
7132 for (i = 0; i < opr_sz; ++i) {
7133 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7134 }
7135 }
7136
7137 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7138 {
7139 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7140 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7141
7142 for (i = 0; i < opr_sz; ++i) {
7143 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7144 }
7145 }
7146
7147 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7148 {
7149 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7150 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7151
7152 for (i = 0; i < opr_sz; ++i) {
7153 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7154 }
7155 }
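/*
 * In the helpers above, k acts as a per-bit selector: where a k bit is
 * set the result bit comes from n (inverted for BSL1N), and where it is
 * clear the result bit comes from m (inverted for BSL2N).  NBSL is the
 * complement of selecting n where k is set and m elsewhere.
 */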
7156
7157 /*
7158 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7159 * See hasless(v,1) from
7160 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7161 */
7162 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7163 {
7164 int bits = 8 << esz;
7165 uint64_t ones = dup_const(esz, 1);
7166 uint64_t signs = ones << (bits - 1);
7167 uint64_t cmp0, cmp1;
7168
7169 cmp1 = dup_const(esz, n);
7170 cmp0 = cmp1 ^ m0;
7171 cmp1 = cmp1 ^ m1;
7172 cmp0 = (cmp0 - ones) & ~cmp0;
7173 cmp1 = (cmp1 - ones) & ~cmp1;
7174 return (cmp0 | cmp1) & signs;
7175 }
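/*
 * Concrete example of the trick above (for esz == MO_8): with n == 0x42
 * and a copy of 0x42 somewhere in m0, the corresponding byte of cmp0
 * becomes zero; (cmp0 - ones) then borrows into that byte's sign bit
 * while ~cmp0 keeps it set, so the byte contributes 0x80 to the result
 * and the function returns true.  A byte that differs from n never has
 * the sign bit set in both terms, so with no match anywhere in m0 or m1
 * the final (cmp0 | cmp1) & signs is zero.
 */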
7176
7177 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7178 uint32_t desc, int esz, bool nmatch)
7179 {
7180 uint16_t esz_mask = pred_esz_masks[esz];
7181 intptr_t opr_sz = simd_oprsz(desc);
7182 uint32_t flags = PREDTEST_INIT;
7183 intptr_t i, j, k;
7184
7185 for (i = 0; i < opr_sz; i += 16) {
7186 uint64_t m0 = *(uint64_t *)(vm + i);
7187 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7188 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7189 uint16_t out = 0;
7190
7191 for (j = 0; j < 16; j += 8) {
7192 uint64_t n = *(uint64_t *)(vn + i + j);
7193
7194 for (k = 0; k < 8; k += 1 << esz) {
7195 if (pg & (1 << (j + k))) {
7196 bool o = do_match2(n >> (k * 8), m0, m1, esz);
7197 out |= (o ^ nmatch) << (j + k);
7198 }
7199 }
7200 }
7201 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7202 flags = iter_predtest_fwd(out, pg, flags);
7203 }
7204 return flags;
7205 }
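/*
 * Put differently: do_match() sets a destination predicate bit for each
 * active element of vn that occurs (nmatch == false) or does not occur
 * (nmatch == true) among the byte/halfword elements of the corresponding
 * 16-byte segment of vm, and returns NZCV flags from the usual PREDTEST
 * iteration over the result.
 */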
7206
7207 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \
7208 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
7209 { \
7210 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
7211 }
7212
7213 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7214 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7215
7216 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7217 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7218
7219 #undef DO_PPZZ_MATCH
7220
7221 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7222 uint32_t desc)
7223 {
7224 ARMVectorReg scratch;
7225 intptr_t i, j;
7226 intptr_t opr_sz = simd_oprsz(desc);
7227 uint32_t *d = vd, *n = vn, *m = vm;
7228 uint8_t *pg = vg;
7229
7230 if (d == n) {
7231 n = memcpy(&scratch, n, opr_sz);
7232 if (d == m) {
7233 m = n;
7234 }
7235 } else if (d == m) {
7236 m = memcpy(&scratch, m, opr_sz);
7237 }
7238
7239 for (i = 0; i < opr_sz; i += 4) {
7240 uint64_t count = 0;
7241 uint8_t pred;
7242
7243 pred = pg[H1(i >> 3)] >> (i & 7);
7244 if (pred & 1) {
7245 uint32_t nn = n[H4(i >> 2)];
7246
7247 for (j = 0; j <= i; j += 4) {
7248 pred = pg[H1(j >> 3)] >> (j & 7);
7249 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7250 ++count;
7251 }
7252 }
7253 }
7254 d[H4(i >> 2)] = count;
7255 }
7256 }
7257
7258 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7259 uint32_t desc)
7260 {
7261 ARMVectorReg scratch;
7262 intptr_t i, j;
7263 intptr_t opr_sz = simd_oprsz(desc);
7264 uint64_t *d = vd, *n = vn, *m = vm;
7265 uint8_t *pg = vg;
7266
7267 if (d == n) {
7268 n = memcpy(&scratch, n, opr_sz);
7269 if (d == m) {
7270 m = n;
7271 }
7272 } else if (d == m) {
7273 m = memcpy(&scratch, m, opr_sz);
7274 }
7275
7276 for (i = 0; i < opr_sz / 8; ++i) {
7277 uint64_t count = 0;
7278 if (pg[H1(i)] & 1) {
7279 uint64_t nn = n[i];
7280 for (j = 0; j <= i; ++j) {
7281 if ((pg[H1(j)] & 1) && nn == m[j]) {
7282 ++count;
7283 }
7284 }
7285 }
7286 d[i] = count;
7287 }
7288 }
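/*
 * In both HISTCNT helpers, an active element d[i] receives the number of
 * active elements m[j] with j <= i (position i included) that compare
 * equal to n[i]; inactive destination elements are written as zero.
 */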
7289
7290 /*
7291 * Returns the number of bytes in m0 and m1 that match n.
7292 * Unlike do_match2 we don't just need true/false, we need an exact count.
7293 * This requires two extra logical operations.
7294 */
7295 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7296 {
7297 const uint64_t mask = dup_const(MO_8, 0x7f);
7298 uint64_t cmp0, cmp1;
7299
7300 cmp1 = dup_const(MO_8, n);
7301 cmp0 = cmp1 ^ m0;
7302 cmp1 = cmp1 ^ m1;
7303
7304 /*
7305 * 1: clear msb of each byte to avoid carry to next byte (& mask)
7306 * 2: carry in to msb if byte != 0 (+ mask)
7307 * 3: set msb if cmp has msb set (| cmp)
7308 * 4: set ~msb to ignore them (| mask)
7309 * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7310 * 5: invert, resulting in 0x80 if and only if byte == 0.
7311 */
7312 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7313 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7314
7315 /*
7316 * Combine the two compares in a way that the bits do
7317 * not overlap, and so preserves the count of set bits.
7318 * If the host has an efficient instruction for ctpop,
7319 * then ctpop(x) + ctpop(y) has the same number of
7320 * operations as ctpop(x | (y >> 1)). If the host does
7321 * not have an efficient ctpop, then we only want to
7322 * use it once.
7323 */
7324 return ctpop64(cmp0 | (cmp1 >> 1));
7325 }
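/*
 * For instance, with n == 0 and m0 == m1 == 0 every byte matches, both
 * cmp words collapse to 0x8080808080808080, and the popcount of
 * cmp0 | (cmp1 >> 1) is the expected 16.
 */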
7326
7327 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7328 {
7329 intptr_t i, j;
7330 intptr_t opr_sz = simd_oprsz(desc);
7331
7332 for (i = 0; i < opr_sz; i += 16) {
7333 uint64_t n0 = *(uint64_t *)(vn + i);
7334 uint64_t m0 = *(uint64_t *)(vm + i);
7335 uint64_t n1 = *(uint64_t *)(vn + i + 8);
7336 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7337 uint64_t out0 = 0;
7338 uint64_t out1 = 0;
7339
7340 for (j = 0; j < 64; j += 8) {
7341 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7342 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7343 out0 |= cnt0 << j;
7344 out1 |= cnt1 << j;
7345 }
7346
7347 *(uint64_t *)(vd + i) = out0;
7348 *(uint64_t *)(vd + i + 8) = out1;
7349 }
7350 }
7351
7352 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7353 {
7354 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7355 int shr = simd_data(desc);
7356 int shl = 8 - shr;
7357 uint64_t mask = dup_const(MO_8, 0xff >> shr);
7358 uint64_t *d = vd, *n = vn, *m = vm;
7359
7360 for (i = 0; i < opr_sz; ++i) {
7361 uint64_t t = n[i] ^ m[i];
7362 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7363 }
7364 }
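/*
 * For the byte and halfword XAR forms there is no per-lane rotate on the
 * packed 64-bit value, so the rotate is built from a shift pair; the
 * mask strips the bits that crossed a lane boundary during the shifts
 * (e.g. for shr == 3 on bytes, mask is 0x1f repeated), leaving a plain
 * rotate-right of each lane of n ^ m by shr.
 */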
7365
7366 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7367 {
7368 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7369 int shr = simd_data(desc);
7370 int shl = 16 - shr;
7371 uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7372 uint64_t *d = vd, *n = vn, *m = vm;
7373
7374 for (i = 0; i < opr_sz; ++i) {
7375 uint64_t t = n[i] ^ m[i];
7376 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7377 }
7378 }
7379
7380 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7381 {
7382 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7383 int shr = simd_data(desc);
7384 uint32_t *d = vd, *n = vn, *m = vm;
7385
7386 for (i = 0; i < opr_sz; ++i) {
7387 d[i] = ror32(n[i] ^ m[i], shr);
7388 }
7389 }
7390
7391 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7392 void *status, uint32_t desc)
7393 {
7394 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7395
7396 for (s = 0; s < opr_sz; ++s) {
7397 float32 *n = vn + s * sizeof(float32) * 4;
7398 float32 *m = vm + s * sizeof(float32) * 4;
7399 float32 *a = va + s * sizeof(float32) * 4;
7400 float32 *d = vd + s * sizeof(float32) * 4;
7401 float32 n00 = n[H4(0)], n01 = n[H4(1)];
7402 float32 n10 = n[H4(2)], n11 = n[H4(3)];
7403 float32 m00 = m[H4(0)], m01 = m[H4(1)];
7404 float32 m10 = m[H4(2)], m11 = m[H4(3)];
7405 float32 p0, p1;
7406
7407 /* i = 0, j = 0 */
7408 p0 = float32_mul(n00, m00, status);
7409 p1 = float32_mul(n01, m01, status);
7410 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7411
7412 /* i = 0, j = 1 */
7413 p0 = float32_mul(n00, m10, status);
7414 p1 = float32_mul(n01, m11, status);
7415 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7416
7417 /* i = 1, j = 0 */
7418 p0 = float32_mul(n10, m00, status);
7419 p1 = float32_mul(n11, m01, status);
7420 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7421
7422 /* i = 1, j = 1 */
7423 p0 = float32_mul(n10, m10, status);
7424 p1 = float32_mul(n11, m11, status);
7425 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7426 }
7427 }
7428
7429 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7430 void *status, uint32_t desc)
7431 {
7432 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7433
7434 for (s = 0; s < opr_sz; ++s) {
7435 float64 *n = vn + s * sizeof(float64) * 4;
7436 float64 *m = vm + s * sizeof(float64) * 4;
7437 float64 *a = va + s * sizeof(float64) * 4;
7438 float64 *d = vd + s * sizeof(float64) * 4;
7439 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7440 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7441 float64 p0, p1;
7442
7443 /* i = 0, j = 0 */
7444 p0 = float64_mul(n00, m00, status);
7445 p1 = float64_mul(n01, m01, status);
7446 d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7447
7448 /* i = 0, j = 1 */
7449 p0 = float64_mul(n00, m10, status);
7450 p1 = float64_mul(n01, m11, status);
7451 d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7452
7453 /* i = 1, j = 0 */
7454 p0 = float64_mul(n10, m00, status);
7455 p1 = float64_mul(n11, m01, status);
7456 d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7457
7458 /* i = 1, j = 1 */
7459 p0 = float64_mul(n10, m10, status);
7460 p1 = float64_mul(n11, m11, status);
7461 d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7462 }
7463 }
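/*
 * Viewed per 128-bit (float32) or 256-bit (float64) segment, the two
 * FMMLA helpers above treat n, m and a as 2x2 row-major matrices and
 * compute d[i][j] = a[i][j] + n[i][0] * m[j][0] + n[i][1] * m[j][1],
 * i.e. a matrix multiply-accumulate with the second operand transposed.
 */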
7464
7465 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7466 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
7467 { \
7468 intptr_t i = simd_oprsz(desc); \
7469 uint64_t *g = vg; \
7470 do { \
7471 uint64_t pg = g[(i - 1) >> 6]; \
7472 do { \
7473 i -= sizeof(TYPEW); \
7474 if (likely((pg >> (i & 63)) & 1)) { \
7475 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
7476 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
7477 } \
7478 } while (i & 63); \
7479 } while (i != 0); \
7480 }
7481
7482 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7483 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7484 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7485
7486 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7487 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
7488 { \
7489 intptr_t i = simd_oprsz(desc); \
7490 uint64_t *g = vg; \
7491 do { \
7492 uint64_t pg = g[(i - 1) >> 6]; \
7493 do { \
7494 i -= sizeof(TYPEW); \
7495 if (likely((pg >> (i & 63)) & 1)) { \
7496 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
7497 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
7498 } \
7499 } while (i & 63); \
7500 } while (i != 0); \
7501 }
7502
7503 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7504 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7505
7506 #undef DO_FCVTLT
7507 #undef DO_FCVTNT
7508