1 /*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/page-protection.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
29 #include "vec_internal.h"
30 #include "sve_ldst_internal.h"
31 #include "accel/tcg/cpu-ops.h"
32 #ifdef CONFIG_USER_ONLY
33 #include "user/page-protection.h"
34 #endif
35
36
37 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
38 *
39 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
40 * and bit 0 set if C is set. Compare the definitions of these variables
41 * within CPUARMState.
42 */
43
44 /* For no G bits set, NZCV = C. */
45 #define PREDTEST_INIT 1
46
47 /* This is an iterative function, called for each Pd and Pg word
48 * moving forward.
49 */
50 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
51 {
52 if (likely(g)) {
53 /* Compute N from first D & G.
54 Use bit 2 to signal first G bit seen. */
55 if (!(flags & 4)) {
56 flags |= ((d & (g & -g)) != 0) << 31;
57 flags |= 4;
58 }
59
60 /* Accumulate Z from each D & G. */
61 flags |= ((d & g) != 0) << 1;
62
63 /* Compute C from last !(D & G). Replace previous. */
64 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
65 }
66 return flags;
67 }
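
/*
 * Worked example (illustrative only): for a single predicate word with
 * d = 0x0001 and g = 0x0101, iter_predtest_fwd(d, g, PREDTEST_INIT)
 * finds the first active element set (d & (g & -g) != 0), so N = 1;
 * some active element is set (d & g != 0), so Z = 0; and the last
 * active element is clear (d & pow2floor(g) == 0), so C = 1.  The
 * returned value is 0x80000007: bit 31 for N, bit 1 for Z clear,
 * bit 0 for C, plus bit 2, the internal "first G bit seen" marker.
 */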
68
69 /* This is an iterative function, called for each Pd and Pg word
70 * moving backward.
71 */
72 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
73 {
74 if (likely(g)) {
75 /* Compute C from first (i.e. last) !(D & G).
76 Use bit 2 to signal first G bit seen. */
77 if (!(flags & 4)) {
78 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
79 flags |= (d & pow2floor(g)) == 0;
80 }
81
82 /* Accumulate Z from each D & G. */
83 flags |= ((d & g) != 0) << 1;
84
85 /* Compute N from last (i.e. first) D & G. Replace previous. */
86 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
87 }
88 return flags;
89 }
90
91 /* The same for a single word predicate. */
92 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
93 {
94 return iter_predtest_fwd(d, g, PREDTEST_INIT);
95 }
96
97 /* The same for a multi-word predicate. */
98 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
99 {
100 uint32_t flags = PREDTEST_INIT;
101 uint64_t *d = vd, *g = vg;
102 uintptr_t i = 0;
103
104 do {
105 flags = iter_predtest_fwd(d[i], g[i], flags);
106 } while (++i < words);
107
108 return flags;
109 }
110
111 /* Similarly for single word elements. */
112 static inline uint64_t expand_pred_s(uint8_t byte)
113 {
114 static const uint64_t word[] = {
115 [0x01] = 0x00000000ffffffffull,
116 [0x10] = 0xffffffff00000000ull,
117 [0x11] = 0xffffffffffffffffull,
118 };
119 return word[byte & 0x11];
120 }
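
/*
 * For example (illustrative): expand_pred_s(0x01) == 0x00000000ffffffffull.
 * Only predicate bits 0 and 4 are meaningful for word-sized elements,
 * which is why the index is masked with 0x11.
 */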
121
122 #define LOGICAL_PPPP(NAME, FUNC) \
123 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
124 { \
125 uintptr_t opr_sz = simd_oprsz(desc); \
126 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
127 uintptr_t i; \
128 for (i = 0; i < opr_sz / 8; ++i) { \
129 d[i] = FUNC(n[i], m[i], g[i]); \
130 } \
131 }
132
133 #define DO_AND(N, M, G) (((N) & (M)) & (G))
134 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
135 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
136 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
137 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
138 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
139 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
140 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
141
142 LOGICAL_PPPP(sve_and_pppp, DO_AND)
143 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
144 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
145 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
146 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
147 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
148 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
149 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
150
151 #undef DO_AND
152 #undef DO_BIC
153 #undef DO_EOR
154 #undef DO_ORR
155 #undef DO_ORN
156 #undef DO_NOR
157 #undef DO_NAND
158 #undef DO_SEL
159 #undef LOGICAL_PPPP
160
161 /* Fully general three-operand expander, controlled by a predicate.
162 * This is complicated by the host-endian storage of the register file.
163 */
164 /* ??? I don't expect the compiler could ever vectorize this itself.
165 * With some tables we can convert bit masks to byte masks, and with
166 * extra care wrt byte/word ordering we could use gcc generic vectors
167 * and do 16 bytes at a time.
168 */
169 #define DO_ZPZZ(NAME, TYPE, H, OP) \
170 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
171 { \
172 intptr_t i, opr_sz = simd_oprsz(desc); \
173 for (i = 0; i < opr_sz; ) { \
174 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
175 do { \
176 if (pg & 1) { \
177 TYPE nn = *(TYPE *)(vn + H(i)); \
178 TYPE mm = *(TYPE *)(vm + H(i)); \
179 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
180 } \
181 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
182 } while (i & 15); \
183 } \
184 }
185
186 /* Similarly, specialized for 64-bit operands. */
187 #define DO_ZPZZ_D(NAME, TYPE, OP) \
188 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
189 { \
190 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
191 TYPE *d = vd, *n = vn, *m = vm; \
192 uint8_t *pg = vg; \
193 for (i = 0; i < opr_sz; i += 1) { \
194 if (pg[H1(i)] & 1) { \
195 TYPE nn = n[i], mm = m[i]; \
196 d[i] = OP(nn, mm); \
197 } \
198 } \
199 }
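
/*
 * Illustrative sketch only, kept out of the build: roughly what
 * DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) expands to, with the
 * usual HELPER() name mangling and DO_ADD written out by hand.  Note
 * how one uint16_t of predicate covers 16 bytes of vector, and how pg
 * is shifted by the element size so that only the least significant
 * bit of each predicate element is tested.
 */
#if 0
void helper_sve_add_zpzz_b(void *vd, void *vn, void *vm, void *vg,
                           uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    for (i = 0; i < opr_sz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                uint8_t nn = *(uint8_t *)(vn + H1(i));
                uint8_t mm = *(uint8_t *)(vm + H1(i));
                *(uint8_t *)(vd + H1(i)) = nn + mm;
            }
            i += sizeof(uint8_t), pg >>= sizeof(uint8_t);
        } while (i & 15);
    }
}
#endif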
200
201 #define DO_AND(N, M) (N & M)
202 #define DO_EOR(N, M) (N ^ M)
203 #define DO_ORR(N, M) (N | M)
204 #define DO_BIC(N, M) (N & ~M)
205 #define DO_ADD(N, M) (N + M)
206 #define DO_SUB(N, M) (N - M)
207 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
208 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
209 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
210 #define DO_MUL(N, M) (N * M)
211
212
213 /*
214 * We must avoid the C undefined behaviour cases: division by
215 * zero and signed division of INT_MIN by -1. Both of these
216 * have architecturally defined required results for Arm.
217 * We special case all signed divisions by -1 to avoid having
218 * to deduce the minimum integer for the type involved.
219 */
220 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
221 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
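
/*
 * For example (illustrative), in the signed 32-bit case
 * DO_SDIV(x, 0) == 0 for any x, and DO_SDIV(INT32_MIN, -1) yields
 * INT32_MIN (the negation relies on the wrapping signed arithmetic,
 * -fwrapv, that QEMU is built with); both are the architecturally
 * required results.
 */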
222
223 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
224 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
225 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
226 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
227
228 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
229 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
230 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
231 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
232
233 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
234 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
235 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
236 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
237
238 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
239 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
240 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
241 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
242
243 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
244 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
245 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
246 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
247
248 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
249 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
250 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
251 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
252
253 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
254 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
255 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
256 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
257
258 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
259 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
260 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
261 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
262
263 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
264 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
265 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
266 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
267
268 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
269 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
270 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
271 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
272
273 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
274 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
275 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
276 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
277
278 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
279 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
280 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
281 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
282
283 /* Because the computation type is at least twice as large as required,
284 these work for both signed and unsigned source types. */
285 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
286 {
287 return (n * m) >> 8;
288 }
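
/*
 * Worked example (illustrative): for the signed byte case the inputs
 * arrive sign-extended, so do_mulh_b(-1, 2) computes (-2) >> 8 == -1,
 * i.e. 0xff, the high byte of the 16-bit product 0xfffe.  For the
 * unsigned byte case they arrive zero-extended, so do_mulh_b(0xff, 2)
 * computes 0x1fe >> 8 == 0x01, the high byte of the unsigned product.
 */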
289
290 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
291 {
292 return (n * m) >> 16;
293 }
294
295 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
296 {
297 return (n * m) >> 32;
298 }
299
300 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
301 {
302 uint64_t lo, hi;
303 muls64(&lo, &hi, n, m);
304 return hi;
305 }
306
307 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
308 {
309 uint64_t lo, hi;
310 mulu64(&lo, &hi, n, m);
311 return hi;
312 }
313
314 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
315 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
316 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
317 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
318
319 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
320 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
321 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
322 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
323
324 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
325 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
326 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
327 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
328
329 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
330 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
331
332 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
333 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
334
335 /* Note that all bits of the shift are significant
336 and not modulo the element size. */
337 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
338 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
339 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
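
/*
 * For example, with int8_t elements DO_ASR(-4, 9) clamps the shift
 * count to 7 and yields -1 (all sign bits), while DO_LSR and DO_LSL
 * return 0 as soon as the count reaches the element width, rather
 * than interpreting the count modulo the element size.
 */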
340
341 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
342 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
343 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
344 
345 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
346 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
347 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
348 
349 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
350 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
351 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
352
353 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
354 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
355 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
356
357 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
358 {
359 int8_t n1 = n, n2 = n >> 8;
360 return m + n1 + n2;
361 }
362
363 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
364 {
365 int16_t n1 = n, n2 = n >> 16;
366 return m + n1 + n2;
367 }
368
369 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
370 {
371 int32_t n1 = n, n2 = n >> 32;
372 return m + n1 + n2;
373 }
374
375 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
376 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
377 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
378
379 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
380 {
381 uint8_t n1 = n, n2 = n >> 8;
382 return m + n1 + n2;
383 }
384
385 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
386 {
387 uint16_t n1 = n, n2 = n >> 16;
388 return m + n1 + n2;
389 }
390
391 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
392 {
393 uint32_t n1 = n, n2 = n >> 32;
394 return m + n1 + n2;
395 }
396
397 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
398 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
399 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
400
401 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
402 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
403 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
404 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
405
406 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
407 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
408 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
409 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
410
411 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
412 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
413 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
414 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
415
416 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
417 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
418 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
419 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
420
421 /*
422 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
423 * We pass in a pointer to a dummy saturation field to trigger
424 * the saturating arithmetic but discard the information about
425 * whether it has occurred.
426 */
427 #define do_sqshl_b(n, m) \
428 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
429 #define do_sqshl_h(n, m) \
430 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
431 #define do_sqshl_s(n, m) \
432 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
433 #define do_sqshl_d(n, m) \
434 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
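
/*
 * For instance (illustrative): do_sqshl_b(0x40, 2) would produce
 * 0x100, which does not fit in a signed byte, so the result saturates
 * to INT8_MAX and the saturation indication lands in the discarded
 * local variable.
 */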
435
436 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
437 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
438 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
439 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
440
441 #define do_uqshl_b(n, m) \
442 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
443 #define do_uqshl_h(n, m) \
444 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
445 #define do_uqshl_s(n, m) \
446 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
447 #define do_uqshl_d(n, m) \
448 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
449
450 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
451 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
452 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
453 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
454
455 #define do_sqrshl_b(n, m) \
456 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
457 #define do_sqrshl_h(n, m) \
458 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
459 #define do_sqrshl_s(n, m) \
460 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
461 #define do_sqrshl_d(n, m) \
462 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
463
464 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
465 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
466 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
467 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
468
469 #undef do_sqrshl_d
470
471 #define do_uqrshl_b(n, m) \
472 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
473 #define do_uqrshl_h(n, m) \
474 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
475 #define do_uqrshl_s(n, m) \
476 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
477 #define do_uqrshl_d(n, m) \
478 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
479
480 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
481 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
482 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
483 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
484
485 #undef do_uqrshl_d
486
487 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
488 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
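
/*
 * The 64-bit form avoids needing a wider intermediate: (n >> 1) and
 * (m >> 1) add the halved operands, and (n & m & 1) restores the
 * carry that is lost when both low bits are set.  For example, with
 * n = 5 and m = 7: 2 + 3 + 1 == 6 == (5 + 7) >> 1.
 */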
489
490 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
491 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
492 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
493 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
494
495 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
496 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
497 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
498 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
499
500 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
501 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
502
503 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
504 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
505 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
506 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
507
508 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
509 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
510 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
511 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
512
513 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
514 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
515
516 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
517 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
518 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
519 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
520
521 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
522 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
523 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
524 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
525
526 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
527 {
528 return val >= max ? max : val <= min ? min : val;
529 }
530
531 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
532 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
533 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
534
535 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
536 {
537 int64_t r = n + m;
538 if (((r ^ n) & ~(n ^ m)) < 0) {
539 /* Signed overflow. */
540 return r < 0 ? INT64_MAX : INT64_MIN;
541 }
542 return r;
543 }
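
/*
 * The test ((r ^ n) & ~(n ^ m)) < 0 is the usual two's-complement
 * overflow check: overflow is only possible when both operands have
 * the same sign (so ~(n ^ m) has its sign bit set) and the result's
 * sign differs from theirs (so r ^ n has its sign bit set).  For
 * example, do_sqadd_d(INT64_MAX, 1) wraps to a negative r and is
 * therefore saturated to INT64_MAX.
 */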
544
545 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
546 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
547 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
548 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
549
550 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
551 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
552 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
553
554 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
555 {
556 uint64_t r = n + m;
557 return r < n ? UINT64_MAX : r;
558 }
559
560 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
561 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
562 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
563 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
564
565 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
566 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
567 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
568
569 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
570 {
571 int64_t r = n - m;
572 if (((r ^ n) & (n ^ m)) < 0) {
573 /* Signed overflow. */
574 return r < 0 ? INT64_MAX : INT64_MIN;
575 }
576 return r;
577 }
578
579 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
580 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
581 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
582 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
583
584 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
585 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
586 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
587
588 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
589 {
590 return n > m ? n - m : 0;
591 }
592
593 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
594 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
595 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
596 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
597
598 #define DO_SUQADD_B(n, m) \
599 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
600 #define DO_SUQADD_H(n, m) \
601 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
602 #define DO_SUQADD_S(n, m) \
603 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
604
605 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
606 {
607 uint64_t r = n + m;
608
609 if (n < 0) {
610 /* Note that m - abs(n) cannot underflow. */
611 if (r > INT64_MAX) {
612 /* Result is either very large positive or negative. */
613 if (m > -n) {
614 /* m > abs(n), so r is a very large positive. */
615 return INT64_MAX;
616 }
617 /* Result is negative. */
618 }
619 } else {
620 /* Both inputs are positive: check for overflow. */
621 if (r < m || r > INT64_MAX) {
622 return INT64_MAX;
623 }
624 }
625 return r;
626 }
627
628 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
629 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
630 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
631 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
632
633 #define DO_USQADD_B(n, m) \
634 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
635 #define DO_USQADD_H(n, m) \
636 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
637 #define DO_USQADD_S(n, m) \
638 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
639
640 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
641 {
642 uint64_t r = n + m;
643
644 if (m < 0) {
645 return n < -m ? 0 : r;
646 }
647 return r < n ? UINT64_MAX : r;
648 }
649
650 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
651 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
652 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
653 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
654
655 #undef DO_ZPZZ
656 #undef DO_ZPZZ_D
657
658 /*
659 * Three operand expander, operating on element pairs.
660 * If the slot I is even, the elements come from VN {I, I+1}.
661 * If the slot I is odd, the elements come from VM {I-1, I}.
662 * Load all of the input elements in each pair before overwriting output.
663 */
664 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
665 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
666 { \
667 intptr_t i, opr_sz = simd_oprsz(desc); \
668 for (i = 0; i < opr_sz; ) { \
669 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
670 do { \
671 TYPE n0 = *(TYPE *)(vn + H(i)); \
672 TYPE m0 = *(TYPE *)(vm + H(i)); \
673 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
674 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
675 if (pg & 1) { \
676 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
677 } \
678 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
679 if (pg & 1) { \
680 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
681 } \
682 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
683 } while (i & 15); \
684 } \
685 }
686
687 /* Similarly, specialized for 64-bit operands. */
688 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
689 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
690 { \
691 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
692 TYPE *d = vd, *n = vn, *m = vm; \
693 uint8_t *pg = vg; \
694 for (i = 0; i < opr_sz; i += 2) { \
695 TYPE n0 = n[i], n1 = n[i + 1]; \
696 TYPE m0 = m[i], m1 = m[i + 1]; \
697 if (pg[H1(i)] & 1) { \
698 d[i] = OP(n0, n1); \
699 } \
700 if (pg[H1(i + 1)] & 1) { \
701 d[i + 1] = OP(m0, m1); \
702 } \
703 } \
704 }
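
/*
 * Concretely (illustrative), for 32-bit ADDP with every element
 * active: d[0] = n[0] + n[1], d[1] = m[0] + m[1], d[2] = n[2] + n[3],
 * d[3] = m[2] + m[3], and so on.  Even result slots take a pair from
 * VN, odd result slots take the corresponding pair from VM.
 */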
705
706 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
707 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
708 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
709 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
710
711 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
712 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
713 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
714 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
715
716 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
717 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
718 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
719 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
720
721 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
722 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
723 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
724 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
725
726 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
727 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
728 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
729 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
730
731 #undef DO_ZPZZ_PAIR
732 #undef DO_ZPZZ_PAIR_D
733
734 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
735 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
736 float_status *status, uint32_t desc) \
737 { \
738 intptr_t i, opr_sz = simd_oprsz(desc); \
739 for (i = 0; i < opr_sz; ) { \
740 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
741 do { \
742 TYPE n0 = *(TYPE *)(vn + H(i)); \
743 TYPE m0 = *(TYPE *)(vm + H(i)); \
744 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
745 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
746 if (pg & 1) { \
747 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
748 } \
749 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
750 if (pg & 1) { \
751 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
752 } \
753 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
754 } while (i & 15); \
755 } \
756 }
757
758 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
759 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
760 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
761
762 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
763 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
764 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
765
766 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
767 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
768 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
769
770 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
771 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
772 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
773
774 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
775 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
776 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
777
778 #undef DO_ZPZZ_PAIR_FP
779
780 /* Three-operand expander, controlled by a predicate, in which the
781 * third operand is "wide". That is, for D = N op M, the same 64-bit
782 * value of M is used with all of the narrower values of N.
783 */
784 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
785 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
786 { \
787 intptr_t i, opr_sz = simd_oprsz(desc); \
788 for (i = 0; i < opr_sz; ) { \
789 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
790 TYPEW mm = *(TYPEW *)(vm + i); \
791 do { \
792 if (pg & 1) { \
793 TYPE nn = *(TYPE *)(vn + H(i)); \
794 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
795 } \
796 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
797 } while (i & 7); \
798 } \
799 }
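
/*
 * For example, sve_lsr_zpzw_b shifts each byte element of VN by the
 * one 64-bit element of VM that overlays it: a single shift count is
 * loaded per 8-byte column and applied to all eight bytes, which is
 * why the inner loop runs while (i & 7) is non-zero.
 */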
800
801 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
802 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
803 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
804
805 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
806 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
807 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
808
809 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
810 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
811 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
812
813 #undef DO_ZPZW
814
815 /* Fully general two-operand expander, controlled by a predicate.
816 */
817 #define DO_ZPZ(NAME, TYPE, H, OP) \
818 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
819 { \
820 intptr_t i, opr_sz = simd_oprsz(desc); \
821 for (i = 0; i < opr_sz; ) { \
822 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
823 do { \
824 if (pg & 1) { \
825 TYPE nn = *(TYPE *)(vn + H(i)); \
826 *(TYPE *)(vd + H(i)) = OP(nn); \
827 } \
828 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
829 } while (i & 15); \
830 } \
831 }
832
833 /* Similarly, specialized for 64-bit operands. */
834 #define DO_ZPZ_D(NAME, TYPE, OP) \
835 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
836 { \
837 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
838 TYPE *d = vd, *n = vn; \
839 uint8_t *pg = vg; \
840 for (i = 0; i < opr_sz; i += 1) { \
841 if (pg[H1(i)] & 1) { \
842 TYPE nn = n[i]; \
843 d[i] = OP(nn); \
844 } \
845 } \
846 }
847
848 #define DO_CLS_B(N) (clrsb32(N) - 24)
849 #define DO_CLS_H(N) (clrsb32(N) - 16)
850
851 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
852 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
853 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
854 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
855
856 #define DO_CLZ_B(N) (clz32(N) - 24)
857 #define DO_CLZ_H(N) (clz32(N) - 16)
858
859 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
860 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
861 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
862 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
863
864 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
865 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
866 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
867 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
868
869 #define DO_CNOT(N) (N == 0)
870
871 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
872 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
873 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
874 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
875
876 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
877
878 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
879 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
880 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
881
882 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N))
883 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N))
884 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N))
885
886 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
887 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
888 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)
889
890 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
891
892 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
893 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
894 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
895
896 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
897 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
898 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N))
899
900 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
901 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
902 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)
903
904 #define DO_NOT(N) (~N)
905
906 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
907 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
908 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
909 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
910
911 #define DO_SXTB(N) ((int8_t)N)
912 #define DO_SXTH(N) ((int16_t)N)
913 #define DO_SXTS(N) ((int32_t)N)
914 #define DO_UXTB(N) ((uint8_t)N)
915 #define DO_UXTH(N) ((uint16_t)N)
916 #define DO_UXTS(N) ((uint32_t)N)
917
918 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
919 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
920 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
921 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
922 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
923 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
924
925 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
926 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
927 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
928 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
929 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
930 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
931
932 #define DO_ABS(N) (N < 0 ? -N : N)
933
934 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
935 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
936 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
937 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
938
939 #define DO_NEG(N) (-N)
940
941 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
942 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
943 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
944 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
945
946 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
947 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
948 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
949
950 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
951 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
952
953 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
954
955 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
956 {
957 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
958 uint64_t *d = vd, *n = vn;
959 uint8_t *pg = vg;
960
961 for (i = 0; i < opr_sz; i += 2) {
962 if (pg[H1(i)] & 1) {
963 uint64_t n0 = n[i + 0];
964 uint64_t n1 = n[i + 1];
965 d[i + 0] = n1;
966 d[i + 1] = n0;
967 }
968 }
969 }
970
971 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
972 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
973 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
974 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
975
976 #define DO_SQABS(X) \
977 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
978 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
979
980 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
981 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
982 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
983 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
984
985 #define DO_SQNEG(X) \
986 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
987 x_ == min_ ? -min_ - 1 : -x_; })
988
989 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
990 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
991 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
992 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
993
994 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
995 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
996
997 /* Three-operand expander, unpredicated, in which the third operand is "wide".
998 */
999 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1000 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1001 { \
1002 intptr_t i, opr_sz = simd_oprsz(desc); \
1003 for (i = 0; i < opr_sz; ) { \
1004 TYPEW mm = *(TYPEW *)(vm + i); \
1005 do { \
1006 TYPE nn = *(TYPE *)(vn + H(i)); \
1007 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1008 i += sizeof(TYPE); \
1009 } while (i & 7); \
1010 } \
1011 }
1012
1013 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1014 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1015 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1016
1017 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1018 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1019 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1020
1021 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1022 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1023 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1024
1025 #undef DO_ZZW
1026
1027 #undef DO_CLS_B
1028 #undef DO_CLS_H
1029 #undef DO_CLZ_B
1030 #undef DO_CLZ_H
1031 #undef DO_CNOT
1032 #undef DO_FABS
1033 #undef DO_FNEG
1034 #undef DO_ABS
1035 #undef DO_NEG
1036 #undef DO_ZPZ
1037 #undef DO_ZPZ_D
1038
1039 /*
1040 * Three-operand expander, unpredicated, in which the two inputs are
1041 * selected from the top or bottom half of the wide column.
1042 */
1043 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1044 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1045 { \
1046 intptr_t i, opr_sz = simd_oprsz(desc); \
1047 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1048 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1049 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1050 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1051 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1052 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1053 } \
1054 }
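
/*
 * Example (illustrative): for sve2_saddl_h with both selector bits
 * clear, halfword result I is the sum of the even ("bottom") byte
 * elements at index 2*I of VN and VM; setting a selector bit picks
 * the odd ("top") byte of that column instead.
 */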
1055
1056 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1057 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1058 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1059
1060 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1061 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1062 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1063
1064 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1065 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1066 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1067
1068 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1069 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1070 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1071
1072 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1073 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1074 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1075
1076 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1077 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1078 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1079
1080 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1081 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1082 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1083
1084 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1085 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1086 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1087
1088 /* Note that the multiply cannot overflow, but the doubling can. */
1089 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1090 {
1091 int16_t val = n * m;
1092 return DO_SQADD_H(val, val);
1093 }
1094
1095 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1096 {
1097 int32_t val = n * m;
1098 return DO_SQADD_S(val, val);
1099 }
1100
1101 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1102 {
1103 int64_t val = n * m;
1104 return do_sqadd_d(val, val);
1105 }
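
/*
 * Worked example: do_sqdmull_h(-128, -128) computes the product 16384,
 * which fits in int16_t, but doubling it would give 32768, so
 * DO_SQADD_H saturates the result to INT16_MAX (0x7fff), matching the
 * architectural SQDMULL result for these inputs.
 */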
1106
1107 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1108 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1109 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1110
1111 #undef DO_ZZZ_TB
1112
1113 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1114 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1115 { \
1116 intptr_t i, opr_sz = simd_oprsz(desc); \
1117 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1118 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1119 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1120 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1121 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1122 } \
1123 }
1124
1125 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1126 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1127 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1128
1129 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1130 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1131 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1132
1133 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1134 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1135 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1136
1137 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1138 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1139 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1140
1141 #undef DO_ZZZ_WTB
1142
1143 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1144 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1145 { \
1146 intptr_t i, opr_sz = simd_oprsz(desc); \
1147 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1148 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1149 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1150 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1151 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1152 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1153 } \
1154 }
1155
1156 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1157 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1158 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1159 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1160
1161 #undef DO_ZZZ_NTB
1162
1163 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1164 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1165 { \
1166 intptr_t i, opr_sz = simd_oprsz(desc); \
1167 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1168 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1169 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1170 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1171 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1172 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1173 } \
1174 }
1175
1176 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1177 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1178 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1179
1180 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1181 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1182 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1183
1184 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1185 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1186 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1187
1188 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1189 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1190 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1191
1192 #define DO_NMUL(N, M) -(N * M)
1193
1194 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1195 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1196 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1197
1198 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1199 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1200 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1201
1202 #undef DO_ZZZW_ACC
1203
1204 #define DO_XTNB(NAME, TYPE, OP) \
1205 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1206 { \
1207 intptr_t i, opr_sz = simd_oprsz(desc); \
1208 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1209 TYPE nn = *(TYPE *)(vn + i); \
1210 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1211 *(TYPE *)(vd + i) = nn; \
1212 } \
1213 }
1214
1215 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1216 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1217 { \
1218 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1219 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1220 TYPE nn = *(TYPE *)(vn + i); \
1221 *(TYPEN *)(vd + i + odd) = OP(nn); \
1222 } \
1223 }
1224
1225 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1226 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1227 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1228
1229 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1230 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1231 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1232
1233 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1234 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1235 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1236
1237 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1238 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1239 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1240
1241 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1242 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1243 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1244
1245 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1246 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1247 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1248
1249 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1250 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1251 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1252
1253 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1254 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1255 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1256
1257 #undef DO_XTNB
1258 #undef DO_XTNT
1259
1260 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1261 {
1262 intptr_t i, opr_sz = simd_oprsz(desc);
1263 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1264 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1265 uint32_t *a = va, *n = vn;
1266 uint64_t *d = vd, *m = vm;
1267
1268 for (i = 0; i < opr_sz / 8; ++i) {
1269 uint32_t e1 = a[2 * i + H4(0)];
1270 uint32_t e2 = n[2 * i + sel] ^ inv;
1271 uint64_t c = extract64(m[i], 32, 1);
1272 /* Compute and store the entire 33-bit result at once. */
1273 d[i] = c + e1 + e2;
1274 }
1275 }
1276
1277 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1278 {
1279 intptr_t i, opr_sz = simd_oprsz(desc);
1280 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1281 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1282 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1283
1284 for (i = 0; i < opr_sz / 8; i += 2) {
1285 Int128 e1 = int128_make64(a[i]);
1286 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1287 Int128 c = int128_make64(m[i + 1] & 1);
1288 Int128 r = int128_add(int128_add(e1, e2), c);
1289 d[i + 0] = int128_getlo(r);
1290 d[i + 1] = int128_gethi(r);
1291 }
1292 }
1293
1294 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1295 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1296 { \
1297 intptr_t i, opr_sz = simd_oprsz(desc); \
1298 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1299 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1300 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1301 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1302 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1303 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1304 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1305 } \
1306 }
1307
1308 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1309 do_sqdmull_h, DO_SQADD_H)
1310 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1311 do_sqdmull_s, DO_SQADD_S)
1312 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1313 do_sqdmull_d, do_sqadd_d)
1314
1315 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1316 do_sqdmull_h, DO_SQSUB_H)
1317 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1318 do_sqdmull_s, DO_SQSUB_S)
1319 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1320 do_sqdmull_d, do_sqsub_d)
1321
1322 #undef DO_SQDMLAL
1323
1324 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1325 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1326 { \
1327 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1328 int rot = simd_data(desc); \
1329 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1330 bool sub_r = rot == 1 || rot == 2; \
1331 bool sub_i = rot >= 2; \
1332 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1333 for (i = 0; i < opr_sz; i += 2) { \
1334 TYPE elt1_a = n[H(i + sel_a)]; \
1335 TYPE elt2_a = m[H(i + sel_a)]; \
1336 TYPE elt2_b = m[H(i + sel_b)]; \
1337 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1338 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1339 } \
1340 }
1341
1342 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1343
1344 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1345 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1346 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1347 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1348
1349 #define DO_SQRDMLAH_B(N, M, A, S) \
1350 do_sqrdmlah_b(N, M, A, S, true)
1351 #define DO_SQRDMLAH_H(N, M, A, S) \
1352 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1353 #define DO_SQRDMLAH_S(N, M, A, S) \
1354 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1355 #define DO_SQRDMLAH_D(N, M, A, S) \
1356 do_sqrdmlah_d(N, M, A, S, true)
1357
1358 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1359 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1360 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1361 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1362
1363 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1364 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1365 { \
1366 intptr_t i, j, oprsz = simd_oprsz(desc); \
1367 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1368 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1369 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1370 bool sub_r = rot == 1 || rot == 2; \
1371 bool sub_i = rot >= 2; \
1372 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1373 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1374 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1375 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1376 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1377 TYPE elt1_a = n[H(i + j + sel_a)]; \
1378 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1379 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1380 } \
1381 } \
1382 }
1383
1384 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1385 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1386
1387 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1388 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1389
1390 #undef DO_CMLA
1391 #undef DO_CMLA_FUNC
1392 #undef DO_CMLA_IDX_FUNC
1393 #undef DO_SQRDMLAH_B
1394 #undef DO_SQRDMLAH_H
1395 #undef DO_SQRDMLAH_S
1396 #undef DO_SQRDMLAH_D
1397
1398 /* Note N and M are 4 elements bundled into one unit. */
1399 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1400 int sel_a, int sel_b, int sub_i)
1401 {
1402 for (int i = 0; i <= 1; i++) {
1403 int32_t elt1_r = (int8_t)(n >> (16 * i));
1404 int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1405 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1406 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1407
1408 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1409 }
1410 return a;
1411 }
1412
1413 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1414 int sel_a, int sel_b, int sub_i)
1415 {
1416 for (int i = 0; i <= 1; i++) {
1417 int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1418 int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1419 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1420 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1421
1422 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1423 }
1424 return a;
1425 }
1426
1427 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1428 void *va, uint32_t desc)
1429 {
1430 int opr_sz = simd_oprsz(desc);
1431 int rot = simd_data(desc);
1432 int sel_a = rot & 1;
1433 int sel_b = sel_a ^ 1;
1434 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1435 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1436
1437 for (int e = 0; e < opr_sz / 4; e++) {
1438 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1439 }
1440 }
1441
1442 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1443 void *va, uint32_t desc)
1444 {
1445 int opr_sz = simd_oprsz(desc);
1446 int rot = simd_data(desc);
1447 int sel_a = rot & 1;
1448 int sel_b = sel_a ^ 1;
1449 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1450 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1451
1452 for (int e = 0; e < opr_sz / 8; e++) {
1453 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1454 }
1455 }
1456
1457 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1458 void *va, uint32_t desc)
1459 {
1460 int opr_sz = simd_oprsz(desc);
1461 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1462 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1463 int sel_a = rot & 1;
1464 int sel_b = sel_a ^ 1;
1465 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1466 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1467
1468 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1469 uint32_t seg_m = m[seg + idx];
1470 for (int e = 0; e < 4; e++) {
1471 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1472 sel_a, sel_b, sub_i);
1473 }
1474 }
1475 }
1476
1477 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1478 void *va, uint32_t desc)
1479 {
1480 int seg, opr_sz = simd_oprsz(desc);
1481 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1482 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1483 int sel_a = rot & 1;
1484 int sel_b = sel_a ^ 1;
1485 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1486 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1487
1488 for (seg = 0; seg < opr_sz / 8; seg += 2) {
1489 uint64_t seg_m = m[seg + idx];
1490 for (int e = 0; e < 2; e++) {
1491 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1492 sel_a, sel_b, sub_i);
1493 }
1494 }
1495 }
1496
1497 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1498 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1499 { \
1500 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1501 intptr_t i, j, idx = simd_data(desc); \
1502 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1503 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1504 TYPE mm = m[i]; \
1505 for (j = 0; j < segment; j++) { \
1506 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1507 } \
1508 } \
1509 }
1510
1511 #define DO_SQRDMLAH_H(N, M, A) \
1512 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1513 #define DO_SQRDMLAH_S(N, M, A) \
1514 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1515 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1516
1517 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1518 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1519 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1520
1521 #define DO_SQRDMLSH_H(N, M, A) \
1522 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1523 #define DO_SQRDMLSH_S(N, M, A) \
1524 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1525 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1526
1527 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1528 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1529 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1530
1531 #undef DO_ZZXZ
1532
1533 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1534 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1535 { \
1536 intptr_t i, j, oprsz = simd_oprsz(desc); \
1537 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1538 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1539 for (i = 0; i < oprsz; i += 16) { \
1540 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1541 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1542 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1543 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1544 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1545 } \
1546 } \
1547 }
1548
1549 #define DO_MLA(N, M, A) (A + N * M)
1550
1551 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1552 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1553 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1554 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1555
1556 #define DO_MLS(N, M, A) (A - N * M)
1557
1558 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1559 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1560 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1561 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1562
1563 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1564 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1565
1566 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1567 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1568
1569 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1570 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1571
1572 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1573 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1574
1575 #undef DO_MLA
1576 #undef DO_MLS
1577 #undef DO_ZZXW
1578
1579 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1580 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1581 { \
1582 intptr_t i, j, oprsz = simd_oprsz(desc); \
1583 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1584 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1585 for (i = 0; i < oprsz; i += 16) { \
1586 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1587 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1588 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1589 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1590 } \
1591 } \
1592 }
1593
1594 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1595 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1596
1597 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1598 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1599
1600 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1601 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1602
1603 #undef DO_ZZX
1604
1605 #define DO_BITPERM(NAME, TYPE, OP) \
1606 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1607 { \
1608 intptr_t i, opr_sz = simd_oprsz(desc); \
1609 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1610 TYPE nn = *(TYPE *)(vn + i); \
1611 TYPE mm = *(TYPE *)(vm + i); \
1612 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1613 } \
1614 }
1615
1616 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1617 {
1618 uint64_t res = 0;
1619 int db, rb = 0;
1620
1621 for (db = 0; db < n; ++db) {
1622 if ((mask >> db) & 1) {
1623 res |= ((data >> db) & 1) << rb;
1624 ++rb;
1625 }
1626 }
1627 return res;
1628 }
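
/*
 * Worked example for bitextract (BEXT), informative only:
 * bitextract(0xb6, 0x6a, 8) tests mask bits 1, 3, 5, 6; the data bits
 * at those positions are 1, 0, 1, 0, and packing them LSB-first gives
 * 0b0101, so the result is 0x05.
 */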
1629
1630 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1631 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1632 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1633 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1634
1635 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1636 {
1637 uint64_t res = 0;
1638 int rb, db = 0;
1639
1640 for (rb = 0; rb < n; ++rb) {
1641 if ((mask >> rb) & 1) {
1642 res |= ((data >> db) & 1) << rb;
1643 ++db;
1644 }
1645 }
1646 return res;
1647 }
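
/*
 * Worked example for bitdeposit (BDEP), informative only:
 * bitdeposit(0x05, 0x6a, 8) scatters the low data bits 1, 0, 1, 0 into
 * the set mask positions 1, 3, 5, 6, giving 0x22.  Restricted to the
 * masked bits, this is the inverse of the bitextract example above.
 */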
1648
1649 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1650 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1651 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1652 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1653
1654 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1655 {
1656 uint64_t resm = 0, resu = 0;
1657 int db, rbm = 0, rbu = 0;
1658
1659 for (db = 0; db < n; ++db) {
1660 uint64_t val = (data >> db) & 1;
1661 if ((mask >> db) & 1) {
1662 resm |= val << rbm++;
1663 } else {
1664 resu |= val << rbu++;
1665 }
1666 }
1667
1668 return resm | (resu << rbm);
1669 }
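
/*
 * Worked example for bitgroup (BGRP), informative only:
 * bitgroup(0xb6, 0x6a, 8) packs the four masked bits (1, 0, 1, 0 from
 * positions 1, 3, 5, 6) into the low nibble and the four unmasked bits
 * (0, 1, 1, 1 from positions 0, 2, 4, 7) above them: 0x5 | (0xe << 4)
 * == 0xe5.
 */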
1670
1671 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1672 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1673 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1674 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1675
1676 #undef DO_BITPERM
1677
1678 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1679 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1680 { \
1681 intptr_t i, opr_sz = simd_oprsz(desc); \
1682 int sub_r = simd_data(desc); \
1683 if (sub_r) { \
1684 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1685 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1686 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1687 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1688 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1689 acc_r = ADD_OP(acc_r, el2_i); \
1690 acc_i = SUB_OP(acc_i, el2_r); \
1691 *(TYPE *)(vd + H(i)) = acc_r; \
1692 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1693 } \
1694 } else { \
1695 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1696 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1697 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1698 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1699 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1700 acc_r = SUB_OP(acc_r, el2_i); \
1701 acc_i = ADD_OP(acc_i, el2_r); \
1702 *(TYPE *)(vd + H(i)) = acc_r; \
1703 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1704 } \
1705 } \
1706 }
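
/*
 * Informative note (not from the original source): the two branches
 * above add the second operand rotated by a quarter turn.  With sub_r
 * clear, (el2_r, el2_i) contributes as (-el2_i, +el2_r), i.e. a +90
 * degree rotation; with sub_r set it contributes as (+el2_i, -el2_r),
 * i.e. a 270 degree rotation.  For example, with sub_r clear,
 * acc = (1, 2) and el2 = (3, 4) give (1 - 4, 2 + 3) == (-3, 5).
 * The mapping from the architectural rot immediate to sub_r is done
 * by the translator.
 */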
1707
1708 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1709 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1710 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1711 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1712
1713 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1714 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1715 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1716 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1717
1718 #undef DO_CADD
1719
1720 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1721 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1722 { \
1723 intptr_t i, opr_sz = simd_oprsz(desc); \
1724 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1725 int shift = simd_data(desc) >> 1; \
1726 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1727 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1728 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1729 } \
1730 }
1731
1732 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1733 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1734 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1735
1736 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1737 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1738 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1739
1740 #undef DO_ZZI_SHLL
1741
1742 /* Two-operand reduction expander, controlled by a predicate.
1743 * The difference between TYPERED and TYPERET has to do with
1744 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1745 * but TYPERET must be unsigned so that e.g. a 32-bit value
1746 * is not sign-extended to the ABI uint64_t return type.
1747 */
1748 /* ??? If we were to vectorize this by hand the reduction ordering
1749 * would change. For integer operands, this is perfectly fine.
1750 */
1751 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1752 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1753 { \
1754 intptr_t i, opr_sz = simd_oprsz(desc); \
1755 TYPERED ret = INIT; \
1756 for (i = 0; i < opr_sz; ) { \
1757 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1758 do { \
1759 if (pg & 1) { \
1760 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1761 ret = OP(ret, nn); \
1762 } \
1763 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1764 } while (i & 15); \
1765 } \
1766 return (TYPERET)ret; \
1767 }
1768
1769 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1770 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1771 { \
1772 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1773 TYPEE *n = vn; \
1774 uint8_t *pg = vg; \
1775 TYPER ret = INIT; \
1776 for (i = 0; i < opr_sz; i += 1) { \
1777 if (pg[H1(i)] & 1) { \
1778 TYPEE nn = n[i]; \
1779 ret = OP(ret, nn); \
1780 } \
1781 } \
1782 return ret; \
1783 }
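
/*
 * Informative note (not from the original source): the governing
 * predicate carries one bit per vector byte, so DO_VPZ reads 16
 * predicate bits per 128-bit segment and advances by sizeof(TYPEELT)
 * bits per element; only the bit covering an element's first byte is
 * tested.  E.g. for 32-bit elements, bits 0, 4, 8 and 12 of each
 * predicate halfword select the four elements of that segment.
 */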
1784
1785 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1786 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1787 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1788 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1789
1790 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1791 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1792 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1793 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1794
1795 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1796 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1797 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1798 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1799
1800 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1801 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1802 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1803
1804 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1805 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1806 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1807 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1808
1809 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1810 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1811 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1812 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1813
1814 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1815 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1816 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1817 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1818
1819 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1820 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1821 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1822 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1823
1824 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1825 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1826 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1827 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1828
1829 #undef DO_VPZ
1830 #undef DO_VPZ_D
1831
1832 /* Two vector operand, one scalar operand, unpredicated. */
1833 #define DO_ZZI(NAME, TYPE, OP) \
1834 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1835 { \
1836 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1837 TYPE s = s64, *d = vd, *n = vn; \
1838 for (i = 0; i < opr_sz; ++i) { \
1839 d[i] = OP(n[i], s); \
1840 } \
1841 }
1842
1843 #define DO_SUBR(X, Y) (Y - X)
1844
1845 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1846 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1847 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1848 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1849
1850 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1851 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1852 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1853 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1854
1855 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1856 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1857 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1858 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1859
1860 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1861 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1862 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1863 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1864
1865 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1866 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1867 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1868 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1869
1870 #undef DO_ZZI
1871
1872 #undef DO_AND
1873 #undef DO_ORR
1874 #undef DO_EOR
1875 #undef DO_BIC
1876 #undef DO_ADD
1877 #undef DO_SUB
1878 #undef DO_MAX
1879 #undef DO_MIN
1880 #undef DO_ABD
1881 #undef DO_MUL
1882 #undef DO_DIV
1883 #undef DO_ASR
1884 #undef DO_LSR
1885 #undef DO_LSL
1886 #undef DO_SUBR
1887
1888 /* Similar to the ARM LastActiveElement pseudocode function, except the
1889 result is multiplied by the element size. This includes the not found
1890 indication; e.g. not found for esz=3 is -8. */
1891 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1892 {
1893 uint64_t mask = pred_esz_masks[esz];
1894 intptr_t i = words;
1895
1896 do {
1897 uint64_t this_g = g[--i] & mask;
1898 if (this_g) {
1899 return i * 64 + (63 - clz64(this_g));
1900 }
1901 } while (i > 0);
1902 return (intptr_t)-1 << esz;
1903 }
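
/*
 * Worked example, informative only: with esz == 2 (word elements),
 * words == 1 and g[0] == 0x10011, the masked value keeps bits 0, 4
 * and 16, so the result is 16 -- the byte offset of the last active
 * element, i.e. element index 16 >> esz == 4.  With no bits set the
 * result is -1 << 2 == -4.
 */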
1904
1905 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1906 {
1907 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1908 uint32_t flags = PREDTEST_INIT;
1909 uint64_t *d = vd, *g = vg;
1910 intptr_t i = 0;
1911
1912 do {
1913 uint64_t this_d = d[i];
1914 uint64_t this_g = g[i];
1915
1916 if (this_g) {
1917 if (!(flags & 4)) {
1918 /* Set in D the first bit of G. */
1919 this_d |= this_g & -this_g;
1920 d[i] = this_d;
1921 }
1922 flags = iter_predtest_fwd(this_d, this_g, flags);
1923 }
1924 } while (++i < words);
1925
1926 return flags;
1927 }
1928
1929 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1930 {
1931 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1932 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1933 uint32_t flags = PREDTEST_INIT;
1934 uint64_t *d = vd, *g = vg, esz_mask;
1935 intptr_t i, next;
1936
1937 next = last_active_element(vd, words, esz) + (1 << esz);
1938 esz_mask = pred_esz_masks[esz];
1939
1940 /* Similar to the pseudocode for pnext, but scaled by ESZ
1941 so that we find the correct bit. */
1942 if (next < words * 64) {
1943 uint64_t mask = -1;
1944
1945 if (next & 63) {
1946 mask = ~((1ull << (next & 63)) - 1);
1947 next &= -64;
1948 }
1949 do {
1950 uint64_t this_g = g[next / 64] & esz_mask & mask;
1951 if (this_g != 0) {
1952 next = (next & -64) + ctz64(this_g);
1953 break;
1954 }
1955 next += 64;
1956 mask = -1;
1957 } while (next < words * 64);
1958 }
1959
1960 i = 0;
1961 do {
1962 uint64_t this_d = 0;
1963 if (i == next / 64) {
1964 this_d = 1ull << (next & 63);
1965 }
1966 d[i] = this_d;
1967 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1968 } while (++i < words);
1969
1970 return flags;
1971 }
1972
1973 /*
1974 * Copy Zn into Zd, and store zero into inactive elements.
1975 * If inv, store zeros into the active elements.
1976 */
1977 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1978 {
1979 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1980 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1981 uint64_t *d = vd, *n = vn;
1982 uint8_t *pg = vg;
1983
1984 for (i = 0; i < opr_sz; i += 1) {
1985 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1986 }
1987 }
1988
1989 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1990 {
1991 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1992 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1993 uint64_t *d = vd, *n = vn;
1994 uint8_t *pg = vg;
1995
1996 for (i = 0; i < opr_sz; i += 1) {
1997 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1998 }
1999 }
2000
2001 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2002 {
2003 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2004 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2005 uint64_t *d = vd, *n = vn;
2006 uint8_t *pg = vg;
2007
2008 for (i = 0; i < opr_sz; i += 1) {
2009 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2010 }
2011 }
2012
2013 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2014 {
2015 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2016 uint64_t *d = vd, *n = vn;
2017 uint8_t *pg = vg;
2018 uint8_t inv = simd_data(desc);
2019
2020 for (i = 0; i < opr_sz; i += 1) {
2021 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2022 }
2023 }
2024
2025 /* Three-operand expander, immediate operand, controlled by a predicate.
2026 */
2027 #define DO_ZPZI(NAME, TYPE, H, OP) \
2028 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2029 { \
2030 intptr_t i, opr_sz = simd_oprsz(desc); \
2031 TYPE imm = simd_data(desc); \
2032 for (i = 0; i < opr_sz; ) { \
2033 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2034 do { \
2035 if (pg & 1) { \
2036 TYPE nn = *(TYPE *)(vn + H(i)); \
2037 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
2038 } \
2039 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2040 } while (i & 15); \
2041 } \
2042 }
2043
2044 /* Similarly, specialized for 64-bit operands. */
2045 #define DO_ZPZI_D(NAME, TYPE, OP) \
2046 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2047 { \
2048 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2049 TYPE *d = vd, *n = vn; \
2050 TYPE imm = simd_data(desc); \
2051 uint8_t *pg = vg; \
2052 for (i = 0; i < opr_sz; i += 1) { \
2053 if (pg[H1(i)] & 1) { \
2054 TYPE nn = n[i]; \
2055 d[i] = OP(nn, imm); \
2056 } \
2057 } \
2058 }
2059
2060 #define DO_SHR(N, M) (N >> M)
2061 #define DO_SHL(N, M) (N << M)
2062
2063 /* Arithmetic shift right for division. This rounds negative numbers
2064 toward zero as per signed division. Therefore before shifting,
2065 when N is negative, add 2**M-1. */
2066 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
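
/*
 * Worked example for DO_ASRD, informative only: DO_ASRD(-7, 2) adds
 * the bias (1 << 2) - 1 == 3 before shifting, giving (-4) >> 2 == -1,
 * which matches -7 / 4 truncated toward zero; a plain arithmetic
 * shift would have produced -2.
 */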
2067
2068 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2069 {
2070 if (likely(sh < 64)) {
2071 return (x >> sh) + ((x >> (sh - 1)) & 1);
2072 } else if (sh == 64) {
2073 return x >> 63;
2074 } else {
2075 return 0;
2076 }
2077 }
2078
2079 static inline int64_t do_srshr(int64_t x, unsigned sh)
2080 {
2081 if (likely(sh < 64)) {
2082 return (x >> sh) + ((x >> (sh - 1)) & 1);
2083 } else {
2084 /* Rounding the sign bit always produces 0. */
2085 return 0;
2086 }
2087 }
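
/*
 * Worked examples, informative only: do_urshr(7, 1) == (7 >> 1) +
 * ((7 >> 0) & 1) == 4, i.e. 3.5 rounded up; do_urshr(x, 64) keeps only
 * the carry out of the top bit, x >> 63.  Similarly do_srshr(-5, 1)
 * == -3 + 1 == -2, the same as (-5 + 1) >> 1.
 */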
2088
2089 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2090 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2091 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2092 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2093
2094 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2095 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2096 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2097 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2098
2099 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2100 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2101 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2102 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2103
2104 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2105 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2106 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2107 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2108
2109 /* SVE2 bitwise shift by immediate */
2110 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2111 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2112 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2113 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2114
2115 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2116 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2117 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2118 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2119
2120 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2121 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2122 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2123 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2124
2125 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2126 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2127 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2128 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2129
2130 #define do_suqrshl_b(n, m) \
2131 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2132 #define do_suqrshl_h(n, m) \
2133 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2134 #define do_suqrshl_s(n, m) \
2135 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2136 #define do_suqrshl_d(n, m) \
2137 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2138
2139 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2140 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2141 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2142 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2143
2144 #undef DO_ASRD
2145 #undef DO_ZPZI
2146 #undef DO_ZPZI_D
2147
2148 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2149 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2150 { \
2151 intptr_t i, opr_sz = simd_oprsz(desc); \
2152 int shift = simd_data(desc); \
2153 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2154 TYPEW nn = *(TYPEW *)(vn + i); \
2155 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2156 } \
2157 }
2158
2159 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2160 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2161 { \
2162 intptr_t i, opr_sz = simd_oprsz(desc); \
2163 int shift = simd_data(desc); \
2164 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2165 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2166 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2167 } \
2168 }
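
/*
 * Informative note (not from the original source): the "B" (bottom)
 * forms store the narrowed result as a full TYPEW, so the odd-numbered
 * narrow element is zeroed; the "T" (top) forms store only the
 * odd-numbered narrow element and leave the even one in Zd intact.
 * E.g. sve2_shrnb_h with shift 4 turns 0x1234 into 0x0023, while
 * sve2_shrnt_h writes 0x23 into the high byte of the 16-bit container.
 */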
2169
2170 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2171 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2172 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2173
2174 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2175 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2176 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2177
2178 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2179 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2180 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2181
2182 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2183 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2184 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2185
2186 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2187 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2188 #define DO_SQSHRUN_D(x, sh) \
2189 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2190
2191 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2192 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2193 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2194
2195 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2196 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2197 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2198
2199 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2200 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2201 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2202
2203 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2204 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2205 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2206
2207 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2208 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2209 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2210
2211 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2212 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2213 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2214
2215 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2216 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2217 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2218
2219 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2220 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2221 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2222
2223 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2224 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2225 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2226
2227 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2228 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2229 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2230
2231 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2232 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2233 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2234
2235 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2236 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2237 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2238
2239 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2240 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2241 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2242
2243 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2244 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2245 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2246
2247 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2248 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2249 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2250
2251 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2252 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2253 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2254
2255 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2256 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2257 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2258
2259 #undef DO_SHRNB
2260 #undef DO_SHRNT
2261
2262 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2263 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2264 { \
2265 intptr_t i, opr_sz = simd_oprsz(desc); \
2266 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2267 TYPEW nn = *(TYPEW *)(vn + i); \
2268 TYPEW mm = *(TYPEW *)(vm + i); \
2269 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2270 } \
2271 }
2272
2273 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2274 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2275 { \
2276 intptr_t i, opr_sz = simd_oprsz(desc); \
2277 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2278 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2279 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2280 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2281 } \
2282 }
2283
2284 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2285 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2286 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2287 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
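
/*
 * Worked example, informative only: for the halfword forms SHIFT is 8,
 * so with N == 0x1200 and M == 0x0080, DO_ADDHN gives 0x1280 >> 8 ==
 * 0x12 while DO_RADDHN first adds the rounding constant 0x80 and gives
 * 0x1300 >> 8 == 0x13.
 */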
2288
2289 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2290 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2291 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2292
2293 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2294 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2295 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2296
2297 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2298 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2299 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2300
2301 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2302 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2303 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2304
2305 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2306 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2307 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2308
2309 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2310 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2311 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2312
2313 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2314 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2315 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2316
2317 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2318 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2319 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2320
2321 #undef DO_RSUBHN
2322 #undef DO_SUBHN
2323 #undef DO_RADDHN
2324 #undef DO_ADDHN
2325
2326 #undef DO_BINOPNB
2327
2328 /* Fully general four-operand expander, controlled by a predicate.
2329 */
2330 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2331 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2332 void *vg, uint32_t desc) \
2333 { \
2334 intptr_t i, opr_sz = simd_oprsz(desc); \
2335 for (i = 0; i < opr_sz; ) { \
2336 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2337 do { \
2338 if (pg & 1) { \
2339 TYPE nn = *(TYPE *)(vn + H(i)); \
2340 TYPE mm = *(TYPE *)(vm + H(i)); \
2341 TYPE aa = *(TYPE *)(va + H(i)); \
2342 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2343 } \
2344 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2345 } while (i & 15); \
2346 } \
2347 }
2348
2349 /* Similarly, specialized for 64-bit operands. */
2350 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2351 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2352 void *vg, uint32_t desc) \
2353 { \
2354 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2355 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2356 uint8_t *pg = vg; \
2357 for (i = 0; i < opr_sz; i += 1) { \
2358 if (pg[H1(i)] & 1) { \
2359 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2360 d[i] = OP(aa, nn, mm); \
2361 } \
2362 } \
2363 }
2364
2365 #define DO_MLA(A, N, M) (A + N * M)
2366 #define DO_MLS(A, N, M) (A - N * M)
2367
2368 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2369 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2370
2371 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2372 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2373
2374 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2375 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2376
2377 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2378 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2379
2380 #undef DO_MLA
2381 #undef DO_MLS
2382 #undef DO_ZPZZZ
2383 #undef DO_ZPZZZ_D
2384
2385 void HELPER(sve_index_b)(void *vd, uint32_t start,
2386 uint32_t incr, uint32_t desc)
2387 {
2388 intptr_t i, opr_sz = simd_oprsz(desc);
2389 uint8_t *d = vd;
2390 for (i = 0; i < opr_sz; i += 1) {
2391 d[H1(i)] = start + i * incr;
2392 }
2393 }
2394
2395 void HELPER(sve_index_h)(void *vd, uint32_t start,
2396 uint32_t incr, uint32_t desc)
2397 {
2398 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2399 uint16_t *d = vd;
2400 for (i = 0; i < opr_sz; i += 1) {
2401 d[H2(i)] = start + i * incr;
2402 }
2403 }
2404
2405 void HELPER(sve_index_s)(void *vd, uint32_t start,
2406 uint32_t incr, uint32_t desc)
2407 {
2408 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2409 uint32_t *d = vd;
2410 for (i = 0; i < opr_sz; i += 1) {
2411 d[H4(i)] = start + i * incr;
2412 }
2413 }
2414
2415 void HELPER(sve_index_d)(void *vd, uint64_t start,
2416 uint64_t incr, uint32_t desc)
2417 {
2418 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2419 uint64_t *d = vd;
2420 for (i = 0; i < opr_sz; i += 1) {
2421 d[i] = start + i * incr;
2422 }
2423 }
2424
2425 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2426 {
2427 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2428 uint32_t sh = simd_data(desc);
2429 uint32_t *d = vd, *n = vn, *m = vm;
2430 for (i = 0; i < opr_sz; i += 1) {
2431 d[i] = n[i] + (m[i] << sh);
2432 }
2433 }
2434
2435 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2436 {
2437 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2438 uint64_t sh = simd_data(desc);
2439 uint64_t *d = vd, *n = vn, *m = vm;
2440 for (i = 0; i < opr_sz; i += 1) {
2441 d[i] = n[i] + (m[i] << sh);
2442 }
2443 }
2444
2445 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2446 {
2447 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2448 uint64_t sh = simd_data(desc);
2449 uint64_t *d = vd, *n = vn, *m = vm;
2450 for (i = 0; i < opr_sz; i += 1) {
2451 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2452 }
2453 }
2454
2455 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2456 {
2457 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2458 uint64_t sh = simd_data(desc);
2459 uint64_t *d = vd, *n = vn, *m = vm;
2460 for (i = 0; i < opr_sz; i += 1) {
2461 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2462 }
2463 }
2464
2465 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2466 {
2467 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2468 static const uint16_t coeff[] = {
2469 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2470 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2471 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2472 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2473 };
2474 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2475 uint16_t *d = vd, *n = vn;
2476
2477 for (i = 0; i < opr_sz; i++) {
2478 uint16_t nn = n[i];
2479 intptr_t idx = extract32(nn, 0, 5);
2480 uint16_t exp = extract32(nn, 5, 5);
2481 d[i] = coeff[idx] | (exp << 10);
2482 }
2483 }
2484
2485 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2486 {
2487 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2488 static const uint32_t coeff[] = {
2489 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2490 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2491 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2492 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2493 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2494 0x1ef532, 0x20b051, 0x227043, 0x243516,
2495 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2496 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2497 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2498 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2499 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2500 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2501 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2502 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2503 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2504 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2505 };
2506 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2507 uint32_t *d = vd, *n = vn;
2508
2509 for (i = 0; i < opr_sz; i++) {
2510 uint32_t nn = n[i];
2511 intptr_t idx = extract32(nn, 0, 6);
2512 uint32_t exp = extract32(nn, 6, 8);
2513 d[i] = coeff[idx] | (exp << 23);
2514 }
2515 }
2516
2517 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2518 {
2519 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2520 static const uint64_t coeff[] = {
2521 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2522 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2523 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2524 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2525 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2526 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2527 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2528 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2529 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2530 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2531 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2532 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2533 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2534 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2535 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2536 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2537 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2538 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2539 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2540 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2541 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2542 0xFA7C1819E90D8ull,
2543 };
2544 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2545 uint64_t *d = vd, *n = vn;
2546
2547 for (i = 0; i < opr_sz; i++) {
2548 uint64_t nn = n[i];
2549 intptr_t idx = extract32(nn, 0, 6);
2550 uint64_t exp = extract32(nn, 6, 11);
2551 d[i] = coeff[idx] | (exp << 52);
2552 }
2553 }
2554
2555 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2556 {
2557 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2558 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2559 uint16_t *d = vd, *n = vn, *m = vm;
2560 for (i = 0; i < opr_sz; i += 1) {
2561 uint16_t nn = n[i];
2562 uint16_t mm = m[i];
2563 if (mm & 1) {
2564 nn = float16_one;
2565 }
2566 if (mm & 2) {
2567 nn = float16_maybe_ah_chs(nn, fpcr_ah);
2568 }
2569 d[i] = nn;
2570 }
2571 }
2572
2573 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2574 {
2575 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2576 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2577 uint32_t *d = vd, *n = vn, *m = vm;
2578 for (i = 0; i < opr_sz; i += 1) {
2579 uint32_t nn = n[i];
2580 uint32_t mm = m[i];
2581 if (mm & 1) {
2582 nn = float32_one;
2583 }
2584 if (mm & 2) {
2585 nn = float32_maybe_ah_chs(nn, fpcr_ah);
2586 }
2587 d[i] = nn;
2588 }
2589 }
2590
2591 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2592 {
2593 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2594 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2595 uint64_t *d = vd, *n = vn, *m = vm;
2596 for (i = 0; i < opr_sz; i += 1) {
2597 uint64_t nn = n[i];
2598 uint64_t mm = m[i];
2599 if (mm & 1) {
2600 nn = float64_one;
2601 }
2602 if (mm & 2) {
2603 nn = float64_maybe_ah_chs(nn, fpcr_ah);
2604 }
2605 d[i] = nn;
2606 }
2607 }
2608
2609 /*
2610 * Signed saturating addition with scalar operand.
2611 */
2612
2613 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2614 {
2615 intptr_t i, oprsz = simd_oprsz(desc);
2616
2617 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2618 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2619 }
2620 }
2621
2622 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2623 {
2624 intptr_t i, oprsz = simd_oprsz(desc);
2625
2626 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2627 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2628 }
2629 }
2630
2631 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2632 {
2633 intptr_t i, oprsz = simd_oprsz(desc);
2634
2635 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2636 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2637 }
2638 }
2639
2640 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2641 {
2642 intptr_t i, oprsz = simd_oprsz(desc);
2643
2644 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2645 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2646 }
2647 }
2648
2649 /*
2650 * Unsigned saturating addition with scalar operand.
2651 */
2652
2653 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2654 {
2655 intptr_t i, oprsz = simd_oprsz(desc);
2656
2657 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2658 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2659 }
2660 }
2661
2662 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2663 {
2664 intptr_t i, oprsz = simd_oprsz(desc);
2665
2666 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2667 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2668 }
2669 }
2670
2671 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2672 {
2673 intptr_t i, oprsz = simd_oprsz(desc);
2674
2675 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2676 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2677 }
2678 }
2679
2680 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2681 {
2682 intptr_t i, oprsz = simd_oprsz(desc);
2683
2684 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2685 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2686 }
2687 }
2688
2689 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2690 {
2691 intptr_t i, oprsz = simd_oprsz(desc);
2692
2693 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2694 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2695 }
2696 }
2697
2698 /* Two operand predicated copy immediate with merge. All valid immediates
2699 * can fit within 17 signed bits in the simd_data field.
2700 */
2701 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2702 uint64_t mm, uint32_t desc)
2703 {
2704 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2705 uint64_t *d = vd, *n = vn;
2706 uint8_t *pg = vg;
2707
2708 mm = dup_const(MO_8, mm);
2709 for (i = 0; i < opr_sz; i += 1) {
2710 uint64_t nn = n[i];
2711 uint64_t pp = expand_pred_b(pg[H1(i)]);
2712 d[i] = (mm & pp) | (nn & ~pp);
2713 }
2714 }
2715
2716 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2717 uint64_t mm, uint32_t desc)
2718 {
2719 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2720 uint64_t *d = vd, *n = vn;
2721 uint8_t *pg = vg;
2722
2723 mm = dup_const(MO_16, mm);
2724 for (i = 0; i < opr_sz; i += 1) {
2725 uint64_t nn = n[i];
2726 uint64_t pp = expand_pred_h(pg[H1(i)]);
2727 d[i] = (mm & pp) | (nn & ~pp);
2728 }
2729 }
2730
2731 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2732 uint64_t mm, uint32_t desc)
2733 {
2734 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2735 uint64_t *d = vd, *n = vn;
2736 uint8_t *pg = vg;
2737
2738 mm = dup_const(MO_32, mm);
2739 for (i = 0; i < opr_sz; i += 1) {
2740 uint64_t nn = n[i];
2741 uint64_t pp = expand_pred_s(pg[H1(i)]);
2742 d[i] = (mm & pp) | (nn & ~pp);
2743 }
2744 }
2745
2746 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2747 uint64_t mm, uint32_t desc)
2748 {
2749 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2750 uint64_t *d = vd, *n = vn;
2751 uint8_t *pg = vg;
2752
2753 for (i = 0; i < opr_sz; i += 1) {
2754 uint64_t nn = n[i];
2755 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2756 }
2757 }
2758
2759 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2760 {
2761 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2762 uint64_t *d = vd;
2763 uint8_t *pg = vg;
2764
2765 val = dup_const(MO_8, val);
2766 for (i = 0; i < opr_sz; i += 1) {
2767 d[i] = val & expand_pred_b(pg[H1(i)]);
2768 }
2769 }
2770
2771 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2772 {
2773 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2774 uint64_t *d = vd;
2775 uint8_t *pg = vg;
2776
2777 val = dup_const(MO_16, val);
2778 for (i = 0; i < opr_sz; i += 1) {
2779 d[i] = val & expand_pred_h(pg[H1(i)]);
2780 }
2781 }
2782
2783 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2784 {
2785 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2786 uint64_t *d = vd;
2787 uint8_t *pg = vg;
2788
2789 val = dup_const(MO_32, val);
2790 for (i = 0; i < opr_sz; i += 1) {
2791 d[i] = val & expand_pred_s(pg[H1(i)]);
2792 }
2793 }
2794
2795 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2796 {
2797 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2798 uint64_t *d = vd;
2799 uint8_t *pg = vg;
2800
2801 for (i = 0; i < opr_sz; i += 1) {
2802 d[i] = (pg[H1(i)] & 1 ? val : 0);
2803 }
2804 }
2805
2806 /* Big-endian hosts need to frob the byte indices. If the copy
2807 * happens to be 8-byte aligned, then no frobbing necessary.
2808 */
2809 static void swap_memmove(void *vd, void *vs, size_t n)
2810 {
2811 uintptr_t d = (uintptr_t)vd;
2812 uintptr_t s = (uintptr_t)vs;
2813 uintptr_t o = (d | s | n) & 7;
2814 size_t i;
2815
2816 #if !HOST_BIG_ENDIAN
2817 o = 0;
2818 #endif
2819 switch (o) {
2820 case 0:
2821 memmove(vd, vs, n);
2822 break;
2823
2824 case 4:
2825 if (d < s || d >= s + n) {
2826 for (i = 0; i < n; i += 4) {
2827 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2828 }
2829 } else {
2830 for (i = n; i > 0; ) {
2831 i -= 4;
2832 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2833 }
2834 }
2835 break;
2836
2837 case 2:
2838 case 6:
2839 if (d < s || d >= s + n) {
2840 for (i = 0; i < n; i += 2) {
2841 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2842 }
2843 } else {
2844 for (i = n; i > 0; ) {
2845 i -= 2;
2846 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2847 }
2848 }
2849 break;
2850
2851 default:
2852 if (d < s || d >= s + n) {
2853 for (i = 0; i < n; i++) {
2854 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2855 }
2856 } else {
2857 for (i = n; i > 0; ) {
2858 i -= 1;
2859 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2860 }
2861 }
2862 break;
2863 }
2864 }
2865
2866 /* Similarly for memset of 0. */
2867 static void swap_memzero(void *vd, size_t n)
2868 {
2869 uintptr_t d = (uintptr_t)vd;
2870 uintptr_t o = (d | n) & 7;
2871 size_t i;
2872
2873 /* Usually, the first bit of a predicate is set, so N is 0. */
2874 if (likely(n == 0)) {
2875 return;
2876 }
2877
2878 #if !HOST_BIG_ENDIAN
2879 o = 0;
2880 #endif
2881 switch (o) {
2882 case 0:
2883 memset(vd, 0, n);
2884 break;
2885
2886 case 4:
2887 for (i = 0; i < n; i += 4) {
2888 *(uint32_t *)H1_4(d + i) = 0;
2889 }
2890 break;
2891
2892 case 2:
2893 case 6:
2894 for (i = 0; i < n; i += 2) {
2895 *(uint16_t *)H1_2(d + i) = 0;
2896 }
2897 break;
2898
2899 default:
2900 for (i = 0; i < n; i++) {
2901 *(uint8_t *)H1(d + i) = 0;
2902 }
2903 break;
2904 }
2905 }
2906
2907 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2908 {
2909 intptr_t opr_sz = simd_oprsz(desc);
2910 size_t n_ofs = simd_data(desc);
2911 size_t n_siz = opr_sz - n_ofs;
2912
2913 if (vd != vm) {
2914 swap_memmove(vd, vn + n_ofs, n_siz);
2915 swap_memmove(vd + n_siz, vm, n_ofs);
2916 } else if (vd != vn) {
2917 swap_memmove(vd + n_siz, vd, n_ofs);
2918 swap_memmove(vd, vn + n_ofs, n_siz);
2919 } else {
2920 /* vd == vn == vm. Need temp space. */
2921 ARMVectorReg tmp;
2922 swap_memmove(&tmp, vm, n_ofs);
2923 swap_memmove(vd, vd + n_ofs, n_siz);
2924 memcpy(vd + n_siz, &tmp, n_ofs);
2925 }
2926 }
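
/*
 * Worked example, informative only: for a 16-byte vector and an
 * element offset of 3, the result is bytes 3..15 of Zn followed by
 * bytes 0..2 of Zm, matching the concatenate-and-extract semantics
 * of EXT.
 */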
2927
2928 #define DO_INSR(NAME, TYPE, H) \
2929 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2930 { \
2931 intptr_t opr_sz = simd_oprsz(desc); \
2932 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2933 *(TYPE *)(vd + H(0)) = val; \
2934 }
2935
2936 DO_INSR(sve_insr_b, uint8_t, H1)
2937 DO_INSR(sve_insr_h, uint16_t, H1_2)
2938 DO_INSR(sve_insr_s, uint32_t, H1_4)
2939 DO_INSR(sve_insr_d, uint64_t, H1_8)
2940
2941 #undef DO_INSR
2942
2943 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2944 {
2945 intptr_t i, j, opr_sz = simd_oprsz(desc);
2946 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2947 uint64_t f = *(uint64_t *)(vn + i);
2948 uint64_t b = *(uint64_t *)(vn + j);
2949 *(uint64_t *)(vd + i) = bswap64(b);
2950 *(uint64_t *)(vd + j) = bswap64(f);
2951 }
2952 }
2953
2954 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2955 {
2956 intptr_t i, j, opr_sz = simd_oprsz(desc);
2957 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2958 uint64_t f = *(uint64_t *)(vn + i);
2959 uint64_t b = *(uint64_t *)(vn + j);
2960 *(uint64_t *)(vd + i) = hswap64(b);
2961 *(uint64_t *)(vd + j) = hswap64(f);
2962 }
2963 }
2964
2965 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2966 {
2967 intptr_t i, j, opr_sz = simd_oprsz(desc);
2968 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2969 uint64_t f = *(uint64_t *)(vn + i);
2970 uint64_t b = *(uint64_t *)(vn + j);
2971 *(uint64_t *)(vd + i) = rol64(b, 32);
2972 *(uint64_t *)(vd + j) = rol64(f, 32);
2973 }
2974 }
2975
2976 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2977 {
2978 intptr_t i, j, opr_sz = simd_oprsz(desc);
2979 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2980 uint64_t f = *(uint64_t *)(vn + i);
2981 uint64_t b = *(uint64_t *)(vn + j);
2982 *(uint64_t *)(vd + i) = b;
2983 *(uint64_t *)(vd + j) = f;
2984 }
2985 }
2986
2987 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2988
2989 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2990 bool is_tbx, tb_impl_fn *fn)
2991 {
2992 ARMVectorReg scratch;
2993 uintptr_t oprsz = simd_oprsz(desc);
2994
2995 if (unlikely(vd == vn)) {
2996 vn = memcpy(&scratch, vn, oprsz);
2997 }
2998
2999 fn(vd, vn, NULL, vm, oprsz, is_tbx);
3000 }
3001
3002 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3003 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3004 {
3005 ARMVectorReg scratch;
3006 uintptr_t oprsz = simd_oprsz(desc);
3007
3008 if (unlikely(vd == vn0)) {
3009 vn0 = memcpy(&scratch, vn0, oprsz);
3010 if (vd == vn1) {
3011 vn1 = vn0;
3012 }
3013 } else if (unlikely(vd == vn1)) {
3014 vn1 = memcpy(&scratch, vn1, oprsz);
3015 }
3016
3017 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3018 }
3019
3020 #define DO_TB(SUFF, TYPE, H) \
3021 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
3022 void *vm, uintptr_t oprsz, bool is_tbx) \
3023 { \
3024 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
3025 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
3026 for (i = 0; i < nelem; ++i) { \
3027 TYPE index = indexes[H1(i)], val = 0; \
3028 if (index < nelem) { \
3029 val = tbl0[H(index)]; \
3030 } else { \
3031 index -= nelem; \
3032 if (tbl1 && index < nelem) { \
3033 val = tbl1[H(index)]; \
3034 } else if (is_tbx) { \
3035 continue; \
3036 } \
3037 } \
3038 d[H(i)] = val; \
3039 } \
3040 } \
3041 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3042 { \
3043 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3044 } \
3045 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3046 void *vm, uint32_t desc) \
3047 { \
3048 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3049 } \
3050 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3051 { \
3052 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3053 }
3054
3055 DO_TB(b, uint8_t, H1)
3056 DO_TB(h, uint16_t, H2)
3057 DO_TB(s, uint32_t, H4)
3058 DO_TB(d, uint64_t, H8)
3059
3060 #undef DO_TB
3061
3062 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3063 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3064 { \
3065 intptr_t i, opr_sz = simd_oprsz(desc); \
3066 TYPED *d = vd; \
3067 TYPES *n = vn; \
3068 ARMVectorReg tmp; \
3069 if (unlikely(vn - vd < opr_sz)) { \
3070 n = memcpy(&tmp, n, opr_sz / 2); \
3071 } \
3072 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3073 d[HD(i)] = n[HS(i)]; \
3074 } \
3075 }
3076
3077 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3078 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3079 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3080
3081 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3082 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3083 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3084
3085 #undef DO_UNPK
3086
3087 /* Mask of bits included in the even numbered predicates of width esz.
3088 * We also use this for expand_bits/compress_bits, and so extend the
3089 * same pattern out to 16-bit units.
3090 */
3091 static const uint64_t even_bit_esz_masks[5] = {
3092 0x5555555555555555ull,
3093 0x3333333333333333ull,
3094 0x0f0f0f0f0f0f0f0full,
3095 0x00ff00ff00ff00ffull,
3096 0x0000ffff0000ffffull,
3097 };
3098
3099 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3100 * For N==0, this corresponds to the operation that in qemu/bitops.h
3101 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3102 * section 7-2 Shuffling Bits.
3103 */
3104 static uint64_t expand_bits(uint64_t x, int n)
3105 {
3106 int i;
3107
3108 x &= 0xffffffffu;
3109 for (i = 4; i >= n; i--) {
3110 int sh = 1 << i;
3111 x = ((x << sh) | x) & even_bit_esz_masks[i];
3112 }
3113 return x;
3114 }
3115
3116 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3117 * For N==0, this corresponds to the operation that in qemu/bitops.h
3118 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3119 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3120 */
3121 static uint64_t compress_bits(uint64_t x, int n)
3122 {
3123 int i;
3124
3125 for (i = n; i <= 4; i++) {
3126 int sh = 1 << i;
3127 x &= even_bit_esz_masks[i];
3128 x = (x >> sh) | x;
3129 }
3130 return x & 0xffffffffu;
3131 }
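
/*
 * Worked example, informative only: expand_bits(0xb, 0) interleaves
 * zeros between the bits of 1011, giving 0b01000101 == 0x45, and
 * compress_bits(0x45, 0) == 0xb undoes it.  For n == 1 the same
 * routines operate on 2-bit units, and so on.
 */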
3132
3133 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3134 {
3135 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3136 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3137 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3138 int esize = 1 << esz;
3139 uint64_t *d = vd;
3140 intptr_t i;
3141
3142 if (oprsz <= 8) {
3143 uint64_t nn = *(uint64_t *)vn;
3144 uint64_t mm = *(uint64_t *)vm;
3145 int half = 4 * oprsz;
3146
3147 nn = extract64(nn, high * half, half);
3148 mm = extract64(mm, high * half, half);
3149 nn = expand_bits(nn, esz);
3150 mm = expand_bits(mm, esz);
3151 d[0] = nn | (mm << esize);
3152 } else {
3153 ARMPredicateReg tmp;
3154
3155 /* We produce output faster than we consume input.
3156 Therefore we must be mindful of possible overlap. */
3157 if (vd == vn) {
3158 vn = memcpy(&tmp, vn, oprsz);
3159 if (vd == vm) {
3160 vm = vn;
3161 }
3162 } else if (vd == vm) {
3163 vm = memcpy(&tmp, vm, oprsz);
3164 }
3165 if (high) {
3166 high = oprsz >> 1;
3167 }
3168
3169 if ((oprsz & 7) == 0) {
3170 uint32_t *n = vn, *m = vm;
3171 high >>= 2;
3172
3173 for (i = 0; i < oprsz / 8; i++) {
3174 uint64_t nn = n[H4(high + i)];
3175 uint64_t mm = m[H4(high + i)];
3176
3177 nn = expand_bits(nn, esz);
3178 mm = expand_bits(mm, esz);
3179 d[i] = nn | (mm << esize);
3180 }
3181 } else {
3182 uint8_t *n = vn, *m = vm;
3183 uint16_t *d16 = vd;
3184
3185 for (i = 0; i < oprsz / 2; i++) {
3186 uint16_t nn = n[H1(high + i)];
3187 uint16_t mm = m[H1(high + i)];
3188
3189 nn = expand_bits(nn, esz);
3190 mm = expand_bits(mm, esz);
3191 d16[H2(i)] = nn | (mm << esize);
3192 }
3193 }
3194 }
3195 }
3196
3197 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3198 {
3199 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3200 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3201 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3202 uint64_t *d = vd, *n = vn, *m = vm;
3203 uint64_t l, h;
3204 intptr_t i;
3205
3206 if (oprsz <= 8) {
3207 l = compress_bits(n[0] >> odd, esz);
3208 h = compress_bits(m[0] >> odd, esz);
3209 d[0] = l | (h << (4 * oprsz));
3210 } else {
3211 ARMPredicateReg tmp_m;
3212 intptr_t oprsz_16 = oprsz / 16;
3213
3214 if ((vm - vd) < (uintptr_t)oprsz) {
3215 m = memcpy(&tmp_m, vm, oprsz);
3216 }
3217
3218 for (i = 0; i < oprsz_16; i++) {
3219 l = n[2 * i + 0];
3220 h = n[2 * i + 1];
3221 l = compress_bits(l >> odd, esz);
3222 h = compress_bits(h >> odd, esz);
3223 d[i] = l | (h << 32);
3224 }
3225
3226 /*
3227 * For VL which is not a multiple of 512, the results from M do not
3228 * align nicely with the uint64_t for D. Put the aligned results
3229 * from M into TMP_M and then copy it into place afterward.
3230 */
3231 if (oprsz & 15) {
3232 int final_shift = (oprsz & 15) * 2;
3233
3234 l = n[2 * i + 0];
3235 h = n[2 * i + 1];
3236 l = compress_bits(l >> odd, esz);
3237 h = compress_bits(h >> odd, esz);
3238 d[i] = l | (h << final_shift);
3239
3240 for (i = 0; i < oprsz_16; i++) {
3241 l = m[2 * i + 0];
3242 h = m[2 * i + 1];
3243 l = compress_bits(l >> odd, esz);
3244 h = compress_bits(h >> odd, esz);
3245 tmp_m.p[i] = l | (h << 32);
3246 }
3247 l = m[2 * i + 0];
3248 h = m[2 * i + 1];
3249 l = compress_bits(l >> odd, esz);
3250 h = compress_bits(h >> odd, esz);
3251 tmp_m.p[i] = l | (h << final_shift);
3252
3253 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3254 } else {
3255 for (i = 0; i < oprsz_16; i++) {
3256 l = m[2 * i + 0];
3257 h = m[2 * i + 1];
3258 l = compress_bits(l >> odd, esz);
3259 h = compress_bits(h >> odd, esz);
3260 d[oprsz_16 + i] = l | (h << 32);
3261 }
3262 }
3263 }
3264 }
3265
3266 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3267 {
3268 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3269 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3270 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3271 uint64_t *d = vd, *n = vn, *m = vm;
3272 uint64_t mask;
3273 int shr, shl;
3274 intptr_t i;
3275
3276 shl = 1 << esz;
3277 shr = 0;
3278 mask = even_bit_esz_masks[esz];
3279 if (odd) {
3280 mask <<= shl;
3281 shr = shl;
3282 shl = 0;
3283 }
3284
3285 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3286 uint64_t nn = (n[i] & mask) >> shr;
3287 uint64_t mm = (m[i] & mask) << shl;
3288 d[i] = nn + mm;
3289 }
3290 }
3291
3292 /* Reverse units of 2**N bits. */
3293 static uint64_t reverse_bits_64(uint64_t x, int n)
3294 {
3295 int i, sh;
3296
3297 x = bswap64(x);
3298 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3299 uint64_t mask = even_bit_esz_masks[i];
3300 x = ((x & mask) << sh) | ((x >> sh) & mask);
3301 }
3302 return x;
3303 }
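
/*
 * For example, reverse_bits_64(x, 0) is a full 64-bit bit reversal,
 * while reverse_bits_64(x, 3) reduces to bswap64(x): the loop body
 * never runs and only whole bytes (one .d predicate element each)
 * are reordered.
 */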
3304
3305 static uint8_t reverse_bits_8(uint8_t x, int n)
3306 {
3307 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3308 int i, sh;
3309
3310 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3311 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3312 }
3313 return x;
3314 }
3315
3316 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3317 {
3318 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3319 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3320 intptr_t i, oprsz_2 = oprsz / 2;
3321
3322 if (oprsz <= 8) {
3323 uint64_t l = *(uint64_t *)vn;
3324 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3325 *(uint64_t *)vd = l;
3326 } else if ((oprsz & 15) == 0) {
3327 for (i = 0; i < oprsz_2; i += 8) {
3328 intptr_t ih = oprsz - 8 - i;
3329 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3330 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3331 *(uint64_t *)(vd + i) = h;
3332 *(uint64_t *)(vd + ih) = l;
3333 }
3334 } else {
3335 for (i = 0; i < oprsz_2; i += 1) {
3336 intptr_t il = H1(i);
3337 intptr_t ih = H1(oprsz - 1 - i);
3338 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3339 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3340 *(uint8_t *)(vd + il) = h;
3341 *(uint8_t *)(vd + ih) = l;
3342 }
3343 }
3344 }
3345
3346 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3347 {
3348 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3349 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3350 uint64_t *d = vd;
3351 intptr_t i;
3352
3353 if (oprsz <= 8) {
3354 uint64_t nn = *(uint64_t *)vn;
3355 int half = 4 * oprsz;
3356
3357 nn = extract64(nn, high * half, half);
3358 nn = expand_bits(nn, 0);
3359 d[0] = nn;
3360 } else {
3361 ARMPredicateReg tmp_n;
3362
3363 /* We produce output faster than we consume input.
3364 Therefore we must be mindful of possible overlap. */
3365 if ((vn - vd) < (uintptr_t)oprsz) {
3366 vn = memcpy(&tmp_n, vn, oprsz);
3367 }
3368 if (high) {
3369 high = oprsz >> 1;
3370 }
3371
3372 if ((oprsz & 7) == 0) {
3373 uint32_t *n = vn;
3374 high >>= 2;
3375
3376 for (i = 0; i < oprsz / 8; i++) {
3377 uint64_t nn = n[H4(high + i)];
3378 d[i] = expand_bits(nn, 0);
3379 }
3380 } else {
3381 uint16_t *d16 = vd;
3382 uint8_t *n = vn;
3383
3384 for (i = 0; i < oprsz / 2; i++) {
3385 uint16_t nn = n[H1(high + i)];
3386 d16[H2(i)] = expand_bits(nn, 0);
3387 }
3388 }
3389 }
3390 }
3391
3392 #define DO_ZIP(NAME, TYPE, H) \
3393 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3394 { \
3395 intptr_t oprsz = simd_oprsz(desc); \
3396 intptr_t odd_ofs = simd_data(desc); \
3397 intptr_t i, oprsz_2 = oprsz / 2; \
3398 ARMVectorReg tmp_n, tmp_m; \
3399 /* We produce output faster than we consume input. \
3400 Therefore we must be mindful of possible overlap. */ \
3401 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3402 vn = memcpy(&tmp_n, vn, oprsz); \
3403 } \
3404 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3405 vm = memcpy(&tmp_m, vm, oprsz); \
3406 } \
3407 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3408 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3409 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
3410 *(TYPE *)(vm + odd_ofs + H(i)); \
3411 } \
3412 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3413 memset(vd + oprsz - 16, 0, 16); \
3414 } \
3415 }
3416
3417 DO_ZIP(sve_zip_b, uint8_t, H1)
3418 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3419 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3420 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3421 DO_ZIP(sve2_zip_q, Int128, )
3422
3423 #define DO_UZP(NAME, TYPE, H) \
3424 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3425 { \
3426 intptr_t oprsz = simd_oprsz(desc); \
3427 intptr_t odd_ofs = simd_data(desc); \
3428 intptr_t i, p; \
3429 ARMVectorReg tmp_m; \
3430 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3431 vm = memcpy(&tmp_m, vm, oprsz); \
3432 } \
3433 i = 0, p = odd_ofs; \
3434 do { \
3435 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
3436 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3437 } while (p < oprsz); \
3438 p -= oprsz; \
3439 do { \
3440 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
3441 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3442 } while (p < oprsz); \
3443 tcg_debug_assert(i == oprsz); \
3444 }
3445
3446 DO_UZP(sve_uzp_b, uint8_t, H1)
3447 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3448 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3449 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3450 DO_UZP(sve2_uzp_q, Int128, )
3451
3452 #define DO_TRN(NAME, TYPE, H) \
3453 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3454 { \
3455 intptr_t oprsz = simd_oprsz(desc); \
3456 intptr_t odd_ofs = simd_data(desc); \
3457 intptr_t i; \
3458 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3459 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3460 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3461 *(TYPE *)(vd + H(i + 0)) = ae; \
3462 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3463 } \
3464 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3465 memset(vd + oprsz - 16, 0, 16); \
3466 } \
3467 }
3468
3469 DO_TRN(sve_trn_b, uint8_t, H1)
3470 DO_TRN(sve_trn_h, uint16_t, H1_2)
3471 DO_TRN(sve_trn_s, uint32_t, H1_4)
3472 DO_TRN(sve_trn_d, uint64_t, H1_8)
3473 DO_TRN(sve2_trn_q, Int128, )
3474
3475 #undef DO_ZIP
3476 #undef DO_UZP
3477 #undef DO_TRN
3478
3479 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3480 {
3481 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3482 uint32_t *d = vd, *n = vn;
3483 uint8_t *pg = vg;
3484
3485 for (i = j = 0; i < opr_sz; i++) {
3486 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3487 d[H4(j)] = n[H4(i)];
3488 j++;
3489 }
3490 }
3491 for (; j < opr_sz; j++) {
3492 d[H4(j)] = 0;
3493 }
3494 }
3495
3496 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3497 {
3498 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3499 uint64_t *d = vd, *n = vn;
3500 uint8_t *pg = vg;
3501
3502 for (i = j = 0; i < opr_sz; i++) {
3503 if (pg[H1(i)] & 1) {
3504 d[j] = n[i];
3505 j++;
3506 }
3507 }
3508 for (; j < opr_sz; j++) {
3509 d[j] = 0;
3510 }
3511 }
3512
3513 /* Similar to the ARM LastActiveElement pseudocode function, except the
3514 * result is multiplied by the element size. This includes the not found
3515 * indication; e.g. not found for esz=3 is -8.
3516 */
3517 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3518 {
3519 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3520 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3521
3522 return last_active_element(vg, words, esz);
3523 }
3524
3525 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3526 {
3527 intptr_t opr_sz = simd_oprsz(desc) / 8;
3528 int esz = simd_data(desc);
3529 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3530 intptr_t i, first_i, last_i;
3531 ARMVectorReg tmp;
3532
3533 first_i = last_i = 0;
3534 first_g = last_g = 0;
3535
3536 /* Find the extent of the active elements within VG. */
3537 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3538 pg = *(uint64_t *)(vg + i) & mask;
3539 if (pg) {
3540 if (last_g == 0) {
3541 last_g = pg;
3542 last_i = i;
3543 }
3544 first_g = pg;
3545 first_i = i;
3546 }
3547 }
3548
3549 len = 0;
3550 if (first_g != 0) {
3551 first_i = first_i * 8 + ctz64(first_g);
3552 last_i = last_i * 8 + 63 - clz64(last_g);
3553 len = last_i - first_i + (1 << esz);
3554 if (vd == vm) {
3555 vm = memcpy(&tmp, vm, opr_sz * 8);
3556 }
3557 swap_memmove(vd, vn + first_i, len);
3558 }
3559 swap_memmove(vd + len, vm, opr_sz * 8 - len);
3560 }
3561
3562 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3563 void *vg, uint32_t desc)
3564 {
3565 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3566 uint64_t *d = vd, *n = vn, *m = vm;
3567 uint8_t *pg = vg;
3568
3569 for (i = 0; i < opr_sz; i += 1) {
3570 uint64_t nn = n[i], mm = m[i];
3571 uint64_t pp = expand_pred_b(pg[H1(i)]);
3572 d[i] = (nn & pp) | (mm & ~pp);
3573 }
3574 }
3575
3576 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3577 void *vg, uint32_t desc)
3578 {
3579 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3580 uint64_t *d = vd, *n = vn, *m = vm;
3581 uint8_t *pg = vg;
3582
3583 for (i = 0; i < opr_sz; i += 1) {
3584 uint64_t nn = n[i], mm = m[i];
3585 uint64_t pp = expand_pred_h(pg[H1(i)]);
3586 d[i] = (nn & pp) | (mm & ~pp);
3587 }
3588 }
3589
3590 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3591 void *vg, uint32_t desc)
3592 {
3593 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3594 uint64_t *d = vd, *n = vn, *m = vm;
3595 uint8_t *pg = vg;
3596
3597 for (i = 0; i < opr_sz; i += 1) {
3598 uint64_t nn = n[i], mm = m[i];
3599 uint64_t pp = expand_pred_s(pg[H1(i)]);
3600 d[i] = (nn & pp) | (mm & ~pp);
3601 }
3602 }
3603
3604 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3605 void *vg, uint32_t desc)
3606 {
3607 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3608 uint64_t *d = vd, *n = vn, *m = vm;
3609 uint8_t *pg = vg;
3610
3611 for (i = 0; i < opr_sz; i += 1) {
3612 uint64_t nn = n[i], mm = m[i];
3613 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3614 }
3615 }
3616
3617 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3618 void *vg, uint32_t desc)
3619 {
3620 intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3621 Int128 *d = vd, *n = vn, *m = vm;
3622 uint16_t *pg = vg;
3623
3624 for (i = 0; i < opr_sz; i += 1) {
3625 d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3626 }
3627 }
3628
3629 /* Two operand comparison controlled by a predicate.
3630 * ??? It is very tempting to want to be able to expand this inline
3631 * with x86 instructions, e.g.
3632 *
3633 * vcmpeqw zm, zn, %ymm0
3634 * vpmovmskb %ymm0, %eax
3635 * and $0x5555, %eax
3636 * and pg, %eax
3637 *
3638 * or even aarch64, e.g.
3639 *
3640 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3641 * cmeq v0.8h, zn, zm
3642 * and v0.8h, v0.8h, mask
3643 * addv h0, v0.8h
3644 * and v0.8b, pg
3645 *
3646 * However, coming up with an abstraction that allows vector inputs and
3647 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3648 * scalar outputs, is tricky.
3649 */
3650 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3651 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3652 { \
3653 intptr_t opr_sz = simd_oprsz(desc); \
3654 uint32_t flags = PREDTEST_INIT; \
3655 intptr_t i = opr_sz; \
3656 do { \
3657 uint64_t out = 0, pg; \
3658 do { \
3659 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3660 TYPE nn = *(TYPE *)(vn + H(i)); \
3661 TYPE mm = *(TYPE *)(vm + H(i)); \
3662 out |= nn OP mm; \
3663 } while (i & 63); \
3664 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3665 out &= pg; \
3666 *(uint64_t *)(vd + (i >> 3)) = out; \
3667 flags = iter_predtest_bwd(out, pg, flags); \
3668 } while (i > 0); \
3669 return flags; \
3670 }
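
/*
 * Layout sketch: each outer iteration covers 64 bytes of vector data,
 * i.e. one uint64_t worth of predicate bits.  The inner loop packs one
 * boolean per element into OUT so that the result for element J of the
 * chunk lands at predicate bit J * sizeof(TYPE); MASK then keeps only
 * those canonical bits, e.g. 0x1111...1 for the .s variants below.
 */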
3671
3672 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3673 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3674 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3675 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3676 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3677 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3678 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3679 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3680
3681 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3682 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3683 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3684 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3685
3686 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3687 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3688 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3689 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3690
3691 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3692 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3693 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3694 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3695
3696 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3697 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3698 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3699 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3700
3701 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3702 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3703 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3704 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3705
3706 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3707 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3708 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3709 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3710
3711 #undef DO_CMP_PPZZ_B
3712 #undef DO_CMP_PPZZ_H
3713 #undef DO_CMP_PPZZ_S
3714 #undef DO_CMP_PPZZ_D
3715 #undef DO_CMP_PPZZ
3716
3717 /* Similar, but the second source is "wide". */
3718 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3719 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3720 { \
3721 intptr_t opr_sz = simd_oprsz(desc); \
3722 uint32_t flags = PREDTEST_INIT; \
3723 intptr_t i = opr_sz; \
3724 do { \
3725 uint64_t out = 0, pg; \
3726 do { \
3727 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3728 do { \
3729 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3730 TYPE nn = *(TYPE *)(vn + H(i)); \
3731 out |= nn OP mm; \
3732 } while (i & 7); \
3733 } while (i & 63); \
3734 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3735 out &= pg; \
3736 *(uint64_t *)(vd + (i >> 3)) = out; \
3737 flags = iter_predtest_bwd(out, pg, flags); \
3738 } while (i > 0); \
3739 return flags; \
3740 }
3741
3742 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3743 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3744 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3745 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3746 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3747 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3748
3749 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3750 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3751 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3752
3753 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3754 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3755 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3756
3757 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3758 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3759 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3760
3761 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3762 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3763 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3764
3765 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3766 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3767 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3768
3769 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3770 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3771 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3772
3773 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3774 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3775 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3776
3777 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3778 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3779 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3780
3781 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3782 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3783 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3784
3785 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3786 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3787 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3788
3789 #undef DO_CMP_PPZW_B
3790 #undef DO_CMP_PPZW_H
3791 #undef DO_CMP_PPZW_S
3792 #undef DO_CMP_PPZW
3793
3794 /* Similar, but the second source is immediate. */
3795 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3796 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3797 { \
3798 intptr_t opr_sz = simd_oprsz(desc); \
3799 uint32_t flags = PREDTEST_INIT; \
3800 TYPE mm = simd_data(desc); \
3801 intptr_t i = opr_sz; \
3802 do { \
3803 uint64_t out = 0, pg; \
3804 do { \
3805 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3806 TYPE nn = *(TYPE *)(vn + H(i)); \
3807 out |= nn OP mm; \
3808 } while (i & 63); \
3809 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3810 out &= pg; \
3811 *(uint64_t *)(vd + (i >> 3)) = out; \
3812 flags = iter_predtest_bwd(out, pg, flags); \
3813 } while (i > 0); \
3814 return flags; \
3815 }
3816
3817 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3818 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3819 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3820 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3821 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3822 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3823 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3824 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3825
3826 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3827 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3828 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3829 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3830
3831 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3832 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3833 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3834 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3835
3836 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3837 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3838 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3839 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3840
3841 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3842 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3843 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3844 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3845
3846 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3847 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3848 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3849 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3850
3851 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3852 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3853 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3854 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3855
3856 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3857 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3858 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3859 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3860
3861 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3862 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3863 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3864 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3865
3866 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3867 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3868 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3869 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3870
3871 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3872 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3873 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3874 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3875
3876 #undef DO_CMP_PPZI_B
3877 #undef DO_CMP_PPZI_H
3878 #undef DO_CMP_PPZI_S
3879 #undef DO_CMP_PPZI_D
3880 #undef DO_CMP_PPZI
3881
3882 /* Similar to the ARM LastActive pseudocode function. */
3883 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3884 {
3885 intptr_t i;
3886
3887 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3888 uint64_t pg = *(uint64_t *)(vg + i);
3889 if (pg) {
3890 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3891 }
3892 }
3893 return 0;
3894 }
3895
3896 /* Compute a mask into RETB that is true for all G, up to and including
3897 * (if after) or excluding (if !after) the first G & N.
3898 * Return true if BRK found.
3899 */
3900 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3901 bool brk, bool after)
3902 {
3903 uint64_t b;
3904
3905 if (brk) {
3906 b = 0;
3907 } else if ((g & n) == 0) {
3908 /* For all G, no N are set; break not found. */
3909 b = g;
3910 } else {
3911 /* Break somewhere in N. Locate it. */
3912 b = g & n; /* guard true, pred true */
3913 b = b & -b; /* first such */
3914 if (after) {
3915 b = b | (b - 1); /* break after same */
3916 } else {
3917 b = b - 1; /* break before same */
3918 }
3919 brk = true;
3920 }
3921
3922 *retb = b;
3923 return brk;
3924 }
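
/*
 * Worked example: with g == 0xff, n == 0x10 (first active N at bit 4)
 * and no break pending, b becomes 0x10 (the lowest set bit of G & N),
 * and then *retb is 0x1f for AFTER (bits up to and including bit 4)
 * or 0x0f for !AFTER (bits strictly before bit 4); the function
 * returns true so that subsequent words yield all-zero masks.
 */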
3925
3926 /* Compute a zeroing BRK. */
3927 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3928 intptr_t oprsz, bool after)
3929 {
3930 bool brk = false;
3931 intptr_t i;
3932
3933 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3934 uint64_t this_b, this_g = g[i];
3935
3936 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3937 d[i] = this_b & this_g;
3938 }
3939 }
3940
3941 /* Likewise, but also compute flags. */
3942 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3943 intptr_t oprsz, bool after)
3944 {
3945 uint32_t flags = PREDTEST_INIT;
3946 bool brk = false;
3947 intptr_t i;
3948
3949 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3950 uint64_t this_b, this_d, this_g = g[i];
3951
3952 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3953 d[i] = this_d = this_b & this_g;
3954 flags = iter_predtest_fwd(this_d, this_g, flags);
3955 }
3956 return flags;
3957 }
3958
3959 /* Compute a merging BRK. */
3960 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3961 intptr_t oprsz, bool after)
3962 {
3963 bool brk = false;
3964 intptr_t i;
3965
3966 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3967 uint64_t this_b, this_g = g[i];
3968
3969 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3970 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3971 }
3972 }
3973
3974 /* Likewise, but also compute flags. */
3975 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3976 intptr_t oprsz, bool after)
3977 {
3978 uint32_t flags = PREDTEST_INIT;
3979 bool brk = false;
3980 intptr_t i;
3981
3982 for (i = 0; i < oprsz / 8; ++i) {
3983 uint64_t this_b, this_d = d[i], this_g = g[i];
3984
3985 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3986 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3987 flags = iter_predtest_fwd(this_d, this_g, flags);
3988 }
3989 return flags;
3990 }
3991
3992 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3993 {
3994 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3995 * The compiler should turn this into 4 64-bit integer stores.
3996 */
3997 memset(d, 0, sizeof(ARMPredicateReg));
3998 return PREDTEST_INIT;
3999 }
4000
4001 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4002 uint32_t pred_desc)
4003 {
4004 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4005 if (last_active_pred(vn, vg, oprsz)) {
4006 compute_brk_z(vd, vm, vg, oprsz, true);
4007 } else {
4008 do_zero(vd, oprsz);
4009 }
4010 }
4011
4012 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4013 uint32_t pred_desc)
4014 {
4015 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4016 if (last_active_pred(vn, vg, oprsz)) {
4017 return compute_brks_z(vd, vm, vg, oprsz, true);
4018 } else {
4019 return do_zero(vd, oprsz);
4020 }
4021 }
4022
4023 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4024 uint32_t pred_desc)
4025 {
4026 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4027 if (last_active_pred(vn, vg, oprsz)) {
4028 compute_brk_z(vd, vm, vg, oprsz, false);
4029 } else {
4030 do_zero(vd, oprsz);
4031 }
4032 }
4033
4034 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4035 uint32_t pred_desc)
4036 {
4037 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4038 if (last_active_pred(vn, vg, oprsz)) {
4039 return compute_brks_z(vd, vm, vg, oprsz, false);
4040 } else {
4041 return do_zero(vd, oprsz);
4042 }
4043 }
4044
4045 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4046 {
4047 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4048 compute_brk_z(vd, vn, vg, oprsz, true);
4049 }
4050
4051 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4052 {
4053 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4054 return compute_brks_z(vd, vn, vg, oprsz, true);
4055 }
4056
4057 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4058 {
4059 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4060 compute_brk_z(vd, vn, vg, oprsz, false);
4061 }
4062
4063 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4064 {
4065 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4066 return compute_brks_z(vd, vn, vg, oprsz, false);
4067 }
4068
4069 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4070 {
4071 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4072 compute_brk_m(vd, vn, vg, oprsz, true);
4073 }
4074
4075 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4076 {
4077 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4078 return compute_brks_m(vd, vn, vg, oprsz, true);
4079 }
4080
4081 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4082 {
4083 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4084 compute_brk_m(vd, vn, vg, oprsz, false);
4085 }
4086
4087 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4088 {
4089 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4090 return compute_brks_m(vd, vn, vg, oprsz, false);
4091 }
4092
4093 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4094 {
4095 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4096 if (!last_active_pred(vn, vg, oprsz)) {
4097 do_zero(vd, oprsz);
4098 }
4099 }
4100
4101 /* As if PredTest(Ones(PL), D, esz). */
4102 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4103 uint64_t esz_mask)
4104 {
4105 uint32_t flags = PREDTEST_INIT;
4106 intptr_t i;
4107
4108 for (i = 0; i < oprsz / 8; i++) {
4109 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4110 }
4111 if (oprsz & 7) {
4112 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4113 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4114 }
4115 return flags;
4116 }
4117
4118 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4119 {
4120 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4121 if (last_active_pred(vn, vg, oprsz)) {
4122 return predtest_ones(vd, oprsz, -1);
4123 } else {
4124 return do_zero(vd, oprsz);
4125 }
4126 }
4127
4128 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4129 {
4130 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4131 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4132 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4133 intptr_t i;
4134
4135 for (i = 0; i < words; ++i) {
4136 uint64_t t = n[i] & g[i] & mask;
4137 sum += ctpop64(t);
4138 }
4139 return sum;
4140 }
4141
4142 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4143 {
4144 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4145 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4146 uint64_t esz_mask = pred_esz_masks[esz];
4147 ARMPredicateReg *d = vd;
4148 uint32_t flags;
4149 intptr_t i;
4150
4151 /* Begin with a zero predicate register. */
4152 flags = do_zero(d, oprsz);
4153 if (count == 0) {
4154 return flags;
4155 }
4156
4157 /* Set all of the requested bits. */
4158 for (i = 0; i < count / 64; ++i) {
4159 d->p[i] = esz_mask;
4160 }
4161 if (count & 63) {
4162 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4163 }
4164
4165 return predtest_ones(d, oprsz, esz_mask);
4166 }
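
/*
 * For instance, with count == 12 and esz == 2 (three .s elements'
 * worth of predicate bits), the code above leaves
 *   d->p[0] == MAKE_64BIT_MASK(0, 12) & 0x1111111111111111 == 0x111,
 * i.e. the first three element-start bits set.
 */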
4167
4168 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4169 {
4170 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4171 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4172 uint64_t esz_mask = pred_esz_masks[esz];
4173 ARMPredicateReg *d = vd;
4174 intptr_t i, invcount, oprbits;
4175 uint64_t bits;
4176
4177 if (count == 0) {
4178 return do_zero(d, oprsz);
4179 }
4180
4181 oprbits = oprsz * 8;
4182 tcg_debug_assert(count <= oprbits);
4183
4184 bits = esz_mask;
4185 if (oprbits & 63) {
4186 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4187 }
4188
4189 invcount = oprbits - count;
4190 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4191 d->p[i] = bits;
4192 bits = esz_mask;
4193 }
4194
4195 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4196
4197 while (--i >= 0) {
4198 d->p[i] = 0;
4199 }
4200
4201 return predtest_ones(d, oprsz, esz_mask);
4202 }
4203
4204 /* Recursive reduction on a function;
4205 * C.f. the ARM ARM function ReducePredicated.
4206 *
4207 * While it would be possible to write this without the DATA temporary,
4208 * it is much simpler to process the predicate register this way.
4209 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4210 * little to gain with a more complex non-recursive form.
4211 */
4212 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
4213 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4214 { \
4215 if (n == 1) { \
4216 return *data; \
4217 } else { \
4218 uintptr_t half = n / 2; \
4219 TYPE lo = NAME##_reduce(data, status, half); \
4220 TYPE hi = NAME##_reduce(data + half, status, half); \
4221 return FUNC(lo, hi, status); \
4222 } \
4223 } \
4224 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
4225 { \
4226 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
4227 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
4228 for (i = 0; i < oprsz; ) { \
4229 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4230 do { \
4231 TYPE nn = *(TYPE *)(vn + H(i)); \
4232 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
4233 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4234 } while (i & 15); \
4235 } \
4236 for (; i < maxsz; i += sizeof(TYPE)) { \
4237 *(TYPE *)((void *)data + i) = IDENT; \
4238 } \
4239 return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \
4240 }
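
/*
 * Reduction order, assuming a four-element DATA array as an example:
 * NAME##_reduce computes FUNC(FUNC(d[0], d[1]), FUNC(d[2], d[3])),
 * the same pairwise tree as ReducePredicated, with inactive lanes
 * already replaced by IDENT in the copy loop above.
 */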
4241
4242 DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero)
4243 DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero)
4244 DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero)
4245
4246 /* Identity is floatN_default_nan, without the function call. */
4247 DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00)
4248 DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000)
4249 DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
4250
4251 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00)
4252 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000)
4253 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
4254
4255 DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity)
4256 DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity)
4257 DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity)
4258
4259 DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity))
4260 DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity))
4261 DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity))
4262
4263 DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
4264 DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
4265 DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
4266
4267 DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh,
4268 float16_chs(float16_infinity))
4269 DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs,
4270 float32_chs(float32_infinity))
4271 DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd,
4272 float64_chs(float64_infinity))
4273
4274 #undef DO_REDUCE
4275
4276 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4277 float_status *status, uint32_t desc)
4278 {
4279 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4280 float16 result = nn;
4281
4282 do {
4283 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4284 do {
4285 if (pg & 1) {
4286 float16 mm = *(float16 *)(vm + H1_2(i));
4287 result = float16_add(result, mm, status);
4288 }
4289 i += sizeof(float16), pg >>= sizeof(float16);
4290 } while (i & 15);
4291 } while (i < opr_sz);
4292
4293 return result;
4294 }
4295
4296 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4297 float_status *status, uint32_t desc)
4298 {
4299 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4300 float32 result = nn;
4301
4302 do {
4303 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4304 do {
4305 if (pg & 1) {
4306 float32 mm = *(float32 *)(vm + H1_2(i));
4307 result = float32_add(result, mm, status);
4308 }
4309 i += sizeof(float32), pg >>= sizeof(float32);
4310 } while (i & 15);
4311 } while (i < opr_sz);
4312
4313 return result;
4314 }
4315
4316 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4317 float_status *status, uint32_t desc)
4318 {
4319 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4320 uint64_t *m = vm;
4321 uint8_t *pg = vg;
4322
4323 for (i = 0; i < opr_sz; i++) {
4324 if (pg[H1(i)] & 1) {
4325 nn = float64_add(nn, m[i], status);
4326 }
4327 }
4328
4329 return nn;
4330 }
4331
4332 /* Fully general three-operand expander, controlled by a predicate,
4333 * with the extra float_status parameter.
4334 */
4335 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4336 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4337 float_status *status, uint32_t desc) \
4338 { \
4339 intptr_t i = simd_oprsz(desc); \
4340 uint64_t *g = vg; \
4341 do { \
4342 uint64_t pg = g[(i - 1) >> 6]; \
4343 do { \
4344 i -= sizeof(TYPE); \
4345 if (likely((pg >> (i & 63)) & 1)) { \
4346 TYPE nn = *(TYPE *)(vn + H(i)); \
4347 TYPE mm = *(TYPE *)(vm + H(i)); \
4348 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4349 } \
4350 } while (i & 63); \
4351 } while (i != 0); \
4352 }
4353
4354 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4355 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4356 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4357
4358 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4359 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4360 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4361
4362 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4363 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4364 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4365
4366 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4367 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4368 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4369
4370 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4371 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4372 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4373
4374 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4375 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4376 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4377
4378 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh)
4379 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins)
4380 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind)
4381
4382 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh)
4383 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs)
4384 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd)
4385
4386 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4387 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4388 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4389
4390 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4391 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4392 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4393
4394 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4395 {
4396 return float16_abs(float16_sub(a, b, s));
4397 }
4398
4399 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4400 {
4401 return float32_abs(float32_sub(a, b, s));
4402 }
4403
4404 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4405 {
4406 return float64_abs(float64_sub(a, b, s));
4407 }
4408
4409 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
4410 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat)
4411 {
4412 float16 r = float16_sub(op1, op2, stat);
4413 return float16_is_any_nan(r) ? r : float16_abs(r);
4414 }
4415
4416 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat)
4417 {
4418 float32 r = float32_sub(op1, op2, stat);
4419 return float32_is_any_nan(r) ? r : float32_abs(r);
4420 }
4421
4422 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat)
4423 {
4424 float64 r = float64_sub(op1, op2, stat);
4425 return float64_is_any_nan(r) ? r : float64_abs(r);
4426 }
4427
4428 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4429 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4430 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4431 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h)
4432 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s)
4433 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d)
4434
4435 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4436 {
4437 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4438 return float64_scalbn(a, b_int, s);
4439 }
4440
4441 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4442 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4443 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4444
4445 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4446 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4447 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4448
4449 #undef DO_ZPZZ_FP
4450
4451 /* Three-operand expander, with one scalar operand, controlled by
4452 * a predicate, with the extra float_status parameter.
4453 */
4454 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4455 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4456 float_status *status, uint32_t desc) \
4457 { \
4458 intptr_t i = simd_oprsz(desc); \
4459 uint64_t *g = vg; \
4460 TYPE mm = scalar; \
4461 do { \
4462 uint64_t pg = g[(i - 1) >> 6]; \
4463 do { \
4464 i -= sizeof(TYPE); \
4465 if (likely((pg >> (i & 63)) & 1)) { \
4466 TYPE nn = *(TYPE *)(vn + H(i)); \
4467 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4468 } \
4469 } while (i & 63); \
4470 } while (i != 0); \
4471 }
4472
4473 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4474 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4475 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4476
4477 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4478 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4479 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4480
4481 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4482 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4483 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4484
4485 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4486 {
4487 return float16_sub(b, a, s);
4488 }
4489
4490 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4491 {
4492 return float32_sub(b, a, s);
4493 }
4494
4495 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4496 {
4497 return float64_sub(b, a, s);
4498 }
4499
4500 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4501 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4502 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4503
4504 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4505 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4506 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4507
4508 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4509 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4510 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4511
4512 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4513 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4514 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4515
4516 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4517 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4518 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4519
4520 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh)
4521 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs)
4522 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd)
4523
4524 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh)
4525 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins)
4526 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind)
4527
4528 /* Fully general two-operand expander, controlled by a predicate,
4529 * with the extra float_status parameter.
4530 */
4531 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4532 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4533 float_status *status, uint32_t desc) \
4534 { \
4535 intptr_t i = simd_oprsz(desc); \
4536 uint64_t *g = vg; \
4537 do { \
4538 uint64_t pg = g[(i - 1) >> 6]; \
4539 do { \
4540 i -= sizeof(TYPE); \
4541 if (likely((pg >> (i & 63)) & 1)) { \
4542 TYPE nn = *(TYPE *)(vn + H(i)); \
4543 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4544 } \
4545 } while (i & 63); \
4546 } while (i != 0); \
4547 }
4548
4549 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4550 * FZ16. When converting from fp16, this affects flushing input denormals;
4551 * when converting to fp16, this affects flushing output denormals.
4552 */
4553 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4554 {
4555 bool save = get_flush_inputs_to_zero(fpst);
4556 float32 ret;
4557
4558 set_flush_inputs_to_zero(false, fpst);
4559 ret = float16_to_float32(f, true, fpst);
4560 set_flush_inputs_to_zero(save, fpst);
4561 return ret;
4562 }
4563
4564 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4565 {
4566 bool save = get_flush_inputs_to_zero(fpst);
4567 float64 ret;
4568
4569 set_flush_inputs_to_zero(false, fpst);
4570 ret = float16_to_float64(f, true, fpst);
4571 set_flush_inputs_to_zero(save, fpst);
4572 return ret;
4573 }
4574
4575 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4576 {
4577 bool save = get_flush_to_zero(fpst);
4578 float16 ret;
4579
4580 set_flush_to_zero(false, fpst);
4581 ret = float32_to_float16(f, true, fpst);
4582 set_flush_to_zero(save, fpst);
4583 return ret;
4584 }
4585
4586 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4587 {
4588 bool save = get_flush_to_zero(fpst);
4589 float16 ret;
4590
4591 set_flush_to_zero(false, fpst);
4592 ret = float64_to_float16(f, true, fpst);
4593 set_flush_to_zero(save, fpst);
4594 return ret;
4595 }
4596
4597 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4598 {
4599 if (float16_is_any_nan(f)) {
4600 float_raise(float_flag_invalid, s);
4601 return 0;
4602 }
4603 return float16_to_int16_round_to_zero(f, s);
4604 }
4605
4606 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4607 {
4608 if (float16_is_any_nan(f)) {
4609 float_raise(float_flag_invalid, s);
4610 return 0;
4611 }
4612 return float16_to_int64_round_to_zero(f, s);
4613 }
4614
4615 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4616 {
4617 if (float32_is_any_nan(f)) {
4618 float_raise(float_flag_invalid, s);
4619 return 0;
4620 }
4621 return float32_to_int64_round_to_zero(f, s);
4622 }
4623
4624 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4625 {
4626 if (float64_is_any_nan(f)) {
4627 float_raise(float_flag_invalid, s);
4628 return 0;
4629 }
4630 return float64_to_int64_round_to_zero(f, s);
4631 }
4632
4633 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4634 {
4635 if (float16_is_any_nan(f)) {
4636 float_raise(float_flag_invalid, s);
4637 return 0;
4638 }
4639 return float16_to_uint16_round_to_zero(f, s);
4640 }
4641
4642 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4643 {
4644 if (float16_is_any_nan(f)) {
4645 float_raise(float_flag_invalid, s);
4646 return 0;
4647 }
4648 return float16_to_uint64_round_to_zero(f, s);
4649 }
4650
4651 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4652 {
4653 if (float32_is_any_nan(f)) {
4654 float_raise(float_flag_invalid, s);
4655 return 0;
4656 }
4657 return float32_to_uint64_round_to_zero(f, s);
4658 }
4659
4660 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4661 {
4662 if (float64_is_any_nan(f)) {
4663 float_raise(float_flag_invalid, s);
4664 return 0;
4665 }
4666 return float64_to_uint64_round_to_zero(f, s);
4667 }
4668
4669 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4670 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4671 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
4672 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4673 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4674 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4675 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4676
4677 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4678 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4679 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4680 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4681 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4682 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4683 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4684
4685 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4686 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4687 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4688 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4689 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4690 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4691 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4692
4693 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4694 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4695 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4696
4697 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4698 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4699 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4700
4701 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4702 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4703 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4704
4705 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4706 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4707 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4708
4709 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4710 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4711 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4712 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4713 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4714 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4715 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4716
4717 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4718 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4719 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4720 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4721 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4722 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4723 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4724
4725 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4726 {
4727 /* Extract frac to the top of the uint32_t. */
4728 uint32_t frac = (uint32_t)a << (16 + 6);
4729 int16_t exp = extract32(a, 10, 5);
4730
4731 if (unlikely(exp == 0)) {
4732 if (frac != 0) {
4733 if (!get_flush_inputs_to_zero(s)) {
4734 /* denormal: bias - fractional_zeros */
4735 return -15 - clz32(frac);
4736 }
4737 /* flush to zero */
4738 float_raise(float_flag_input_denormal_flushed, s);
4739 }
4740 } else if (unlikely(exp == 0x1f)) {
4741 if (frac == 0) {
4742 return INT16_MAX; /* infinity */
4743 }
4744 } else {
4745 /* normal: exp - bias */
4746 return exp - 15;
4747 }
4748 /* nan or zero */
4749 float_raise(float_flag_invalid, s);
4750 return INT16_MIN;
4751 }
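
/*
 * Two sample inputs: float16 1.0 (0x3c00) has exp == 15 and yields
 * 15 - 15 = 0; the smallest subnormal (0x0001) has exp == 0 and
 * frac == 1 << 22 after the shift, yielding -15 - clz32(frac) =
 * -15 - 9 = -24, its true base-2 exponent.
 */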
4752
4753 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4754 {
4755 /* Extract frac to the top of the uint32_t. */
4756 uint32_t frac = a << 9;
4757 int32_t exp = extract32(a, 23, 8);
4758
4759 if (unlikely(exp == 0)) {
4760 if (frac != 0) {
4761 if (!get_flush_inputs_to_zero(s)) {
4762 /* denormal: bias - fractional_zeros */
4763 return -127 - clz32(frac);
4764 }
4765 /* flush to zero */
4766 float_raise(float_flag_input_denormal_flushed, s);
4767 }
4768 } else if (unlikely(exp == 0xff)) {
4769 if (frac == 0) {
4770 return INT32_MAX; /* infinity */
4771 }
4772 } else {
4773 /* normal: exp - bias */
4774 return exp - 127;
4775 }
4776 /* nan or zero */
4777 float_raise(float_flag_invalid, s);
4778 return INT32_MIN;
4779 }
4780
4781 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4782 {
4783 /* Extract frac to the top of the uint64_t. */
4784 uint64_t frac = a << 12;
4785 int64_t exp = extract64(a, 52, 11);
4786
4787 if (unlikely(exp == 0)) {
4788 if (frac != 0) {
4789 if (!get_flush_inputs_to_zero(s)) {
4790 /* denormal: bias - fractional_zeros */
4791 return -1023 - clz64(frac);
4792 }
4793 /* flush to zero */
4794 float_raise(float_flag_input_denormal_flushed, s);
4795 }
4796 } else if (unlikely(exp == 0x7ff)) {
4797 if (frac == 0) {
4798 return INT64_MAX; /* infinity */
4799 }
4800 } else {
4801 /* normal: exp - bias */
4802 return exp - 1023;
4803 }
4804 /* nan or zero */
4805 float_raise(float_flag_invalid, s);
4806 return INT64_MIN;
4807 }
4808
4809 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4810 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4811 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4812
4813 #undef DO_ZPZ_FP
4814
4815 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4816 float_status *status, uint32_t desc,
4817 uint16_t neg1, uint16_t neg3, int flags)
4818 {
4819 intptr_t i = simd_oprsz(desc);
4820 uint64_t *g = vg;
4821
4822 do {
4823 uint64_t pg = g[(i - 1) >> 6];
4824 do {
4825 i -= 2;
4826 if (likely((pg >> (i & 63)) & 1)) {
4827 float16 e1, e2, e3, r;
4828
4829 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4830 e2 = *(uint16_t *)(vm + H1_2(i));
4831 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4832 r = float16_muladd(e1, e2, e3, flags, status);
4833 *(uint16_t *)(vd + H1_2(i)) = r;
4834 }
4835 } while (i & 63);
4836 } while (i != 0);
4837 }
4838
4839 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4840 void *vg, float_status *status, uint32_t desc)
4841 {
4842 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4843 }
4844
4845 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4846 void *vg, float_status *status, uint32_t desc)
4847 {
4848 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
4849 }
4850
4851 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4852 void *vg, float_status *status, uint32_t desc)
4853 {
4854 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
4855 }
4856
4857 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4858 void *vg, float_status *status, uint32_t desc)
4859 {
4860 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
4861 }
4862
4863 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4864 void *vg, float_status *status, uint32_t desc)
4865 {
4866 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4867 float_muladd_negate_product);
4868 }
4869
4870 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4871 void *vg, float_status *status, uint32_t desc)
4872 {
4873 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4874 float_muladd_negate_product | float_muladd_negate_c);
4875 }
4876
4877 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4878 void *vg, float_status *status, uint32_t desc)
4879 {
4880 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4881 float_muladd_negate_c);
4882 }
4883
4884 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4885 float_status *status, uint32_t desc,
4886 uint32_t neg1, uint32_t neg3, int flags)
4887 {
4888 intptr_t i = simd_oprsz(desc);
4889 uint64_t *g = vg;
4890
4891 do {
4892 uint64_t pg = g[(i - 1) >> 6];
4893 do {
4894 i -= 4;
4895 if (likely((pg >> (i & 63)) & 1)) {
4896 float32 e1, e2, e3, r;
4897
4898 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4899 e2 = *(uint32_t *)(vm + H1_4(i));
4900 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4901 r = float32_muladd(e1, e2, e3, flags, status);
4902 *(uint32_t *)(vd + H1_4(i)) = r;
4903 }
4904 } while (i & 63);
4905 } while (i != 0);
4906 }
4907
4908 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4909 void *vg, float_status *status, uint32_t desc)
4910 {
4911 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4912 }
4913
4914 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4915 void *vg, float_status *status, uint32_t desc)
4916 {
4917 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0);
4918 }
4919
4920 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4921 void *vg, float_status *status, uint32_t desc)
4922 {
4923 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0);
4924 }
4925
4926 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4927 void *vg, float_status *status, uint32_t desc)
4928 {
4929 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0);
4930 }
4931
4932 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4933 void *vg, float_status *status, uint32_t desc)
4934 {
4935 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4936 float_muladd_negate_product);
4937 }
4938
4939 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4940 void *vg, float_status *status, uint32_t desc)
4941 {
4942 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4943 float_muladd_negate_product | float_muladd_negate_c);
4944 }
4945
4946 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4947 void *vg, float_status *status, uint32_t desc)
4948 {
4949 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4950 float_muladd_negate_c);
4951 }
4952
4953 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4954 float_status *status, uint32_t desc,
4955 uint64_t neg1, uint64_t neg3, int flags)
4956 {
4957 intptr_t i = simd_oprsz(desc);
4958 uint64_t *g = vg;
4959
4960 do {
4961 uint64_t pg = g[(i - 1) >> 6];
4962 do {
4963 i -= 8;
4964 if (likely((pg >> (i & 63)) & 1)) {
4965 float64 e1, e2, e3, r;
4966
4967 e1 = *(uint64_t *)(vn + i) ^ neg1;
4968 e2 = *(uint64_t *)(vm + i);
4969 e3 = *(uint64_t *)(va + i) ^ neg3;
4970 r = float64_muladd(e1, e2, e3, flags, status);
4971 *(uint64_t *)(vd + i) = r;
4972 }
4973 } while (i & 63);
4974 } while (i != 0);
4975 }
4976
4977 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4978 void *vg, float_status *status, uint32_t desc)
4979 {
4980 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4981 }
4982
4983 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4984 void *vg, float_status *status, uint32_t desc)
4985 {
4986 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0);
4987 }
4988
4989 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4990 void *vg, float_status *status, uint32_t desc)
4991 {
4992 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0);
4993 }
4994
4995 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4996 void *vg, float_status *status, uint32_t desc)
4997 {
4998 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0);
4999 }
5000
5001 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5002 void *vg, float_status *status, uint32_t desc)
5003 {
5004 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5005 float_muladd_negate_product);
5006 }
5007
5008 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5009 void *vg, float_status *status, uint32_t desc)
5010 {
5011 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5012 float_muladd_negate_product | float_muladd_negate_c);
5013 }
5014
5015 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5016 void *vg, float_status *status, uint32_t desc)
5017 {
5018 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5019 float_muladd_negate_c);
5020 }
5021
5022 /* Two operand floating-point comparison controlled by a predicate.
5023 * Unlike the integer version, we are not allowed to optimistically
5024 * compare operands, since the comparison may have side effects wrt
5025 * the FPSR.
5026 */
5027 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
5028 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
5029 float_status *status, uint32_t desc) \
5030 { \
5031 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
5032 uint64_t *d = vd, *g = vg; \
5033 do { \
5034 uint64_t out = 0, pg = g[j]; \
5035 do { \
5036 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
5037 if (likely((pg >> (i & 63)) & 1)) { \
5038 TYPE nn = *(TYPE *)(vn + H(i)); \
5039 TYPE mm = *(TYPE *)(vm + H(i)); \
5040 out |= OP(TYPE, nn, mm, status); \
5041 } \
5042 } while (i & 63); \
5043 d[j--] = out; \
5044 } while (i > 0); \
5045 }
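/*
 * Note that the element at byte offset I contributes its result to bit
 * I of the output predicate word: OUT is shifted left by sizeof(TYPE)
 * once per element while I walks from the top of each 64-byte block
 * down to 0, so each result lands in the predicate bit of the
 * element's first byte, as the architecture requires.
 */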
5046
5047 #define DO_FPCMP_PPZZ_H(NAME, OP) \
5048 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
5049 #define DO_FPCMP_PPZZ_S(NAME, OP) \
5050 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
5051 #define DO_FPCMP_PPZZ_D(NAME, OP) \
5052 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
5053
5054 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
5055 DO_FPCMP_PPZZ_H(NAME, OP) \
5056 DO_FPCMP_PPZZ_S(NAME, OP) \
5057 DO_FPCMP_PPZZ_D(NAME, OP)
5058
5059 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
5060 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
5061 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
5062 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
5063 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
5064 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
5065 #define DO_FCMUO(TYPE, X, Y, ST) \
5066 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
5067 #define DO_FACGE(TYPE, X, Y, ST) \
5068 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
5069 #define DO_FACGT(TYPE, X, Y, ST) \
5070 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
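/*
 * Note on the comparisons above: the ordered predicates (GE/GT/LE/LT
 * and the absolute-value forms) use the signalling TYPE##_compare,
 * which raises Invalid Operation for any NaN operand, while EQ/NE/UO
 * use TYPE##_compare_quiet.  An unordered result (float_relation_unordered
 * is 2, satisfying neither <= 0 nor < 0) never passes the ordered tests,
 * so NaNs yield a false predicate bit there and a true one only for
 * FCMNE and FCMUO.  GE/GT are expressed with swapped operands, e.g.
 * DO_FCMGE tests Y <= X.
 */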
5071
5072 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
5073 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
5074 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
5075 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
5076 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
5077 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
5078 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
5079
5080 #undef DO_FPCMP_PPZZ_ALL
5081 #undef DO_FPCMP_PPZZ_D
5082 #undef DO_FPCMP_PPZZ_S
5083 #undef DO_FPCMP_PPZZ_H
5084 #undef DO_FPCMP_PPZZ
5085
5086 /* One operand floating-point comparison against zero, controlled
5087 * by a predicate.
5088 */
5089 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
5090 void HELPER(NAME)(void *vd, void *vn, void *vg, \
5091 float_status *status, uint32_t desc) \
5092 { \
5093 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
5094 uint64_t *d = vd, *g = vg; \
5095 do { \
5096 uint64_t out = 0, pg = g[j]; \
5097 do { \
5098 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
5099 if ((pg >> (i & 63)) & 1) { \
5100 TYPE nn = *(TYPE *)(vn + H(i)); \
5101 out |= OP(TYPE, nn, 0, status); \
5102 } \
5103 } while (i & 63); \
5104 d[j--] = out; \
5105 } while (i > 0); \
5106 }
5107
5108 #define DO_FPCMP_PPZ0_H(NAME, OP) \
5109 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
5110 #define DO_FPCMP_PPZ0_S(NAME, OP) \
5111 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
5112 #define DO_FPCMP_PPZ0_D(NAME, OP) \
5113 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
5114
5115 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
5116 DO_FPCMP_PPZ0_H(NAME, OP) \
5117 DO_FPCMP_PPZ0_S(NAME, OP) \
5118 DO_FPCMP_PPZ0_D(NAME, OP)
5119
5120 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5121 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5122 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5123 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5124 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5125 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5126
5127 /* FP Trig Multiply-Add. */
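/*
 * The coeff[] tables below hold (approximately) the polynomial
 * coefficients used by the Arm FPTrigMAdd pseudocode: entries 0..7
 * correspond to the sine series (1, -1/6, 1/120, -1/5040, ...) and
 * entries 8..15 to the cosine series (1, -1/2, 1/24, -1/720, ...),
 * encoded in the element format and truncated to the precision of the
 * element type.  A negative multiplicand selects the cosine half via
 * xx += 8; with FPCR.AH set its sign is folded into the fused
 * multiply-add via float_muladd_negate_product instead of being
 * stripped with an absolute value.  (Explanatory summary only.)
 */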
5128
5129 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm,
5130 float_status *s, uint32_t desc)
5131 {
5132 static const float16 coeff[16] = {
5133 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5134 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5135 };
5136 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5137 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5138 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5139 float16 *d = vd, *n = vn, *m = vm;
5140
5141 for (i = 0; i < opr_sz; i++) {
5142 float16 mm = m[i];
5143 intptr_t xx = x;
5144 int flags = 0;
5145
5146 if (float16_is_neg(mm)) {
5147 if (fpcr_ah) {
5148 flags = float_muladd_negate_product;
5149 } else {
5150 mm = float16_abs(mm);
5151 }
5152 xx += 8;
5153 }
5154 d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s);
5155 }
5156 }
5157
5158 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm,
5159 float_status *s, uint32_t desc)
5160 {
5161 static const float32 coeff[16] = {
5162 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5163 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5164 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5165 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5166 };
5167 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5168 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5169 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5170 float32 *d = vd, *n = vn, *m = vm;
5171
5172 for (i = 0; i < opr_sz; i++) {
5173 float32 mm = m[i];
5174 intptr_t xx = x;
5175 int flags = 0;
5176
5177 if (float32_is_neg(mm)) {
5178 if (fpcr_ah) {
5179 flags = float_muladd_negate_product;
5180 } else {
5181 mm = float32_abs(mm);
5182 }
5183 xx += 8;
5184 }
5185 d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s);
5186 }
5187 }
5188
5189 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm,
5190 float_status *s, uint32_t desc)
5191 {
5192 static const float64 coeff[16] = {
5193 0x3ff0000000000000ull, 0xbfc5555555555543ull,
5194 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5195 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5196 0x3de5d8408868552full, 0x0000000000000000ull,
5197 0x3ff0000000000000ull, 0xbfe0000000000000ull,
5198 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5199 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5200 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5201 };
5202 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5203 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5204 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5205 float64 *d = vd, *n = vn, *m = vm;
5206
5207 for (i = 0; i < opr_sz; i++) {
5208 float64 mm = m[i];
5209 intptr_t xx = x;
5210 int flags = 0;
5211
5212 if (float64_is_neg(mm)) {
5213 if (fpcr_ah) {
5214 flags = float_muladd_negate_product;
5215 } else {
5216 mm = float64_abs(mm);
5217 }
5218 xx += 8;
5219 }
5220 d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s);
5221 }
5222 }
5223
5224 /*
5225 * FP Complex Add
5226 */
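/*
 * Viewed as complex arithmetic (a summary derived from the code below),
 * each (real, imag) element pair of Zm is effectively multiplied by
 * +/-i before being added to Zn:
 *   rot = 0 (#90):   d.real = n.real - m.imag;  d.imag = n.imag + m.real
 *   rot = 1 (#270):  d.real = n.real + m.imag;  d.imag = n.imag - m.real
 * float*_maybe_ah_chs performs the sign change, honouring the FPCR.AH
 * treatment of NaN inputs.
 */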
5227
5228 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5229 float_status *s, uint32_t desc)
5230 {
5231 intptr_t j, i = simd_oprsz(desc);
5232 uint64_t *g = vg;
5233 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5234 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5235
5236 do {
5237 uint64_t pg = g[(i - 1) >> 6];
5238 do {
5239 float16 e0, e1, e2, e3;
5240
5241 /* I holds the real index; J holds the imag index. */
5242 j = i - sizeof(float16);
5243 i -= 2 * sizeof(float16);
5244
5245 e0 = *(float16 *)(vn + H1_2(i));
5246 e1 = *(float16 *)(vm + H1_2(j));
5247 e2 = *(float16 *)(vn + H1_2(j));
5248 e3 = *(float16 *)(vm + H1_2(i));
5249
5250 if (rot) {
5251 e3 = float16_maybe_ah_chs(e3, fpcr_ah);
5252 } else {
5253 e1 = float16_maybe_ah_chs(e1, fpcr_ah);
5254 }
5255
5256 if (likely((pg >> (i & 63)) & 1)) {
5257 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s);
5258 }
5259 if (likely((pg >> (j & 63)) & 1)) {
5260 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s);
5261 }
5262 } while (i & 63);
5263 } while (i != 0);
5264 }
5265
5266 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5267 float_status *s, uint32_t desc)
5268 {
5269 intptr_t j, i = simd_oprsz(desc);
5270 uint64_t *g = vg;
5271 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5272 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5273
5274 do {
5275 uint64_t pg = g[(i - 1) >> 6];
5276 do {
5277 float32 e0, e1, e2, e3;
5278
5279 /* I holds the real index; J holds the imag index. */
5280 j = i - sizeof(float32);
5281 i -= 2 * sizeof(float32);
5282
5283 e0 = *(float32 *)(vn + H1_2(i));
5284 e1 = *(float32 *)(vm + H1_2(j));
5285 e2 = *(float32 *)(vn + H1_2(j));
5286 e3 = *(float32 *)(vm + H1_2(i));
5287
5288 if (rot) {
5289 e3 = float32_maybe_ah_chs(e3, fpcr_ah);
5290 } else {
5291 e1 = float32_maybe_ah_chs(e1, fpcr_ah);
5292 }
5293
5294 if (likely((pg >> (i & 63)) & 1)) {
5295 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s);
5296 }
5297 if (likely((pg >> (j & 63)) & 1)) {
5298 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s);
5299 }
5300 } while (i & 63);
5301 } while (i != 0);
5302 }
5303
5304 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5305 float_status *s, uint32_t desc)
5306 {
5307 intptr_t j, i = simd_oprsz(desc);
5308 uint64_t *g = vg;
5309 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5310 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5311
5312 do {
5313 uint64_t pg = g[(i - 1) >> 6];
5314 do {
5315 float64 e0, e1, e2, e3;
5316
5317 /* I holds the real index; J holds the imag index. */
5318 j = i - sizeof(float64);
5319 i -= 2 * sizeof(float64);
5320
5321 e0 = *(float64 *)(vn + H1_2(i));
5322 e1 = *(float64 *)(vm + H1_2(j));
5323 e2 = *(float64 *)(vn + H1_2(j));
5324 e3 = *(float64 *)(vm + H1_2(i));
5325
5326 if (rot) {
5327 e3 = float64_maybe_ah_chs(e3, fpcr_ah);
5328 } else {
5329 e1 = float64_maybe_ah_chs(e1, fpcr_ah);
5330 }
5331
5332 if (likely((pg >> (i & 63)) & 1)) {
5333 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s);
5334 }
5335 if (likely((pg >> (j & 63)) & 1)) {
5336 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s);
5337 }
5338 } while (i & 63);
5339 } while (i != 0);
5340 }
5341
5342 /*
5343 * FP Complex Multiply
5344 */
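/*
 * A summary of the parameterisation, derived from the code below: desc
 * carries a 'flip' bit (use the imaginary element of Zn as the
 * multiplicand) and a negate-imaginary bit; negate-real is then
 * flip ^ negate-imaginary.  The four FCMLA rotations map as
 *   (flip, neg_imag) = (0, 0) -> #0,   (1, 0) -> #90,
 *                      (0, 1) -> #180, (1, 1) -> #270.
 * With FPCR.AH clear the negation is applied by flipping the sign bit
 * of the multiplier (negx_*); with FPCR.AH set it is requested through
 * float_muladd_negate_product inside the fused operation.
 */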
5345
5346 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5347 void *vg, float_status *status, uint32_t desc)
5348 {
5349 intptr_t j, i = simd_oprsz(desc);
5350 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5351 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5352 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5353 uint32_t negf_real = flip ^ negf_imag;
5354 float16 negx_imag, negx_real;
5355 uint64_t *g = vg;
5356
5357 /* With AH=0, use negx; with AH=1 use negf. */
5358 negx_real = (negf_real & ~fpcr_ah) << 15;
5359 negx_imag = (negf_imag & ~fpcr_ah) << 15;
5360 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5361 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5362
5363 do {
5364 uint64_t pg = g[(i - 1) >> 6];
5365 do {
5366 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5367
5368 /* I holds the real index; J holds the imag index. */
5369 j = i - sizeof(float16);
5370 i -= 2 * sizeof(float16);
5371
5372 nr = *(float16 *)(vn + H1_2(i));
5373 ni = *(float16 *)(vn + H1_2(j));
5374 mr = *(float16 *)(vm + H1_2(i));
5375 mi = *(float16 *)(vm + H1_2(j));
5376
5377 e2 = (flip ? ni : nr);
5378 e1 = (flip ? mi : mr) ^ negx_real;
5379 e4 = e2;
5380 e3 = (flip ? mr : mi) ^ negx_imag;
5381
5382 if (likely((pg >> (i & 63)) & 1)) {
5383 d = *(float16 *)(va + H1_2(i));
5384 d = float16_muladd(e2, e1, d, negf_real, status);
5385 *(float16 *)(vd + H1_2(i)) = d;
5386 }
5387 if (likely((pg >> (j & 63)) & 1)) {
5388 d = *(float16 *)(va + H1_2(j));
5389 d = float16_muladd(e4, e3, d, negf_imag, status);
5390 *(float16 *)(vd + H1_2(j)) = d;
5391 }
5392 } while (i & 63);
5393 } while (i != 0);
5394 }
5395
5396 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5397 void *vg, float_status *status, uint32_t desc)
5398 {
5399 intptr_t j, i = simd_oprsz(desc);
5400 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5401 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5402 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5403 uint32_t negf_real = flip ^ negf_imag;
5404 float32 negx_imag, negx_real;
5405 uint64_t *g = vg;
5406
5407 /* With AH=0, use negx; with AH=1 use negf. */
5408 negx_real = (negf_real & ~fpcr_ah) << 31;
5409 negx_imag = (negf_imag & ~fpcr_ah) << 31;
5410 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5411 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5412
5413 do {
5414 uint64_t pg = g[(i - 1) >> 6];
5415 do {
5416 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5417
5418 /* I holds the real index; J holds the imag index. */
5419 j = i - sizeof(float32);
5420 i -= 2 * sizeof(float32);
5421
5422 nr = *(float32 *)(vn + H1_2(i));
5423 ni = *(float32 *)(vn + H1_2(j));
5424 mr = *(float32 *)(vm + H1_2(i));
5425 mi = *(float32 *)(vm + H1_2(j));
5426
5427 e2 = (flip ? ni : nr);
5428 e1 = (flip ? mi : mr) ^ negx_real;
5429 e4 = e2;
5430 e3 = (flip ? mr : mi) ^ negx_imag;
5431
5432 if (likely((pg >> (i & 63)) & 1)) {
5433 d = *(float32 *)(va + H1_2(i));
5434 d = float32_muladd(e2, e1, d, negf_real, status);
5435 *(float32 *)(vd + H1_2(i)) = d;
5436 }
5437 if (likely((pg >> (j & 63)) & 1)) {
5438 d = *(float32 *)(va + H1_2(j));
5439 d = float32_muladd(e4, e3, d, negf_imag, status);
5440 *(float32 *)(vd + H1_2(j)) = d;
5441 }
5442 } while (i & 63);
5443 } while (i != 0);
5444 }
5445
5446 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5447 void *vg, float_status *status, uint32_t desc)
5448 {
5449 intptr_t j, i = simd_oprsz(desc);
5450 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5451 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5452 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5453 uint32_t negf_real = flip ^ negf_imag;
5454 float64 negx_imag, negx_real;
5455 uint64_t *g = vg;
5456
5457 /* With AH=0, use negx; with AH=1 use negf. */
5458 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
5459 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
5460 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5461 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5462
5463 do {
5464 uint64_t pg = g[(i - 1) >> 6];
5465 do {
5466 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5467
5468 /* I holds the real index; J holds the imag index. */
5469 j = i - sizeof(float64);
5470 i -= 2 * sizeof(float64);
5471
5472 nr = *(float64 *)(vn + H1_2(i));
5473 ni = *(float64 *)(vn + H1_2(j));
5474 mr = *(float64 *)(vm + H1_2(i));
5475 mi = *(float64 *)(vm + H1_2(j));
5476
5477 e2 = (flip ? ni : nr);
5478 e1 = (flip ? mi : mr) ^ negx_real;
5479 e4 = e2;
5480 e3 = (flip ? mr : mi) ^ negx_imag;
5481
5482 if (likely((pg >> (i & 63)) & 1)) {
5483 d = *(float64 *)(va + H1_2(i));
5484 d = float64_muladd(e2, e1, d, negf_real, status);
5485 *(float64 *)(vd + H1_2(i)) = d;
5486 }
5487 if (likely((pg >> (j & 63)) & 1)) {
5488 d = *(float64 *)(va + H1_2(j));
5489 d = float64_muladd(e4, e3, d, negf_imag, status);
5490 *(float64 *)(vd + H1_2(j)) = d;
5491 }
5492 } while (i & 63);
5493 } while (i != 0);
5494 }
5495
5496 /*
5497 * Load contiguous data, protected by a governing predicate.
5498 */
5499
5500 /*
5501 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5502 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
5503 * element >= @reg_off, or @reg_max if there were no active elements at all.
5504 */
5505 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5506 intptr_t reg_max, int esz)
5507 {
5508 uint64_t pg_mask = pred_esz_masks[esz];
5509 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5510
5511 /* In normal usage, the first element is active. */
5512 if (likely(pg & 1)) {
5513 return reg_off;
5514 }
5515
5516 if (pg == 0) {
5517 reg_off &= -64;
5518 do {
5519 reg_off += 64;
5520 if (unlikely(reg_off >= reg_max)) {
5521 /* The entire predicate was false. */
5522 return reg_max;
5523 }
5524 pg = vg[reg_off >> 6] & pg_mask;
5525 } while (pg == 0);
5526 }
5527 reg_off += ctz64(pg);
5528
5529 /* We should never see an out of range predicate bit set. */
5530 tcg_debug_assert(reg_off < reg_max);
5531 return reg_off;
5532 }
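/*
 * For example (illustrative values): with esz == MO_32,
 * pred_esz_masks[esz] == 0x1111111111111111ull.  If vg[0] == 0x1100
 * (active 4-byte elements at byte offsets 8 and 12) and reg_off == 4,
 * then pg == 0x110; the low bit is clear, pg is non-zero, and
 * reg_off += ctz64(pg) == 4 yields 8, the first active offset >= 4.
 */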
5533
5534 /*
5535 * Resolve the guest virtual address to info->host and info->flags.
5536 * If @nofault, return false if the page is invalid, otherwise
5537 * exit via page fault exception.
5538 */
5539
5540 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5541 target_ulong addr, int mem_off, MMUAccessType access_type,
5542 int mmu_idx, uintptr_t retaddr)
5543 {
5544 int flags;
5545
5546 addr += mem_off;
5547
5548 /*
5549 * User-only currently always issues with TBI. See the comment
5550 * above useronly_clean_ptr. Usually we clean this top byte away
5551 * during translation, but we can't do that for e.g. vector + imm
5552 * addressing modes.
5553 *
5554 * We currently always enable TBI for user-only, and do not provide
5555 * a way to turn it off. So clean the pointer unconditionally here,
5556 * rather than look it up here, or pass it down from above.
5557 */
5558 addr = useronly_clean_ptr(addr);
5559
5560 #ifdef CONFIG_USER_ONLY
5561 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5562 &info->host, retaddr);
5563 #else
5564 CPUTLBEntryFull *full;
5565 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5566 &info->host, &full, retaddr);
5567 #endif
5568 info->flags = flags;
5569
5570 if (flags & TLB_INVALID_MASK) {
5571 g_assert(nofault);
5572 return false;
5573 }
5574
5575 #ifdef CONFIG_USER_ONLY
5576 memset(&info->attrs, 0, sizeof(info->attrs));
5577 /* Require both ANON and MTE; see allocation_tag_mem(). */
5578 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5579 #else
5580 info->attrs = full->attrs;
5581 info->tagged = full->extra.arm.pte_attrs == 0xf0;
5582 #endif
5583
5584 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5585 info->host -= mem_off;
5586 return true;
5587 }
5588
5589 /*
5590 * Find first active element on each page, and a loose bound for the
5591 * final element on each page. Identify any single element that spans
5592 * the page boundary. Return true if there are any active elements.
5593 */
5594 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5595 intptr_t reg_max, int esz, int msize)
5596 {
5597 const int esize = 1 << esz;
5598 const uint64_t pg_mask = pred_esz_masks[esz];
5599 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5600 intptr_t mem_off_last, mem_off_split;
5601 intptr_t page_split, elt_split;
5602 intptr_t i;
5603
5604 /* Set all of the element indices to -1, and the TLB data to 0. */
5605 memset(info, -1, offsetof(SVEContLdSt, page));
5606 memset(info->page, 0, sizeof(info->page));
5607
5608 /* Gross scan over the entire predicate to find bounds. */
5609 i = 0;
5610 do {
5611 uint64_t pg = vg[i] & pg_mask;
5612 if (pg) {
5613 reg_off_last = i * 64 + 63 - clz64(pg);
5614 if (reg_off_first < 0) {
5615 reg_off_first = i * 64 + ctz64(pg);
5616 }
5617 }
5618 } while (++i * 64 < reg_max);
5619
5620 if (unlikely(reg_off_first < 0)) {
5621 /* No active elements, no pages touched. */
5622 return false;
5623 }
5624 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5625
5626 info->reg_off_first[0] = reg_off_first;
5627 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5628 mem_off_last = (reg_off_last >> esz) * msize;
5629
5630 page_split = -(addr | TARGET_PAGE_MASK);
5631 if (likely(mem_off_last + msize <= page_split)) {
5632 /* The entire operation fits within a single page. */
5633 info->reg_off_last[0] = reg_off_last;
5634 return true;
5635 }
5636
5637 info->page_split = page_split;
5638 elt_split = page_split / msize;
5639 reg_off_split = elt_split << esz;
5640 mem_off_split = elt_split * msize;
5641
5642 /*
5643 * This is the last full element on the first page, but it is not
5644 * necessarily active. If there is no full element, i.e. the first
5645 * active element is the one that's split, this value remains -1.
5646  * It is useful as an iteration bound.
5647 */
5648 if (elt_split != 0) {
5649 info->reg_off_last[0] = reg_off_split - esize;
5650 }
5651
5652 /* Determine if an unaligned element spans the pages. */
5653 if (page_split % msize != 0) {
5654 /* It is helpful to know if the split element is active. */
5655 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5656 info->reg_off_split = reg_off_split;
5657 info->mem_off_split = mem_off_split;
5658
5659 if (reg_off_split == reg_off_last) {
5660 /* The page crossing element is last. */
5661 return true;
5662 }
5663 }
5664 reg_off_split += esize;
5665 mem_off_split += msize;
5666 }
5667
5668 /*
5669 * We do want the first active element on the second page, because
5670 * this may affect the address reported in an exception.
5671 */
5672 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5673 tcg_debug_assert(reg_off_split <= reg_off_last);
5674 info->reg_off_first[1] = reg_off_split;
5675 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5676 info->reg_off_last[1] = reg_off_last;
5677 return true;
5678 }
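/*
 * A worked example (illustrative only): reg_max = 32, esz = MO_64,
 * msize = 8, all elements active, and addr 20 bytes below a page
 * boundary.  Then page_split = 20 and elt_split = 2, so
 * reg_off_last[0] = 8 (element 1 is the last whole element on the
 * first page).  Since 20 % 8 != 0, element 2 straddles the boundary:
 * reg_off_split = mem_off_split = 16, and reg_off_first[1] = 24 is the
 * first element wholly on the second page.
 */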
5679
5680 /*
5681 * Resolve the guest virtual addresses to info->page[].
5682 * Control the generation of page faults with @fault. Return false if
5683 * there is no work to do, which can only happen with @fault == FAULT_NO.
5684 */
5685 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5686 CPUARMState *env, target_ulong addr,
5687 MMUAccessType access_type, uintptr_t retaddr)
5688 {
5689 int mmu_idx = arm_env_mmu_index(env);
5690 int mem_off = info->mem_off_first[0];
5691 bool nofault = fault == FAULT_NO;
5692 bool have_work = true;
5693
5694 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5695 access_type, mmu_idx, retaddr)) {
5696 /* No work to be done. */
5697 return false;
5698 }
5699
5700 if (likely(info->page_split < 0)) {
5701 /* The entire operation was on the one page. */
5702 return true;
5703 }
5704
5705 /*
5706 * If the second page is invalid, then we want the fault address to be
5707 * the first byte on that page which is accessed.
5708 */
5709 if (info->mem_off_split >= 0) {
5710 /*
5711 * There is an element split across the pages. The fault address
5712 * should be the first byte of the second page.
5713 */
5714 mem_off = info->page_split;
5715 /*
5716 * If the split element is also the first active element
5717 * of the vector, then: For first-fault we should continue
5718 * to generate faults for the second page. For no-fault,
5719 * we have work only if the second page is valid.
5720 */
5721 if (info->mem_off_first[0] < info->mem_off_split) {
5722 nofault = FAULT_FIRST;
5723 have_work = false;
5724 }
5725 } else {
5726 /*
5727 * There is no element split across the pages. The fault address
5728 * should be the first active element on the second page.
5729 */
5730 mem_off = info->mem_off_first[1];
5731 /*
5732 * There must have been one active element on the first page,
5733 * so we're out of first-fault territory.
5734 */
5735 nofault = fault != FAULT_ALL;
5736 }
5737
5738 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5739 access_type, mmu_idx, retaddr);
5740 return have_work;
5741 }
5742
5743 #ifndef CONFIG_USER_ONLY
5744 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5745 uint64_t *vg, target_ulong addr,
5746 int esize, int msize, int wp_access,
5747 uintptr_t retaddr)
5748 {
5749 intptr_t mem_off, reg_off, reg_last;
5750 int flags0 = info->page[0].flags;
5751 int flags1 = info->page[1].flags;
5752
5753 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5754 return;
5755 }
5756
5757 /* Indicate that watchpoints are handled. */
5758 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5759 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5760
5761 if (flags0 & TLB_WATCHPOINT) {
5762 mem_off = info->mem_off_first[0];
5763 reg_off = info->reg_off_first[0];
5764 reg_last = info->reg_off_last[0];
5765
5766 while (reg_off <= reg_last) {
5767 uint64_t pg = vg[reg_off >> 6];
5768 do {
5769 if ((pg >> (reg_off & 63)) & 1) {
5770 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5771 msize, info->page[0].attrs,
5772 wp_access, retaddr);
5773 }
5774 reg_off += esize;
5775 mem_off += msize;
5776 } while (reg_off <= reg_last && (reg_off & 63));
5777 }
5778 }
5779
5780 mem_off = info->mem_off_split;
5781 if (mem_off >= 0) {
5782 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5783 info->page[0].attrs, wp_access, retaddr);
5784 }
5785
5786 mem_off = info->mem_off_first[1];
5787 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5788 reg_off = info->reg_off_first[1];
5789 reg_last = info->reg_off_last[1];
5790
5791 do {
5792 uint64_t pg = vg[reg_off >> 6];
5793 do {
5794 if ((pg >> (reg_off & 63)) & 1) {
5795 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5796 msize, info->page[1].attrs,
5797 wp_access, retaddr);
5798 }
5799 reg_off += esize;
5800 mem_off += msize;
5801 } while (reg_off & 63);
5802 } while (reg_off <= reg_last);
5803 }
5804 }
5805 #endif
5806
5807 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5808 uint64_t *vg, target_ulong addr, int esize,
5809 int msize, uint32_t mtedesc, uintptr_t ra)
5810 {
5811 intptr_t mem_off, reg_off, reg_last;
5812
5813 /* Process the page only if MemAttr == Tagged. */
5814 if (info->page[0].tagged) {
5815 mem_off = info->mem_off_first[0];
5816 reg_off = info->reg_off_first[0];
5817 reg_last = info->reg_off_split;
5818 if (reg_last < 0) {
5819 reg_last = info->reg_off_last[0];
5820 }
5821
5822 do {
5823 uint64_t pg = vg[reg_off >> 6];
5824 do {
5825 if ((pg >> (reg_off & 63)) & 1) {
5826 mte_check(env, mtedesc, addr, ra);
5827 }
5828 reg_off += esize;
5829 mem_off += msize;
5830 } while (reg_off <= reg_last && (reg_off & 63));
5831 } while (reg_off <= reg_last);
5832 }
5833
5834 mem_off = info->mem_off_first[1];
5835 if (mem_off >= 0 && info->page[1].tagged) {
5836 reg_off = info->reg_off_first[1];
5837 reg_last = info->reg_off_last[1];
5838
5839 do {
5840 uint64_t pg = vg[reg_off >> 6];
5841 do {
5842 if ((pg >> (reg_off & 63)) & 1) {
5843 mte_check(env, mtedesc, addr, ra);
5844 }
5845 reg_off += esize;
5846 mem_off += msize;
5847 } while (reg_off & 63);
5848 } while (reg_off <= reg_last);
5849 }
5850 }
5851
5852 /*
5853  * Common helper for all contiguous 1,2,3,4-register predicated loads.
5854 */
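/*
 * For instance, sve_ld2hh_le_r (generated by DO_LDN_2(2, hh, MO_16)
 * below) arrives here with esz = msz = MO_16 and N = 2.  Inactive
 * elements of the destination registers are zeroed; active elements
 * are loaded directly from host memory when possible, or through the
 * per-element tlb function for MMIO and cross-page cases.
 */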
5855 static inline QEMU_ALWAYS_INLINE
5856 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5857 uint32_t desc, const uintptr_t retaddr,
5858 const int esz, const int msz, const int N, uint32_t mtedesc,
5859 sve_ldst1_host_fn *host_fn,
5860 sve_ldst1_tlb_fn *tlb_fn)
5861 {
5862 const unsigned rd = simd_data(desc);
5863 const intptr_t reg_max = simd_oprsz(desc);
5864 intptr_t reg_off, reg_last, mem_off;
5865 SVEContLdSt info;
5866 void *host;
5867 int flags, i;
5868
5869 /* Find the active elements. */
5870 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5871 /* The entire predicate was false; no load occurs. */
5872 for (i = 0; i < N; ++i) {
5873 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5874 }
5875 return;
5876 }
5877
5878 /* Probe the page(s). Exit with exception for any invalid page. */
5879 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5880
5881 /* Handle watchpoints for all active elements. */
5882 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5883 BP_MEM_READ, retaddr);
5884
5885 /*
5886 * Handle mte checks for all active elements.
5887 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5888 */
5889 if (mtedesc) {
5890 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5891 mtedesc, retaddr);
5892 }
5893
5894 flags = info.page[0].flags | info.page[1].flags;
5895 if (unlikely(flags != 0)) {
5896 /*
5897 * At least one page includes MMIO.
5898 * Any bus operation can fail with cpu_transaction_failed,
5899 * which for ARM will raise SyncExternal. Perform the load
5900 * into scratch memory to preserve register state until the end.
5901 */
5902 ARMVectorReg scratch[4] = { };
5903
5904 mem_off = info.mem_off_first[0];
5905 reg_off = info.reg_off_first[0];
5906 reg_last = info.reg_off_last[1];
5907 if (reg_last < 0) {
5908 reg_last = info.reg_off_split;
5909 if (reg_last < 0) {
5910 reg_last = info.reg_off_last[0];
5911 }
5912 }
5913
5914 do {
5915 uint64_t pg = vg[reg_off >> 6];
5916 do {
5917 if ((pg >> (reg_off & 63)) & 1) {
5918 for (i = 0; i < N; ++i) {
5919 tlb_fn(env, &scratch[i], reg_off,
5920 addr + mem_off + (i << msz), retaddr);
5921 }
5922 }
5923 reg_off += 1 << esz;
5924 mem_off += N << msz;
5925 } while (reg_off & 63);
5926 } while (reg_off <= reg_last);
5927
5928 for (i = 0; i < N; ++i) {
5929 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5930 }
5931 return;
5932 }
5933
5934 /* The entire operation is in RAM, on valid pages. */
5935
5936 for (i = 0; i < N; ++i) {
5937 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5938 }
5939
5940 mem_off = info.mem_off_first[0];
5941 reg_off = info.reg_off_first[0];
5942 reg_last = info.reg_off_last[0];
5943 host = info.page[0].host;
5944
5945 set_helper_retaddr(retaddr);
5946
5947 while (reg_off <= reg_last) {
5948 uint64_t pg = vg[reg_off >> 6];
5949 do {
5950 if ((pg >> (reg_off & 63)) & 1) {
5951 for (i = 0; i < N; ++i) {
5952 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5953 host + mem_off + (i << msz));
5954 }
5955 }
5956 reg_off += 1 << esz;
5957 mem_off += N << msz;
5958 } while (reg_off <= reg_last && (reg_off & 63));
5959 }
5960
5961 clear_helper_retaddr();
5962
5963 /*
5964 * Use the slow path to manage the cross-page misalignment.
5965 * But we know this is RAM and cannot trap.
5966 */
5967 mem_off = info.mem_off_split;
5968 if (unlikely(mem_off >= 0)) {
5969 reg_off = info.reg_off_split;
5970 for (i = 0; i < N; ++i) {
5971 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5972 addr + mem_off + (i << msz), retaddr);
5973 }
5974 }
5975
5976 mem_off = info.mem_off_first[1];
5977 if (unlikely(mem_off >= 0)) {
5978 reg_off = info.reg_off_first[1];
5979 reg_last = info.reg_off_last[1];
5980 host = info.page[1].host;
5981
5982 set_helper_retaddr(retaddr);
5983
5984 do {
5985 uint64_t pg = vg[reg_off >> 6];
5986 do {
5987 if ((pg >> (reg_off & 63)) & 1) {
5988 for (i = 0; i < N; ++i) {
5989 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5990 host + mem_off + (i << msz));
5991 }
5992 }
5993 reg_off += 1 << esz;
5994 mem_off += N << msz;
5995 } while (reg_off & 63);
5996 } while (reg_off <= reg_last);
5997
5998 clear_helper_retaddr();
5999 }
6000 }
6001
6002 static inline QEMU_ALWAYS_INLINE
6003 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6004 uint32_t desc, const uintptr_t ra,
6005 const int esz, const int msz, const int N,
6006 sve_ldst1_host_fn *host_fn,
6007 sve_ldst1_tlb_fn *tlb_fn)
6008 {
6009 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6010 int bit55 = extract64(addr, 55, 1);
6011
6012 /* Remove mtedesc from the normal sve descriptor. */
6013 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6014
6015 /* Perform gross MTE suppression early. */
6016 if (!tbi_check(mtedesc, bit55) ||
6017 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6018 mtedesc = 0;
6019 }
6020
6021 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6022 }
6023
6024 #define DO_LD1_1(NAME, ESZ) \
6025 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
6026 target_ulong addr, uint32_t desc) \
6027 { \
6028 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
6029 sve_##NAME##_host, sve_##NAME##_tlb); \
6030 } \
6031 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
6032 target_ulong addr, uint32_t desc) \
6033 { \
6034 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
6035 sve_##NAME##_host, sve_##NAME##_tlb); \
6036 }
6037
6038 #define DO_LD1_2(NAME, ESZ, MSZ) \
6039 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
6040 target_ulong addr, uint32_t desc) \
6041 { \
6042 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6043 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6044 } \
6045 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
6046 target_ulong addr, uint32_t desc) \
6047 { \
6048 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6049 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6050 } \
6051 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6052 target_ulong addr, uint32_t desc) \
6053 { \
6054 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6055 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6056 } \
6057 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6058 target_ulong addr, uint32_t desc) \
6059 { \
6060 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6061 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6062 }
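/*
 * As an illustration of the expansions that follow: DO_LD1_2(ld1hsu,
 * MO_32, MO_16) emits sve_ld1hsu_le_r, sve_ld1hsu_be_r and their _mte
 * variants, each loading 16-bit halfwords (msz = MO_16) and widening
 * them into 32-bit vector elements (esz = MO_32).
 */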
6063
6064 DO_LD1_1(ld1bb, MO_8)
6065 DO_LD1_1(ld1bhu, MO_16)
6066 DO_LD1_1(ld1bhs, MO_16)
6067 DO_LD1_1(ld1bsu, MO_32)
6068 DO_LD1_1(ld1bss, MO_32)
6069 DO_LD1_1(ld1bdu, MO_64)
6070 DO_LD1_1(ld1bds, MO_64)
6071
6072 DO_LD1_2(ld1hh, MO_16, MO_16)
6073 DO_LD1_2(ld1hsu, MO_32, MO_16)
6074 DO_LD1_2(ld1hss, MO_32, MO_16)
6075 DO_LD1_2(ld1hdu, MO_64, MO_16)
6076 DO_LD1_2(ld1hds, MO_64, MO_16)
6077
6078 DO_LD1_2(ld1ss, MO_32, MO_32)
6079 DO_LD1_2(ld1sdu, MO_64, MO_32)
6080 DO_LD1_2(ld1sds, MO_64, MO_32)
6081
6082 DO_LD1_2(ld1dd, MO_64, MO_64)
6083
6084 #undef DO_LD1_1
6085 #undef DO_LD1_2
6086
6087 #define DO_LDN_1(N) \
6088 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
6089 target_ulong addr, uint32_t desc) \
6090 { \
6091 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
6092 sve_ld1bb_host, sve_ld1bb_tlb); \
6093 } \
6094 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
6095 target_ulong addr, uint32_t desc) \
6096 { \
6097 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
6098 sve_ld1bb_host, sve_ld1bb_tlb); \
6099 }
6100
6101 #define DO_LDN_2(N, SUFF, ESZ) \
6102 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
6103 target_ulong addr, uint32_t desc) \
6104 { \
6105 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6106 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6107 } \
6108 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
6109 target_ulong addr, uint32_t desc) \
6110 { \
6111 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6112 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6113 } \
6114 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
6115 target_ulong addr, uint32_t desc) \
6116 { \
6117 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6118 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6119 } \
6120 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
6121 target_ulong addr, uint32_t desc) \
6122 { \
6123 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6124 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6125 }
6126
6127 DO_LDN_1(2)
6128 DO_LDN_1(3)
6129 DO_LDN_1(4)
6130
6131 DO_LDN_2(2, hh, MO_16)
6132 DO_LDN_2(3, hh, MO_16)
6133 DO_LDN_2(4, hh, MO_16)
6134
6135 DO_LDN_2(2, ss, MO_32)
6136 DO_LDN_2(3, ss, MO_32)
6137 DO_LDN_2(4, ss, MO_32)
6138
6139 DO_LDN_2(2, dd, MO_64)
6140 DO_LDN_2(3, dd, MO_64)
6141 DO_LDN_2(4, dd, MO_64)
6142
6143 #undef DO_LDN_1
6144 #undef DO_LDN_2
6145
6146 /*
6147 * Load contiguous data, first-fault and no-fault.
6148 *
6149 * For user-only, we control the race between page_check_range and
6150 * another thread's munmap by using set/clear_helper_retaddr. Any
6151 * SEGV that occurs between those markers is assumed to be because
6152 * the guest page vanished. Keep that block as small as possible
6153 * so that unrelated QEMU bugs are not blamed on the guest.
6154 */
6155
6156 /* Fault on byte I. All bits in FFR from I are cleared. The vector
6157 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6158 * option, which leaves subsequent data unchanged.
6159 */
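/*
 * For example, a fault recorded at byte offset i == 10 with oprsz == 32
 * keeps FFR bits 0..9, clears bits 10..63 of the first predicate word,
 * and, since ROUND_UP(10, 64) >= oprsz, leaves nothing further to zero.
 */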
6160 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6161 {
6162 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6163
6164 if (i & 63) {
6165 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6166 i = ROUND_UP(i, 64);
6167 }
6168 for (; i < oprsz; i += 64) {
6169 ffr[i / 64] = 0;
6170 }
6171 }
6172
6173 /*
6174 * Common helper for all contiguous no-fault and first-fault loads.
6175 */
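/*
 * Recap of the fault modes handled here: FAULT_FIRST (LDFF1*) may take
 * a real fault only for the first active element; any later element
 * that cannot be loaded is reported by clearing FFR from that element
 * onwards via record_fault().  FAULT_NO (LDNF1*) never takes a fault
 * and likewise reports failure through FFR.
 */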
6176 static inline QEMU_ALWAYS_INLINE
6177 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6178 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6179 const int esz, const int msz, const SVEContFault fault,
6180 sve_ldst1_host_fn *host_fn,
6181 sve_ldst1_tlb_fn *tlb_fn)
6182 {
6183 const unsigned rd = simd_data(desc);
6184 void *vd = &env->vfp.zregs[rd];
6185 const intptr_t reg_max = simd_oprsz(desc);
6186 intptr_t reg_off, mem_off, reg_last;
6187 SVEContLdSt info;
6188 int flags;
6189 void *host;
6190
6191 /* Find the active elements. */
6192 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6193 /* The entire predicate was false; no load occurs. */
6194 memset(vd, 0, reg_max);
6195 return;
6196 }
6197 reg_off = info.reg_off_first[0];
6198
6199 /* Probe the page(s). */
6200 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6201 /* Fault on first element. */
6202 tcg_debug_assert(fault == FAULT_NO);
6203 memset(vd, 0, reg_max);
6204 goto do_fault;
6205 }
6206
6207 mem_off = info.mem_off_first[0];
6208 flags = info.page[0].flags;
6209
6210 /*
6211 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6212 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6213 */
6214 if (!info.page[0].tagged) {
6215 mtedesc = 0;
6216 }
6217
6218 if (fault == FAULT_FIRST) {
6219 /* Trapping mte check for the first-fault element. */
6220 if (mtedesc) {
6221 mte_check(env, mtedesc, addr + mem_off, retaddr);
6222 }
6223
6224 /*
6225 * Special handling of the first active element,
6226 * if it crosses a page boundary or is MMIO.
6227 */
6228 bool is_split = mem_off == info.mem_off_split;
6229 if (unlikely(flags != 0) || unlikely(is_split)) {
6230 /*
6231 * Use the slow path for cross-page handling.
6232 * Might trap for MMIO or watchpoints.
6233 */
6234 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6235
6236 /* After any fault, zero the other elements. */
6237 swap_memzero(vd, reg_off);
6238 reg_off += 1 << esz;
6239 mem_off += 1 << msz;
6240 swap_memzero(vd + reg_off, reg_max - reg_off);
6241
6242 if (is_split) {
6243 goto second_page;
6244 }
6245 } else {
6246 memset(vd, 0, reg_max);
6247 }
6248 } else {
6249 memset(vd, 0, reg_max);
6250 if (unlikely(mem_off == info.mem_off_split)) {
6251 /* The first active element crosses a page boundary. */
6252 flags |= info.page[1].flags;
6253 if (unlikely(flags & TLB_MMIO)) {
6254 /* Some page is MMIO, see below. */
6255 goto do_fault;
6256 }
6257 if (unlikely(flags & TLB_WATCHPOINT) &&
6258 (cpu_watchpoint_address_matches
6259 (env_cpu(env), addr + mem_off, 1 << msz)
6260 & BP_MEM_READ)) {
6261 /* Watchpoint hit, see below. */
6262 goto do_fault;
6263 }
6264 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6265 goto do_fault;
6266 }
6267 /*
6268 * Use the slow path for cross-page handling.
6269 * This is RAM, without a watchpoint, and will not trap.
6270 */
6271 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6272 goto second_page;
6273 }
6274 }
6275
6276 /*
6277 * From this point on, all memory operations are MemSingleNF.
6278 *
6279 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6280 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6281 *
6282      * Unfortunately we do not have access to the memory attributes from the
6283 * PTE to tell Device memory from Normal memory. So we make a mostly
6284 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6285 * This gives the right answer for the common cases of "Normal memory,
6286 * backed by host RAM" and "Device memory, backed by MMIO".
6287 * The architecture allows us to suppress an NF load and return
6288 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6289 * case of "Normal memory, backed by MMIO" is permitted. The case we
6290 * get wrong is "Device memory, backed by host RAM", for which we
6291 * should return (UNKNOWN, FAULT) for but do not.
6292      * should return (UNKNOWN, FAULT) but do not.
6293 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6294 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6295 * architectural breakpoints the same.
6296 */
6297 if (unlikely(flags & TLB_MMIO)) {
6298 goto do_fault;
6299 }
6300
6301 reg_last = info.reg_off_last[0];
6302 host = info.page[0].host;
6303
6304 set_helper_retaddr(retaddr);
6305
6306 do {
6307 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6308 do {
6309 if ((pg >> (reg_off & 63)) & 1) {
6310 if (unlikely(flags & TLB_WATCHPOINT) &&
6311 (cpu_watchpoint_address_matches
6312 (env_cpu(env), addr + mem_off, 1 << msz)
6313 & BP_MEM_READ)) {
6314 clear_helper_retaddr();
6315 goto do_fault;
6316 }
6317 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6318 clear_helper_retaddr();
6319 goto do_fault;
6320 }
6321 host_fn(vd, reg_off, host + mem_off);
6322 }
6323 reg_off += 1 << esz;
6324 mem_off += 1 << msz;
6325 } while (reg_off <= reg_last && (reg_off & 63));
6326 } while (reg_off <= reg_last);
6327
6328 clear_helper_retaddr();
6329
6330 /*
6331 * MemSingleNF is allowed to fail for any reason. We have special
6332 * code above to handle the first element crossing a page boundary.
6333 * As an implementation choice, decline to handle a cross-page element
6334 * in any other position.
6335 */
6336 reg_off = info.reg_off_split;
6337 if (reg_off >= 0) {
6338 goto do_fault;
6339 }
6340
6341 second_page:
6342 reg_off = info.reg_off_first[1];
6343 if (likely(reg_off < 0)) {
6344 /* No active elements on the second page. All done. */
6345 return;
6346 }
6347
6348 /*
6349 * MemSingleNF is allowed to fail for any reason. As an implementation
6350 * choice, decline to handle elements on the second page. This should
6351 * be low frequency as the guest walks through memory -- the next
6352 * iteration of the guest's loop should be aligned on the page boundary,
6353 * and then all following iterations will stay aligned.
6354 */
6355
6356 do_fault:
6357 record_fault(env, reg_off, reg_max);
6358 }
6359
6360 static inline QEMU_ALWAYS_INLINE
6361 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6362 uint32_t desc, const uintptr_t retaddr,
6363 const int esz, const int msz, const SVEContFault fault,
6364 sve_ldst1_host_fn *host_fn,
6365 sve_ldst1_tlb_fn *tlb_fn)
6366 {
6367 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6368 int bit55 = extract64(addr, 55, 1);
6369
6370 /* Remove mtedesc from the normal sve descriptor. */
6371 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6372
6373 /* Perform gross MTE suppression early. */
6374 if (!tbi_check(mtedesc, bit55) ||
6375 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6376 mtedesc = 0;
6377 }
6378
6379 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6380 esz, msz, fault, host_fn, tlb_fn);
6381 }
6382
6383 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6384 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6385 target_ulong addr, uint32_t desc) \
6386 { \
6387 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6388 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6389 } \
6390 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6391 target_ulong addr, uint32_t desc) \
6392 { \
6393 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6394 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6395 } \
6396 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6397 target_ulong addr, uint32_t desc) \
6398 { \
6399 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6400 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6401 } \
6402 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6403 target_ulong addr, uint32_t desc) \
6404 { \
6405 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6406 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6407 }
6408
6409 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6410 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6411 target_ulong addr, uint32_t desc) \
6412 { \
6413 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6414 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6415 } \
6416 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6417 target_ulong addr, uint32_t desc) \
6418 { \
6419 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6420 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6421 } \
6422 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6423 target_ulong addr, uint32_t desc) \
6424 { \
6425 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6426 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6427 } \
6428 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6429 target_ulong addr, uint32_t desc) \
6430 { \
6431 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6432 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6433 } \
6434 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6435 target_ulong addr, uint32_t desc) \
6436 { \
6437 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6438 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6439 } \
6440 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6441 target_ulong addr, uint32_t desc) \
6442 { \
6443 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6444 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6445 } \
6446 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6447 target_ulong addr, uint32_t desc) \
6448 { \
6449 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6450 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6451 } \
6452 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6453 target_ulong addr, uint32_t desc) \
6454 { \
6455 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6456 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6457 }
6458
6459 DO_LDFF1_LDNF1_1(bb, MO_8)
6460 DO_LDFF1_LDNF1_1(bhu, MO_16)
6461 DO_LDFF1_LDNF1_1(bhs, MO_16)
6462 DO_LDFF1_LDNF1_1(bsu, MO_32)
6463 DO_LDFF1_LDNF1_1(bss, MO_32)
6464 DO_LDFF1_LDNF1_1(bdu, MO_64)
6465 DO_LDFF1_LDNF1_1(bds, MO_64)
6466
6467 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6468 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6469 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6470 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6471 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6472
6473 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6474 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6475 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6476
6477 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6478
6479 #undef DO_LDFF1_LDNF1_1
6480 #undef DO_LDFF1_LDNF1_2
6481
6482 /*
6483 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6484 */
6485
6486 static inline QEMU_ALWAYS_INLINE
6487 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6488 uint32_t desc, const uintptr_t retaddr,
6489 const int esz, const int msz, const int N, uint32_t mtedesc,
6490 sve_ldst1_host_fn *host_fn,
6491 sve_ldst1_tlb_fn *tlb_fn)
6492 {
6493 const unsigned rd = simd_data(desc);
6494 const intptr_t reg_max = simd_oprsz(desc);
6495 intptr_t reg_off, reg_last, mem_off;
6496 SVEContLdSt info;
6497 void *host;
6498 int i, flags;
6499
6500 /* Find the active elements. */
6501 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6502 /* The entire predicate was false; no store occurs. */
6503 return;
6504 }
6505
6506 /* Probe the page(s). Exit with exception for any invalid page. */
6507 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6508
6509 /* Handle watchpoints for all active elements. */
6510 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6511 BP_MEM_WRITE, retaddr);
6512
6513 /*
6514 * Handle mte checks for all active elements.
6515 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6516 */
6517 if (mtedesc) {
6518 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6519 mtedesc, retaddr);
6520 }
6521
6522 flags = info.page[0].flags | info.page[1].flags;
6523 if (unlikely(flags != 0)) {
6524 /*
6525 * At least one page includes MMIO.
6526 * Any bus operation can fail with cpu_transaction_failed,
6527 * which for ARM will raise SyncExternal. We cannot avoid
6528 * this fault and will leave with the store incomplete.
6529 */
6530 mem_off = info.mem_off_first[0];
6531 reg_off = info.reg_off_first[0];
6532 reg_last = info.reg_off_last[1];
6533 if (reg_last < 0) {
6534 reg_last = info.reg_off_split;
6535 if (reg_last < 0) {
6536 reg_last = info.reg_off_last[0];
6537 }
6538 }
6539
6540 do {
6541 uint64_t pg = vg[reg_off >> 6];
6542 do {
6543 if ((pg >> (reg_off & 63)) & 1) {
6544 for (i = 0; i < N; ++i) {
6545 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6546 addr + mem_off + (i << msz), retaddr);
6547 }
6548 }
6549 reg_off += 1 << esz;
6550 mem_off += N << msz;
6551 } while (reg_off & 63);
6552 } while (reg_off <= reg_last);
6553 return;
6554 }
6555
6556 mem_off = info.mem_off_first[0];
6557 reg_off = info.reg_off_first[0];
6558 reg_last = info.reg_off_last[0];
6559 host = info.page[0].host;
6560
6561 set_helper_retaddr(retaddr);
6562
6563 while (reg_off <= reg_last) {
6564 uint64_t pg = vg[reg_off >> 6];
6565 do {
6566 if ((pg >> (reg_off & 63)) & 1) {
6567 for (i = 0; i < N; ++i) {
6568 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6569 host + mem_off + (i << msz));
6570 }
6571 }
6572 reg_off += 1 << esz;
6573 mem_off += N << msz;
6574 } while (reg_off <= reg_last && (reg_off & 63));
6575 }
6576
6577 clear_helper_retaddr();
6578
6579 /*
6580 * Use the slow path to manage the cross-page misalignment.
6581 * But we know this is RAM and cannot trap.
6582 */
6583 mem_off = info.mem_off_split;
6584 if (unlikely(mem_off >= 0)) {
6585 reg_off = info.reg_off_split;
6586 for (i = 0; i < N; ++i) {
6587 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6588 addr + mem_off + (i << msz), retaddr);
6589 }
6590 }
6591
6592 mem_off = info.mem_off_first[1];
6593 if (unlikely(mem_off >= 0)) {
6594 reg_off = info.reg_off_first[1];
6595 reg_last = info.reg_off_last[1];
6596 host = info.page[1].host;
6597
6598 set_helper_retaddr(retaddr);
6599
6600 do {
6601 uint64_t pg = vg[reg_off >> 6];
6602 do {
6603 if ((pg >> (reg_off & 63)) & 1) {
6604 for (i = 0; i < N; ++i) {
6605 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6606 host + mem_off + (i << msz));
6607 }
6608 }
6609 reg_off += 1 << esz;
6610 mem_off += N << msz;
6611 } while (reg_off & 63);
6612 } while (reg_off <= reg_last);
6613
6614 clear_helper_retaddr();
6615 }
6616 }
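/*
 * Illustrative note (added, not part of the original source): the predicate
 * indexing idiom used above and in the helpers below relies on SVE
 * predicates having one bit per vector byte.  With reg_off as the byte
 * offset of an element within the vector,
 *
 *     uint64_t pg = vg[reg_off >> 6];             // 64 bits cover 64 bytes
 *     bool active = (pg >> (reg_off & 63)) & 1;   // bit for the element's
 *                                                 // first byte
 *
 * so, for example, a 4-byte element at byte offset 100 is governed by
 * bit 36 of vg[1].  The inner do/while loops advance reg_off by the element
 * size and stop at each 64-byte boundary so that the next predicate word
 * can be fetched.
 */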
6617
6618 static inline QEMU_ALWAYS_INLINE
6619 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6620 uint32_t desc, const uintptr_t ra,
6621 const int esz, const int msz, const int N,
6622 sve_ldst1_host_fn *host_fn,
6623 sve_ldst1_tlb_fn *tlb_fn)
6624 {
6625 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6626 int bit55 = extract64(addr, 55, 1);
6627
6628 /* Remove mtedesc from the normal sve descriptor. */
6629 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6630
6631 /* Perform gross MTE suppression early. */
6632 if (!tbi_check(mtedesc, bit55) ||
6633 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6634 mtedesc = 0;
6635 }
6636
6637 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6638 }
6639
6640 #define DO_STN_1(N, NAME, ESZ) \
6641 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6642 target_ulong addr, uint32_t desc) \
6643 { \
6644 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6645 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6646 } \
6647 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6648 target_ulong addr, uint32_t desc) \
6649 { \
6650 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6651 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6652 }
6653
6654 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6655 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6656 target_ulong addr, uint32_t desc) \
6657 { \
6658 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6659 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6660 } \
6661 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6662 target_ulong addr, uint32_t desc) \
6663 { \
6664 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6665 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6666 } \
6667 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6668 target_ulong addr, uint32_t desc) \
6669 { \
6670 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6671 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6672 } \
6673 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6674 target_ulong addr, uint32_t desc) \
6675 { \
6676 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6677 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6678 }
6679
6680 DO_STN_1(1, bb, MO_8)
6681 DO_STN_1(1, bh, MO_16)
6682 DO_STN_1(1, bs, MO_32)
6683 DO_STN_1(1, bd, MO_64)
6684 DO_STN_1(2, bb, MO_8)
6685 DO_STN_1(3, bb, MO_8)
6686 DO_STN_1(4, bb, MO_8)
6687
6688 DO_STN_2(1, hh, MO_16, MO_16)
6689 DO_STN_2(1, hs, MO_32, MO_16)
6690 DO_STN_2(1, hd, MO_64, MO_16)
6691 DO_STN_2(2, hh, MO_16, MO_16)
6692 DO_STN_2(3, hh, MO_16, MO_16)
6693 DO_STN_2(4, hh, MO_16, MO_16)
6694
6695 DO_STN_2(1, ss, MO_32, MO_32)
6696 DO_STN_2(1, sd, MO_64, MO_32)
6697 DO_STN_2(2, ss, MO_32, MO_32)
6698 DO_STN_2(3, ss, MO_32, MO_32)
6699 DO_STN_2(4, ss, MO_32, MO_32)
6700
6701 DO_STN_2(1, dd, MO_64, MO_64)
6702 DO_STN_2(2, dd, MO_64, MO_64)
6703 DO_STN_2(3, dd, MO_64, MO_64)
6704 DO_STN_2(4, dd, MO_64, MO_64)
6705
6706 #undef DO_STN_1
6707 #undef DO_STN_2
6708
6709 /*
6710 * Loads with a vector index.
6711 */
6712
6713 /*
6714 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6715 */
6716 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6717
6718 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6719 {
6720 return *(uint32_t *)(reg + H1_4(reg_ofs));
6721 }
6722
6723 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6724 {
6725 return *(int32_t *)(reg + H1_4(reg_ofs));
6726 }
6727
6728 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6729 {
6730 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6731 }
6732
6733 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6734 {
6735 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6736 }
6737
6738 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6739 {
6740 return *(uint64_t *)(reg + reg_ofs);
6741 }
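/*
 * Illustrative examples (added for clarity, not part of the original
 * source): the extractors above differ only in how the per-element offset
 * is widened to target_ulong.  For a 32-bit element holding 0xffffffff:
 *     off_zsu_s -> 0xffffffff        (zero-extended; base + 4GiB - 1)
 *     off_zss_s -> (target_ulong)-1  (sign-extended; effectively base - 1)
 * and for a 64-bit element holding 0x100000002:
 *     off_zsu_d -> 2                 (only the low 32 bits are used)
 *     off_zd_d  -> 0x100000002       (full 64-bit offset)
 */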
6742
6743 static inline QEMU_ALWAYS_INLINE
6744 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6745 target_ulong base, uint32_t desc, uintptr_t retaddr,
6746 uint32_t mtedesc, int esize, int msize,
6747 zreg_off_fn *off_fn,
6748 sve_ldst1_host_fn *host_fn,
6749 sve_ldst1_tlb_fn *tlb_fn)
6750 {
6751 const int mmu_idx = arm_env_mmu_index(env);
6752 const intptr_t reg_max = simd_oprsz(desc);
6753 const int scale = simd_data(desc);
6754 ARMVectorReg scratch;
6755 intptr_t reg_off;
6756 SVEHostPage info, info2;
6757
6758 memset(&scratch, 0, reg_max);
6759 reg_off = 0;
6760 do {
6761 uint64_t pg = vg[reg_off >> 6];
6762 do {
6763 if (likely(pg & 1)) {
6764 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6765 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6766
6767 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6768 mmu_idx, retaddr);
6769
6770 if (likely(in_page >= msize)) {
6771 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6772 cpu_check_watchpoint(env_cpu(env), addr, msize,
6773 info.attrs, BP_MEM_READ, retaddr);
6774 }
6775 if (mtedesc && info.tagged) {
6776 mte_check(env, mtedesc, addr, retaddr);
6777 }
6778 if (unlikely(info.flags & TLB_MMIO)) {
6779 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6780 } else {
6781 set_helper_retaddr(retaddr);
6782 host_fn(&scratch, reg_off, info.host);
6783 clear_helper_retaddr();
6784 }
6785 } else {
6786 /* Element crosses the page boundary. */
6787 sve_probe_page(&info2, false, env, addr + in_page, 0,
6788 MMU_DATA_LOAD, mmu_idx, retaddr);
6789 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6790 cpu_check_watchpoint(env_cpu(env), addr,
6791 msize, info.attrs,
6792 BP_MEM_READ, retaddr);
6793 }
6794 if (mtedesc && info.tagged) {
6795 mte_check(env, mtedesc, addr, retaddr);
6796 }
6797 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6798 }
6799 }
6800 reg_off += esize;
6801 pg >>= esize;
6802 } while (reg_off & 63);
6803 } while (reg_off < reg_max);
6804
6805 /* Wait until all exceptions have been raised to write back. */
6806 memcpy(vd, &scratch, reg_max);
6807 }
6808
6809 static inline QEMU_ALWAYS_INLINE
6810 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6811 target_ulong base, uint32_t desc, uintptr_t retaddr,
6812 int esize, int msize, zreg_off_fn *off_fn,
6813 sve_ldst1_host_fn *host_fn,
6814 sve_ldst1_tlb_fn *tlb_fn)
6815 {
6816 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6817 /* Remove mtedesc from the normal sve descriptor. */
6818 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6819
6820 /*
6821 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6822 * offset base entirely over the address space hole to change the
6823 * pointer tag, or change the bit55 selector. So we could here
6824 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6825 */
6826 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6827 esize, msize, off_fn, host_fn, tlb_fn);
6828 }
6829
6830 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6831 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6832 void *vm, target_ulong base, uint32_t desc) \
6833 { \
6834 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6835 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6836 } \
6837 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6838 void *vm, target_ulong base, uint32_t desc) \
6839 { \
6840 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6841 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6842 }
6843
6844 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6845 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6846 void *vm, target_ulong base, uint32_t desc) \
6847 { \
6848 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6849 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6850 } \
6851 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6852 void *vm, target_ulong base, uint32_t desc) \
6853 { \
6854 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6855 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6856 }
6857
6858 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6859 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6860 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6861 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6862 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6863
6864 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6865 DO_LD1_ZPZ_S(bss, zss, MO_8)
6866 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6867 DO_LD1_ZPZ_D(bds, zss, MO_8)
6868 DO_LD1_ZPZ_D(bds, zd, MO_8)
6869
6870 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6871 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6872 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6873 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6874 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6875
6876 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6877 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6878 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6879 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6880 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6881
6882 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6883 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6884 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6885 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6886 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6887
6888 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6889 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6890 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6891 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6892 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6893
6894 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6895 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6896 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6897 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6898 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6899
6900 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6901 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6902 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6903 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6904 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6905
6906 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6907 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6908 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6909
6910 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6911 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6912 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6913
6914 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6915 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6916 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6917
6918 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6919 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6920 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6921
6922 #undef DO_LD1_ZPZ_S
6923 #undef DO_LD1_ZPZ_D
6924
6925 /* First fault loads with a vector index. */
6926
6927 /*
6928 * Common helpers for all gather first-faulting loads.
6929 */
6930
6931 static inline QEMU_ALWAYS_INLINE
6932 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6933 target_ulong base, uint32_t desc, uintptr_t retaddr,
6934 uint32_t mtedesc, const int esz, const int msz,
6935 zreg_off_fn *off_fn,
6936 sve_ldst1_host_fn *host_fn,
6937 sve_ldst1_tlb_fn *tlb_fn)
6938 {
6939 const int mmu_idx = arm_env_mmu_index(env);
6940 const intptr_t reg_max = simd_oprsz(desc);
6941 const int scale = simd_data(desc);
6942 const int esize = 1 << esz;
6943 const int msize = 1 << msz;
6944 intptr_t reg_off;
6945 SVEHostPage info;
6946 target_ulong addr, in_page;
6947 ARMVectorReg scratch;
6948
6949 /* Skip to the first true predicate. */
6950 reg_off = find_next_active(vg, 0, reg_max, esz);
6951 if (unlikely(reg_off >= reg_max)) {
6952 /* The entire predicate was false; no load occurs. */
6953 memset(vd, 0, reg_max);
6954 return;
6955 }
6956
6957 /* Protect against overlap between vd and vm. */
6958 if (unlikely(vd == vm)) {
6959 vm = memcpy(&scratch, vm, reg_max);
6960 }
6961
6962 /*
6963 * Probe the first element, allowing faults.
6964 */
6965 addr = base + (off_fn(vm, reg_off) << scale);
6966 if (mtedesc) {
6967 mte_check(env, mtedesc, addr, retaddr);
6968 }
6969 tlb_fn(env, vd, reg_off, addr, retaddr);
6970
6971 /* After any fault, zero the other elements. */
6972 swap_memzero(vd, reg_off);
6973 reg_off += esize;
6974 swap_memzero(vd + reg_off, reg_max - reg_off);
6975
6976 /*
6977 * Probe the remaining elements, not allowing faults.
6978 */
6979 while (reg_off < reg_max) {
6980 uint64_t pg = vg[reg_off >> 6];
6981 do {
6982 if (likely((pg >> (reg_off & 63)) & 1)) {
6983 addr = base + (off_fn(vm, reg_off) << scale);
6984 in_page = -(addr | TARGET_PAGE_MASK);
6985
6986 if (unlikely(in_page < msize)) {
6987 /* Stop if the element crosses a page boundary. */
6988 goto fault;
6989 }
6990
6991 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6992 mmu_idx, retaddr);
6993 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6994 goto fault;
6995 }
6996 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6997 (cpu_watchpoint_address_matches
6998 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6999 goto fault;
7000 }
7001 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
7002 goto fault;
7003 }
7004
7005 set_helper_retaddr(retaddr);
7006 host_fn(vd, reg_off, info.host);
7007 clear_helper_retaddr();
7008 }
7009 reg_off += esize;
7010 } while (reg_off & 63);
7011 }
7012 return;
7013
7014 fault:
7015 record_fault(env, reg_off, reg_max);
7016 }
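/*
 * Sketch of the first-fault behaviour implemented above (informational,
 * not part of the original source): only the first active element is
 * loaded with faults enabled.  Every later active element is probed with
 * nonfault = true, and if the probe reports an invalid page or MMIO, a
 * read watchpoint matches, or the MTE probe fails, the loop stops and
 * record_fault() is expected to mark that element and all later ones as
 * not loaded via FFR, while the elements already written to vd are kept.
 */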
7017
7018 static inline QEMU_ALWAYS_INLINE
7019 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7020 target_ulong base, uint32_t desc, uintptr_t retaddr,
7021 const int esz, const int msz,
7022 zreg_off_fn *off_fn,
7023 sve_ldst1_host_fn *host_fn,
7024 sve_ldst1_tlb_fn *tlb_fn)
7025 {
7026 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7027 /* Remove mtedesc from the normal sve descriptor. */
7028 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7029
7030 /*
7031 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7032 * offset base entirely over the address space hole to change the
7033 * pointer tag, or change the bit55 selector. So we could here
7034 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7035 */
7036 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7037 esz, msz, off_fn, host_fn, tlb_fn);
7038 }
7039
7040 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
7041 void HELPER(sve_ldff##MEM##_##OFS) \
7042 (CPUARMState *env, void *vd, void *vg, \
7043 void *vm, target_ulong base, uint32_t desc) \
7044 { \
7045 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
7046 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7047 } \
7048 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7049 (CPUARMState *env, void *vd, void *vg, \
7050 void *vm, target_ulong base, uint32_t desc) \
7051 { \
7052 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
7053 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7054 }
7055
7056 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
7057 void HELPER(sve_ldff##MEM##_##OFS) \
7058 (CPUARMState *env, void *vd, void *vg, \
7059 void *vm, target_ulong base, uint32_t desc) \
7060 { \
7061 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
7062 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7063 } \
7064 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7065 (CPUARMState *env, void *vd, void *vg, \
7066 void *vm, target_ulong base, uint32_t desc) \
7067 { \
7068 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
7069 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7070 }
7071
7072 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7073 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7074 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7075 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7076 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7077
7078 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7079 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7080 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7081 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7082 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7083
7084 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7085 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7086 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7087 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7088 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7089
7090 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7091 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7092 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7093 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7094 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7095
7096 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7097 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7098 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7099 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7100 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7101
7102 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7103 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7104 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7105 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7106 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7107
7108 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
7109 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
7110 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7111 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7112 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7113
7114 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
7115 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
7116 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7117 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7118 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7119
7120 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7121 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7122 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7123
7124 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7125 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7126 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7127
7128 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7129 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7130 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7131
7132 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7133 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7134 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7135
7136 /* Stores with a vector index. */
7137
7138 static inline QEMU_ALWAYS_INLINE
7139 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7140 target_ulong base, uint32_t desc, uintptr_t retaddr,
7141 uint32_t mtedesc, int esize, int msize,
7142 zreg_off_fn *off_fn,
7143 sve_ldst1_host_fn *host_fn,
7144 sve_ldst1_tlb_fn *tlb_fn)
7145 {
7146 const int mmu_idx = arm_env_mmu_index(env);
7147 const intptr_t reg_max = simd_oprsz(desc);
7148 const int scale = simd_data(desc);
7149 void *host[ARM_MAX_VQ * 4];
7150 intptr_t reg_off, i;
7151 SVEHostPage info, info2;
7152
7153 /*
7154 * Probe all of the elements for host addresses and flags.
7155 */
7156 i = reg_off = 0;
7157 do {
7158 uint64_t pg = vg[reg_off >> 6];
7159 do {
7160 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7161 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7162
7163 host[i] = NULL;
7164 if (likely((pg >> (reg_off & 63)) & 1)) {
7165 if (likely(in_page >= msize)) {
7166 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7167 mmu_idx, retaddr);
7168 if (!(info.flags & TLB_MMIO)) {
7169 host[i] = info.host;
7170 }
7171 } else {
7172 /*
7173 * Element crosses the page boundary.
7174 * Probe both pages, but do not record the host address,
7175 * so that we use the slow path.
7176 */
7177 sve_probe_page(&info, false, env, addr, 0,
7178 MMU_DATA_STORE, mmu_idx, retaddr);
7179 sve_probe_page(&info2, false, env, addr + in_page, 0,
7180 MMU_DATA_STORE, mmu_idx, retaddr);
7181 info.flags |= info2.flags;
7182 }
7183
7184 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7185 cpu_check_watchpoint(env_cpu(env), addr, msize,
7186 info.attrs, BP_MEM_WRITE, retaddr);
7187 }
7188
7189 if (mtedesc && info.tagged) {
7190 mte_check(env, mtedesc, addr, retaddr);
7191 }
7192 }
7193 i += 1;
7194 reg_off += esize;
7195 } while (reg_off & 63);
7196 } while (reg_off < reg_max);
7197
7198 /*
7199 * Now that we have recognized all exceptions except SyncExternal
7200 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7201 *
7202 * Note for the common case of an element in RAM, not crossing a page
7203 * boundary, we have stored the host address in host[]. This doubles
7204 * as a first-level check against the predicate, since only enabled
7205 * elements have non-null host addresses.
7206 */
7207 i = reg_off = 0;
7208 do {
7209 void *h = host[i];
7210 if (likely(h != NULL)) {
7211 set_helper_retaddr(retaddr);
7212 host_fn(vd, reg_off, h);
7213 clear_helper_retaddr();
7214 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7215 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7216 tlb_fn(env, vd, reg_off, addr, retaddr);
7217 }
7218 i += 1;
7219 reg_off += esize;
7220 } while (reg_off < reg_max);
7221 }
7222
7223 static inline QEMU_ALWAYS_INLINE
7224 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7225 target_ulong base, uint32_t desc, uintptr_t retaddr,
7226 int esize, int msize, zreg_off_fn *off_fn,
7227 sve_ldst1_host_fn *host_fn,
7228 sve_ldst1_tlb_fn *tlb_fn)
7229 {
7230 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7231 /* Remove mtedesc from the normal sve descriptor. */
7232 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7233
7234 /*
7235 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7236 * offset base entirely over the address space hole to change the
7237 * pointer tag, or change the bit55 selector. So we could here
7238 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7239 */
7240 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7241 esize, msize, off_fn, host_fn, tlb_fn);
7242 }
7243
7244 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
7245 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7246 void *vm, target_ulong base, uint32_t desc) \
7247 { \
7248 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7249 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7250 } \
7251 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7252 void *vm, target_ulong base, uint32_t desc) \
7253 { \
7254 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7255 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7256 }
7257
7258 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
7259 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7260 void *vm, target_ulong base, uint32_t desc) \
7261 { \
7262 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7263 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7264 } \
7265 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7266 void *vm, target_ulong base, uint32_t desc) \
7267 { \
7268 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7269 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7270 }
7271
7272 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7273 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7274 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7275 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7276 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7277
7278 DO_ST1_ZPZ_S(bs, zss, MO_8)
7279 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7280 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7281 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7282 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7283
7284 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7285 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7286 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7287 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7288 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7289 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7290 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7291
7292 DO_ST1_ZPZ_D(bd, zss, MO_8)
7293 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7294 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7295 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7296 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7297 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7298 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7299
7300 DO_ST1_ZPZ_D(bd, zd, MO_8)
7301 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7302 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7303 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7304 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7305 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7306 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7307
7308 #undef DO_ST1_ZPZ_S
7309 #undef DO_ST1_ZPZ_D
7310
7311 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7312 {
7313 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7314 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7315
7316 for (i = 0; i < opr_sz; ++i) {
7317 d[i] = n[i] ^ m[i] ^ k[i];
7318 }
7319 }
7320
7321 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7322 {
7323 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7324 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7325
7326 for (i = 0; i < opr_sz; ++i) {
7327 d[i] = n[i] ^ (m[i] & ~k[i]);
7328 }
7329 }
7330
7331 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7332 {
7333 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7334 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7335
7336 for (i = 0; i < opr_sz; ++i) {
7337 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7338 }
7339 }
7340
7341 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7342 {
7343 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7344 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7345
7346 for (i = 0; i < opr_sz; ++i) {
7347 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7348 }
7349 }
7350
7351 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7352 {
7353 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7354 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7355
7356 for (i = 0; i < opr_sz; ++i) {
7357 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7358 }
7359 }
7360
7361 /*
7362 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7363 * See hasless(v,1) from
7364 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7365 */
7366 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7367 {
7368 int bits = 8 << esz;
7369 uint64_t ones = dup_const(esz, 1);
7370 uint64_t signs = ones << (bits - 1);
7371 uint64_t cmp0, cmp1;
7372
7373 cmp1 = dup_const(esz, n);
7374 cmp0 = cmp1 ^ m0;
7375 cmp1 = cmp1 ^ m1;
7376 cmp0 = (cmp0 - ones) & ~cmp0;
7377 cmp1 = (cmp1 - ones) & ~cmp1;
7378 return (cmp0 | cmp1) & signs;
7379 }
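/*
 * Worked example of the zero-in-word trick above (added for illustration),
 * for esz = MO_8, looking at one byte lane in isolation:
 *     matching lane:   n = 0x42, m0 lane = 0x42  =>  cmp0 lane = 0x00
 *                      (0x00 - 0x01) & ~0x00 = 0xff       sign bit set
 *     differing lane:  n = 0x42, m0 lane = 0x41  =>  cmp0 lane = 0x03
 *                      (0x03 - 0x01) & ~0x03 = 0x00       sign bit clear
 * Masking (cmp0 | cmp1) with `signs` is therefore non-zero exactly when
 * some lane of m0 or m1 equals n.
 */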
7380
7381 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7382 uint32_t desc, int esz, bool nmatch)
7383 {
7384 uint16_t esz_mask = pred_esz_masks[esz];
7385 intptr_t opr_sz = simd_oprsz(desc);
7386 uint32_t flags = PREDTEST_INIT;
7387 intptr_t i, j, k;
7388
7389 for (i = 0; i < opr_sz; i += 16) {
7390 uint64_t m0 = *(uint64_t *)(vm + i);
7391 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7392 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7393 uint16_t out = 0;
7394
7395 for (j = 0; j < 16; j += 8) {
7396 uint64_t n = *(uint64_t *)(vn + i + j);
7397
7398 for (k = 0; k < 8; k += 1 << esz) {
7399 if (pg & (1 << (j + k))) {
7400 bool o = do_match2(n >> (k * 8), m0, m1, esz);
7401 out |= (o ^ nmatch) << (j + k);
7402 }
7403 }
7404 }
7405 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7406 flags = iter_predtest_fwd(out, pg, flags);
7407 }
7408 return flags;
7409 }
7410
7411 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \
7412 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
7413 { \
7414 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
7415 }
7416
7417 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7418 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7419
7420 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7421 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7422
7423 #undef DO_PPZZ_MATCH
7424
7425 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7426 uint32_t desc)
7427 {
7428 ARMVectorReg scratch;
7429 intptr_t i, j;
7430 intptr_t opr_sz = simd_oprsz(desc);
7431 uint32_t *d = vd, *n = vn, *m = vm;
7432 uint8_t *pg = vg;
7433
7434 if (d == n) {
7435 n = memcpy(&scratch, n, opr_sz);
7436 if (d == m) {
7437 m = n;
7438 }
7439 } else if (d == m) {
7440 m = memcpy(&scratch, m, opr_sz);
7441 }
7442
7443 for (i = 0; i < opr_sz; i += 4) {
7444 uint64_t count = 0;
7445 uint8_t pred;
7446
7447 pred = pg[H1(i >> 3)] >> (i & 7);
7448 if (pred & 1) {
7449 uint32_t nn = n[H4(i >> 2)];
7450
7451 for (j = 0; j <= i; j += 4) {
7452 pred = pg[H1(j >> 3)] >> (j & 7);
7453 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7454 ++count;
7455 }
7456 }
7457 }
7458 d[H4(i >> 2)] = count;
7459 }
7460 }
7461
7462 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7463 uint32_t desc)
7464 {
7465 ARMVectorReg scratch;
7466 intptr_t i, j;
7467 intptr_t opr_sz = simd_oprsz(desc);
7468 uint64_t *d = vd, *n = vn, *m = vm;
7469 uint8_t *pg = vg;
7470
7471 if (d == n) {
7472 n = memcpy(&scratch, n, opr_sz);
7473 if (d == m) {
7474 m = n;
7475 }
7476 } else if (d == m) {
7477 m = memcpy(&scratch, m, opr_sz);
7478 }
7479
7480 for (i = 0; i < opr_sz / 8; ++i) {
7481 uint64_t count = 0;
7482 if (pg[H1(i)] & 1) {
7483 uint64_t nn = n[i];
7484 for (j = 0; j <= i; ++j) {
7485 if ((pg[H1(j)] & 1) && nn == m[j]) {
7486 ++count;
7487 }
7488 }
7489 }
7490 d[i] = count;
7491 }
7492 }
7493
7494 /*
7495 * Returns the number of bytes in m0 and m1 that match n.
7496 * Unlike do_match2 we don't just need true/false, we need an exact count.
7497 * This requires two extra logical operations.
7498 */
7499 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7500 {
7501 const uint64_t mask = dup_const(MO_8, 0x7f);
7502 uint64_t cmp0, cmp1;
7503
7504 cmp1 = dup_const(MO_8, n);
7505 cmp0 = cmp1 ^ m0;
7506 cmp1 = cmp1 ^ m1;
7507
7508 /*
7509 * 1: clear msb of each byte to avoid carry to next byte (& mask)
7510 * 2: carry in to msb if byte != 0 (+ mask)
7511 * 3: set msb if cmp has msb set (| cmp)
7512 * 4: set ~msb to ignore them (| mask)
7513 * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7514 * 5: invert, resulting in 0x80 if and only if byte == 0.
7515 */
7516 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7517 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7518
7519 /*
7520 * Combine the two compares in a way that the bits do
7521 * not overlap, and so preserves the count of set bits.
7522 * If the host has an efficient instruction for ctpop,
7523 * then ctpop(x) + ctpop(y) has the same number of
7524 * operations as ctpop(x | (y >> 1)). If the host does
7525 * not have an efficient ctpop, then we only want to
7526 * use it once.
7527 */
7528 return ctpop64(cmp0 | (cmp1 >> 1));
7529 }
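/*
 * Worked example (added for illustration), one byte lane in isolation.
 * A matching lane has cmp == 0x00:
 *     steps 1-2:  (0x00 & 0x7f) + 0x7f = 0x7f   (no carry into the msb)
 *     steps 3-4:  0x7f | 0x00 | 0x7f  = 0x7f
 *     step 5:     inverted within the lane      = 0x80
 * A non-matching lane, e.g. cmp == 0x03, gives (0x03 & 0x7f) + 0x7f = 0x82,
 * then 0x82 | 0x03 | 0x7f = 0xff, inverted to 0x00.  Each lane of cmp0 thus
 * contributes bit 7 and each lane of cmp1 >> 1 contributes bit 6, so the
 * single ctpop64 counts the matches from both m0 and m1 without overlap.
 */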
7530
7531 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7532 {
7533 intptr_t i, j;
7534 intptr_t opr_sz = simd_oprsz(desc);
7535
7536 for (i = 0; i < opr_sz; i += 16) {
7537 uint64_t n0 = *(uint64_t *)(vn + i);
7538 uint64_t m0 = *(uint64_t *)(vm + i);
7539 uint64_t n1 = *(uint64_t *)(vn + i + 8);
7540 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7541 uint64_t out0 = 0;
7542 uint64_t out1 = 0;
7543
7544 for (j = 0; j < 64; j += 8) {
7545 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7546 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7547 out0 |= cnt0 << j;
7548 out1 |= cnt1 << j;
7549 }
7550
7551 *(uint64_t *)(vd + i) = out0;
7552 *(uint64_t *)(vd + i + 8) = out1;
7553 }
7554 }
7555
7556 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7557 {
7558 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7559 int shr = simd_data(desc);
7560 int shl = 8 - shr;
7561 uint64_t mask = dup_const(MO_8, 0xff >> shr);
7562 uint64_t *d = vd, *n = vn, *m = vm;
7563
7564 for (i = 0; i < opr_sz; ++i) {
7565 uint64_t t = n[i] ^ m[i];
7566 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7567 }
7568 }
7569
7570 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7571 {
7572 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7573 int shr = simd_data(desc);
7574 int shl = 16 - shr;
7575 uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7576 uint64_t *d = vd, *n = vn, *m = vm;
7577
7578 for (i = 0; i < opr_sz; ++i) {
7579 uint64_t t = n[i] ^ m[i];
7580 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7581 }
7582 }
7583
7584 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7585 {
7586 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7587 int shr = simd_data(desc);
7588 uint32_t *d = vd, *n = vn, *m = vm;
7589
7590 for (i = 0; i < opr_sz; ++i) {
7591 d[i] = ror32(n[i] ^ m[i], shr);
7592 }
7593 }
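/*
 * Illustrative note (added, not part of the original source): XAR is an
 * XOR followed by a rotate right within each element.  For 32-bit lanes
 * ror32() applies directly; for the 8- and 16-bit helpers above the rotate
 * is synthesised with shifts over the whole 64-bit word and `mask` keeps
 * bits from crossing lane boundaries.  E.g. with 8-bit lanes and shr = 3,
 * mask = dup_const(MO_8, 0xff >> 3) = 0x1f1f...1f, so (t >> 3) & mask keeps
 * only bits that stayed within their own byte, while (t << 5) & ~mask
 * supplies the wrapped-around low bits.
 */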
7594
7595 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7596 float_status *status, uint32_t desc)
7597 {
7598 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7599
7600 for (s = 0; s < opr_sz; ++s) {
7601 float32 *n = vn + s * sizeof(float32) * 4;
7602 float32 *m = vm + s * sizeof(float32) * 4;
7603 float32 *a = va + s * sizeof(float32) * 4;
7604 float32 *d = vd + s * sizeof(float32) * 4;
7605 float32 n00 = n[H4(0)], n01 = n[H4(1)];
7606 float32 n10 = n[H4(2)], n11 = n[H4(3)];
7607 float32 m00 = m[H4(0)], m01 = m[H4(1)];
7608 float32 m10 = m[H4(2)], m11 = m[H4(3)];
7609 float32 p0, p1;
7610
7611 /* i = 0, j = 0 */
7612 p0 = float32_mul(n00, m00, status);
7613 p1 = float32_mul(n01, m01, status);
7614 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7615
7616 /* i = 0, j = 1 */
7617 p0 = float32_mul(n00, m10, status);
7618 p1 = float32_mul(n01, m11, status);
7619 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7620
7621 /* i = 1, j = 0 */
7622 p0 = float32_mul(n10, m00, status);
7623 p1 = float32_mul(n11, m01, status);
7624 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7625
7626 /* i = 1, j = 1 */
7627 p0 = float32_mul(n10, m10, status);
7628 p1 = float32_mul(n11, m11, status);
7629 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7630 }
7631 }
7632
7633 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7634 float_status *status, uint32_t desc)
7635 {
7636 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7637
7638 for (s = 0; s < opr_sz; ++s) {
7639 float64 *n = vn + s * sizeof(float64) * 4;
7640 float64 *m = vm + s * sizeof(float64) * 4;
7641 float64 *a = va + s * sizeof(float64) * 4;
7642 float64 *d = vd + s * sizeof(float64) * 4;
7643 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7644 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7645 float64 p0, p1;
7646
7647 /* i = 0, j = 0 */
7648 p0 = float64_mul(n00, m00, status);
7649 p1 = float64_mul(n01, m01, status);
7650 d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7651
7652 /* i = 0, j = 1 */
7653 p0 = float64_mul(n00, m10, status);
7654 p1 = float64_mul(n01, m11, status);
7655 d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7656
7657 /* i = 1, j = 0 */
7658 p0 = float64_mul(n10, m00, status);
7659 p1 = float64_mul(n11, m01, status);
7660 d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7661
7662 /* i = 1, j = 1 */
7663 p0 = float64_mul(n10, m10, status);
7664 p1 = float64_mul(n11, m11, status);
7665 d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7666 }
7667 }
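/*
 * Informational sketch of the computation above (added for clarity): each
 * 128-bit (fmmla_s) or 256-bit (fmmla_d) segment is treated as a row-major
 * 2x2 matrix, and the helper accumulates
 *
 *     D = A + N * transpose(M)
 *
 * i.e. d[i][j] = a[i][j] + n[i][0] * m[j][0] + n[i][1] * m[j][1], which is
 * exactly what the four (i, j) cases spell out element by element.
 */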
7668
7669 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7670 void HELPER(NAME)(void *vd, void *vn, void *vg, \
7671 float_status *status, uint32_t desc) \
7672 { \
7673 intptr_t i = simd_oprsz(desc); \
7674 uint64_t *g = vg; \
7675 do { \
7676 uint64_t pg = g[(i - 1) >> 6]; \
7677 do { \
7678 i -= sizeof(TYPEW); \
7679 if (likely((pg >> (i & 63)) & 1)) { \
7680 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
7681 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
7682 } \
7683 } while (i & 63); \
7684 } while (i != 0); \
7685 }
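/*
 * Illustrative note (added, not part of the original source): FCVTNT
 * narrows each predicated wide element and stores the result in the top
 * narrow half of the corresponding destination slot, hence the store to
 * vd + HN(i + sizeof(TYPEN)) above.  The bottom narrow halves of vd are
 * never written by this helper, so they keep their previous contents,
 * as do the slots of inactive elements.
 */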
7686
7687 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7688 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7689 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7690
7691 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7692 void HELPER(NAME)(void *vd, void *vn, void *vg, \
7693 float_status *status, uint32_t desc) \
7694 { \
7695 intptr_t i = simd_oprsz(desc); \
7696 uint64_t *g = vg; \
7697 do { \
7698 uint64_t pg = g[(i - 1) >> 6]; \
7699 do { \
7700 i -= sizeof(TYPEW); \
7701 if (likely((pg >> (i & 63)) & 1)) { \
7702 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
7703 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
7704 } \
7705 } while (i & 63); \
7706 } while (i != 0); \
7707 }
7708
7709 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7710 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7711
7712 #undef DO_FCVTLT
7713 #undef DO_FCVTNT
7714