1 /*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg.h"
28 #include "vec_internal.h"
29 #include "sve_ldst_internal.h"
30 #include "hw/core/tcg-cpu-ops.h"
31
32
33 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
34 *
35 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
36 * and bit 0 set if C is set. Compare the definitions of these variables
37 * within CPUARMState.
38 */
39
40 /* For no G bits set, NZCV = C. */
41 #define PREDTEST_INIT 1
42
43 /* This is an iterative function, called for each Pd and Pg word
44 * moving forward.
45 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen. */
        if (!(flags & 4)) {
            /* g & -g isolates the lowest set bit of g: the first
               active element of this word. */
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G. */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G). Replace previous.
           pow2floor(g) isolates the highest set bit of g: the last
           active element seen so far; later words overwrite bit 0. */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
64
65 /* This is an iterative function, called for each Pd and Pg word
66 * moving backward.
67 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen. */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            /* pow2floor(g) isolates the highest set bit: walking
               backward, this is the architecturally last element. */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G. */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G. Replace previous.
           g & -g isolates the lowest set bit; later (i.e. earlier)
           words overwrite bit 31. */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}
86
87 /* The same for a single word predicate. */
HELPER(sve_predtest1)88 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
89 {
90 return iter_predtest_fwd(d, g, PREDTEST_INIT);
91 }
92
93 /* The same for a multi-word predicate. */
HELPER(sve_predtest)94 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
95 {
96 uint32_t flags = PREDTEST_INIT;
97 uint64_t *d = vd, *g = vg;
98 uintptr_t i = 0;
99
100 do {
101 flags = iter_predtest_fwd(d[i], g[i], flags);
102 } while (++i < words);
103
104 return flags;
105 }
106
107 /* Similarly for single word elements. */
expand_pred_s(uint8_t byte)108 static inline uint64_t expand_pred_s(uint8_t byte)
109 {
110 static const uint64_t word[] = {
111 [0x01] = 0x00000000ffffffffull,
112 [0x10] = 0xffffffff00000000ull,
113 [0x11] = 0xffffffffffffffffull,
114 };
115 return word[byte & 0x11];
116 }
117
/*
 * Expand a predicate-predicate-predicate logical operation over the
 * whole vector, one 64-bit predicate word at a time.  The governing
 * predicate G is folded into FUNC itself rather than tested per bit.
 */
#define LOGICAL_PPPP(NAME, FUNC)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

/* Each op produces its result only in lanes where G is set;
   inactive lanes are zeroed (masked by G), except SEL which
   merges N (active) with M (inactive). */
#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
156
/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
/*
 * The outer loop walks the vector in 16-byte chunks; PG caches the 16
 * predicate bits for the chunk (one bit per byte of vector).  Both I
 * and PG advance by sizeof(TYPE) so bit 0 of PG always governs the
 * current element.  H maps the byte offset for big-endian hosts and
 * must match sizeof(TYPE): H1 for 1, H1_2 for 2, H1_4 for 4.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}

/* Similarly, specialized for 64-bit operands.  One predicate byte
 * governs each 64-bit element; only bit 0 of that byte is tested. */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i], mm = m[i];                          \
            d[i] = OP(nn, mm);                                  \
        }                                                       \
    }                                                           \
}

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)


/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
218
/* Instantiate the predicated two-source integer helpers for each
   element size; the H macro matches the element width. */
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

/* Signed vs unsigned max/min/abd differ only in the element TYPE. */
DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
278
279 /* Because the computation type is at least twice as large as required,
280 these work for both signed and unsigned source types. */
281 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
282 {
283 return (n * m) >> 8;
284 }
285
do_mulh_h(int32_t n,int32_t m)286 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
287 {
288 return (n * m) >> 16;
289 }
290
static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    /* Full product of two word operands; return the high word. */
    int64_t prod = n * m;

    return prod >> 32;
}
295
/* 64x64 signed multiply: return the high 64 bits of the product. */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

/* 64x64 unsigned multiply: return the high 64 bits of the product. */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}

DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

/* Division exists only for word and doubleword elements. */
DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
330
331 /* Note that all bits of the shift are significant
332 and not modulo the element size. */
333 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
334 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
335 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
336
337 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
338 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
339 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
340
341 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
342 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
343 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
344
345 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
346 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
347 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
348
349 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
350 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
351 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
352
353 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
354 {
355 int8_t n1 = n, n2 = n >> 8;
356 return m + n1 + n2;
357 }
358
do_sadalp_s(int32_t n,int32_t m)359 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
360 {
361 int16_t n1 = n, n2 = n >> 16;
362 return m + n1 + n2;
363 }
364
do_sadalp_d(int64_t n,int64_t m)365 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
366 {
367 int32_t n1 = n, n2 = n >> 32;
368 return m + n1 + n2;
369 }
370
/* SVE2 signed add-and-accumulate-long-pairwise.  N holds the pair;
   the accumulator M is the destination element type. */
DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
374
375 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
376 {
377 uint8_t n1 = n, n2 = n >> 8;
378 return m + n1 + n2;
379 }
380
static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    /* Accumulate the two zero-extended halfword halves of n into m. */
    return m + (n & 0xffff) + (n >> 16);
}
386
static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    /* Accumulate the two zero-extended word halves of n into m. */
    return m + (n & 0xffffffffull) + (n >> 32);
}
392
DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

/* Rounding shifts: third arg true selects rounding; the NULL
   saturation pointer requests no saturation tracking.  The bhs
   helpers take the element width in bits. */
#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

/* The shift amount is signed; narrow it before the unsigned helper. */
#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
416
/*
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
 */
/*
 * NOTE(review): the byte-sized expansions below pass H1_2 where the
 * srshl/urshl expansions above use H1.  On little-endian hosts the H
 * macros are identities so this makes no difference; confirm the
 * intended behavior on big-endian hosts before changing.
 */
#define do_sqshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)

#define do_uqshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)

/* Saturating AND rounding shifts (round = true). */
#define do_sqrshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)

/* The macro shadows the function name; drop it now that the
   expansions are done. */
#undef do_sqrshl_d

#define do_uqrshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)

#undef do_uqrshl_d
482
/* Halving add: widen to 64-bit so the sum cannot overflow. */
#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
/* 64-bit can't widen: average the halves and re-add the carry bit
   that is set only when both low bits are 1. */
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

/* Rounding halving add: +1 before the shift; for 64-bit the rounding
   bit is set when either low bit is 1. */
#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

/* Halving subtract; the 64-bit form borrows when only m's low bit
   is set. */
#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
521
522 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
523 {
524 return val >= max ? max : val <= min ? min : val;
525 }
526
#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)

/* 64-bit signed saturating add. */
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;

    /*
     * Signed overflow occurred iff the operands have the same sign
     * and the result's sign differs from them.
     */
    if ((n ^ m) >= 0 && (r ^ n) < 0) {
        return n < 0 ? INT64_MIN : INT64_MAX;
    }
    return r;
}
540
/* SVE2 signed saturating add, per element size. */
DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
545
#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

/* 64-bit unsigned saturating add. */
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t sum = n + m;

    /* Unsigned wraparound means the true sum exceeded UINT64_MAX. */
    if (sum < n) {
        return UINT64_MAX;
    }
    return sum;
}
555
/* SVE2 unsigned saturating add, per element size. */
DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
560
#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

/* 64-bit signed saturating subtract. */
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;

    /*
     * Signed overflow occurred iff the operands have different signs
     * and the result's sign differs from n's.
     */
    if ((n ^ m) < 0 && (r ^ n) < 0) {
        return n < 0 ? INT64_MIN : INT64_MAX;
    }
    return r;
}
574
/* SVE2 signed saturating subtract, per element size. */
DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
579
580 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
581 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
582 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
583
584 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
585 {
586 return n > m ? n - m : 0;
587 }
588
/* SVE2 unsigned saturating subtract, per element size. */
DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
593
/* SUQADD: signed element N plus unsigned element M, saturated to the
   signed range.  The narrow forms sign-extend N and widen to 64 bits
   so the sum cannot overflow before clamping. */
#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

/* 64-bit form: signed n plus unsigned m, saturating at INT64_MAX.
   The sum is formed in uint64_t, so wraparound is well-defined. */
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow. */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
628
#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

/* 64-bit USQADD: unsigned n plus signed m, saturated to [0, UINT64_MAX]. */
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m >= 0) {
        /* Addition: wraparound means the true sum exceeded UINT64_MAX. */
        return r < n ? UINT64_MAX : r;
    }
    /* Subtraction: saturate to 0 when abs(m) exceeds n. */
    return n < -m ? 0 : r;
}
645
DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

/* End of the predicated two-source integer expanders. */
#undef DO_ZPZZ
#undef DO_ZPZZ_D
653
/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements come from VN {I, I+1}.
 * If the slot I is odd, the elements come from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output,
 * so that VD may alias VN or VM.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            TYPE n0 = *(TYPE *)(vn + H(i));                     \
            TYPE m0 = *(TYPE *)(vm + H(i));                     \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  Elements are handled
 * two at a time: the even slot takes the VN pair, the odd slot the
 * VM pair, each under its own predicate byte. */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP)                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 2) {                           \
        TYPE n0 = n[i], n1 = n[i + 1];                          \
        TYPE m0 = m[i], m1 = m[i + 1];                          \
        if (pg[H1(i)] & 1) {                                    \
            d[i] = OP(n0, n1);                                  \
        }                                                       \
        if (pg[H1(i + 1)] & 1) {                                \
            d[i + 1] = OP(m0, m1);                              \
        }                                                       \
    }                                                           \
}
701
/* SVE2 pairwise integer add / max / min. */
DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
729
/* As DO_ZPZZ_PAIR, but for floating-point pairwise ops; STATUS is
 * passed through to the softfloat operation (presumably a
 * float_status pointer — matches the softfloat ops instantiated
 * below). All four pair inputs are loaded before any store so VD
 * may alias VN or VM. */
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            TYPE n0 = *(TYPE *)(vn + H(i));                     \
            TYPE m0 = *(TYPE *)(vm + H(i));                     \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status);      \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status);      \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP
775
/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide". That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
/* One 64-bit M is loaded per 8-byte chunk (the inner loop covers
 * i & 7), and pg caches the 8 predicate bits for that chunk. */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));             \
        TYPEW mm = *(TYPEW *)(vm + i);                          \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 7);                                        \
    }                                                           \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
810
/* Fully general two-operand expander, controlled by a predicate.
 * Same chunked iteration as DO_ZPZZ: pg caches the 16 predicate
 * bits for each 16-byte chunk and both i and pg advance by
 * sizeof(TYPE), so bit 0 always governs the current element.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  One predicate byte
 * per element; only bit 0 is tested. */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}
843
844 #define DO_CLS_B(N) (clrsb32(N) - 24)
845 #define DO_CLS_H(N) (clrsb32(N) - 16)
846
847 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
848 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
849 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
850 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
851
852 #define DO_CLZ_B(N) (clz32(N) - 24)
853 #define DO_CLZ_H(N) (clz32(N) - 16)
854
855 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
856 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
857 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
858 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
859
860 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
861 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
862 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
863 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
864
/* CNOT: logical inversion -- 1 when the element is zero, else 0. */
#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

/* FABS: clear the sign (top) bit; (unsigned)-1 >> 1 is the sign mask
 * complement for the element type.
 */
#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

/* FNEG: toggle the sign (top) bit. */
#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

/* NOT: bitwise inversion. */
#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

/* Sign/zero extension from a narrower sub-element, via C casts. */
#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)
897
DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

/* ABS/NEG: note these expressions rely on wrapping arithmetic
 * (the minimum value maps to itself / stays negative).
 */
#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

/* REVB/REVH/REVW: byte/halfword/word reversal within elements. */
DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
934
935 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
936 {
937 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
938 uint64_t *d = vd, *n = vn;
939 uint8_t *pg = vg;
940
941 for (i = 0; i < opr_sz; i += 2) {
942 if (pg[H1(i)] & 1) {
943 uint64_t n0 = n[i + 0];
944 uint64_t n1 = n[i + 1];
945 d[i + 0] = n1;
946 d[i + 1] = n0;
947 }
948 }
949 }
950
/* RBIT: reverse the bits within each element. */
DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

/* SQABS: saturating absolute value -- the minimum value saturates
 * to the maximum instead of overflowing.
 */
#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

/* SQNEG: saturating negation, with the same saturation of minimum. */
#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

/* URECPE/URSQRTE: unsigned reciprocal (square root) estimate. */
DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
976
/* Three-operand expander, unpredicated, in which the third operand is "wide".
 *
 * One 64-bit M element is shared by all the narrower N elements that
 * occupy the same 8 bytes (the inner do/while covers one dword).
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; ) {                                \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
        } while (i & 7);                                       \
    }                                                          \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
1018
1019 /*
1020 * Three-operand expander, unpredicated, in which the two inputs are
1021 * selected from the top or bottom half of the wide column.
1022 */
1023 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1024 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1025 { \
1026 intptr_t i, opr_sz = simd_oprsz(desc); \
1027 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1028 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1029 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1030 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1031 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1032 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1033 } \
1034 }
1035
/* Widening add/sub/absolute-difference/multiply, signed and unsigned. */
DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1067
/* Note that the multiply cannot overflow, but the doubling can.
 *
 * Via DO_ZZZ_TB below, these receive values already widened from the
 * narrower element type (e.g. int8_t inputs in int16_t parameters),
 * so N * M fits; only the doubling needs saturation.
 */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}

static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}

DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB
1092
/* Three-operand expander: N is already wide, M is selected from the
 * top or bottom narrow half per the low desc data bit (SADDW etc).
 */
#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)      \
{                                                                   \
    intptr_t i, opr_sz = simd_oprsz(desc);                          \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                   \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                          \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                   \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                        \
    }                                                               \
}

DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB
1122
/* Three-operand expander over even/odd element pairs: both inputs
 * are selected from the even or odd elements per the two low desc
 * data bits, and the result is written to the element selected by
 * SEL1 (EORBT/EORTB interleaving).
 */
#define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
        TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
        TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
    }                                                                   \
}

DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB
1142
/* Widening multiply/abs-difference with accumulate: both narrow
 * inputs come from the half selected by the desc data bit, and
 * OP(nn, mm) is added to the wide accumulator from VA.
 */
#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
        TYPEW aa = *(TYPEW *)(va + HW(i));                      \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
    }                                                           \
}

DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

/* Negated product, so the accumulation above becomes a subtraction. */
#define DO_NMUL(N, M)  -(N * M)

DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC
1183
/* Narrowing to the bottom half: saturate the wide element and store
 * it in the low half of each wide slot (high half cleared by the mask).
 */
#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
        TYPE nn = *(TYPE *)(vn + i);                         \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
        *(TYPE *)(vd + i) = nn;                              \
    }                                                        \
}

/* Narrowing to the top half: store only the narrow result into the
 * odd narrow element of each wide slot; even elements are untouched.
 */
#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE nn = *(TYPE *)(vn + i);                                    \
        *(TYPEN *)(vd + i + odd) = OP(nn);                              \
    }                                                                   \
}

/* Signed saturation to the narrower signed range. */
#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

/* Unsigned saturation to the narrower unsigned range; also used for
 * the signed-input/unsigned-output SQXTUN forms below.
 */
#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT
1239
1240 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1241 {
1242 intptr_t i, opr_sz = simd_oprsz(desc);
1243 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1244 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1245 uint32_t *a = va, *n = vn;
1246 uint64_t *d = vd, *m = vm;
1247
1248 for (i = 0; i < opr_sz / 8; ++i) {
1249 uint32_t e1 = a[2 * i + H4(0)];
1250 uint32_t e2 = n[2 * i + sel] ^ inv;
1251 uint64_t c = extract64(m[i], 32, 1);
1252 /* Compute and store the entire 33-bit result at once. */
1253 d[i] = c + e1 + e2;
1254 }
1255 }
1256
/*
 * 64-bit variant: the 65-bit sum is formed in an Int128 and split
 * across a pair of destination dwords; the carry-in is the lsb of
 * the odd dword of VM.
 */
void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
}
1273
/* Saturating doubling multiply-accumulate long: DMUL_OP produces the
 * doubled saturated product, SUM_OP is the saturating add/sub into
 * the wide accumulator.
 */
#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        TYPEW aa = *(TYPEW *)(va + HW(i));                              \
        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
    }                                                                   \
}

DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL
1303
/* Complex multiply-accumulate over (real, imag) element pairs.
 * ROT (0/90/180/270 in units of 90 degrees, from simd_data) selects
 * which input element feeds each half and whether each accumulation
 * is an add or a subtract; OP receives the subtract flag as S.
 */
#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
    int rot = simd_data(desc);                                  \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
    bool sub_r = rot == 1 || rot == 2;                          \
    bool sub_i = rot >= 2;                                      \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
    for (i = 0; i < opr_sz; i += 2) {                           \
        TYPE elt1_a = n[H(i + sel_a)];                          \
        TYPE elt2_a = m[H(i + sel_a)];                          \
        TYPE elt2_b = m[H(i + sel_b)];                          \
        d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
        d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
    }                                                           \
}

/* Plain (non-saturating) multiply-accumulate with optional negation. */
#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))

DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)

/* Saturating rounding doubling variants; the saturation flag output
 * is discarded for the h/s sizes.
 */
#define DO_SQRDMLAH_B(N, M, A, S) \
    do_sqrdmlah_b(N, M, A, S, true)
#define DO_SQRDMLAH_H(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A, S) \
    do_sqrdmlah_d(N, M, A, S, true)

DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1342
/* Indexed complex multiply-accumulate: one complex pair of M, chosen
 * by IDX, is shared across each 128-bit segment of N/A.
 *
 * NOTE(review): the stores index d with H2 while n/m/a use the H
 * parameter; identical on little-endian hosts (both are identity),
 * but looks inconsistent for the H4 (int32_t) instantiation on
 * big-endian -- confirm against upstream before changing.
 */
#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
{                                                                           \
    intptr_t i, j, oprsz = simd_oprsz(desc);                                \
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
    bool sub_r = rot == 1 || rot == 2;                                      \
    bool sub_i = rot >= 2;                                                  \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
        TYPE elt2_a = m[H(i + idx + sel_a)];                                \
        TYPE elt2_b = m[H(i + idx + sel_b)];                                \
        for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
            TYPE elt1_a = n[H(i + j + sel_a)];                              \
            d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);          \
            d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);  \
        }                                                                   \
    }                                                                       \
}

DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)

DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)

#undef DO_CMLA
#undef DO_CMLA_FUNC
#undef DO_CMLA_IDX_FUNC
#undef DO_SQRDMLAH_B
#undef DO_SQRDMLAH_H
#undef DO_SQRDMLAH_S
#undef DO_SQRDMLAH_D
1377
/*
 * Note N and M are 4 elements bundled into one unit: two complex
 * int8 (real, imag) pairs.  SEL_A/SEL_B pick which byte of each M
 * pair multiplies the real/imag parts of N, and SUB_I (+1 or -1)
 * signs the imaginary contribution.  Returns A plus both products.
 */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    int pair;

    for (pair = 0; pair < 2; pair++) {
        int shift = 16 * pair;
        int32_t real_n = (int8_t)(n >> shift);
        int32_t imag_n = (int8_t)(n >> (shift + 8));
        int32_t m_for_real = (int8_t)(m >> (shift + 8 * sel_a));
        int32_t m_for_imag = (int8_t)(m >> (shift + 8 * sel_b));

        a += real_n * m_for_real + imag_n * m_for_imag * sub_i;
    }
    return a;
}
1392
/*
 * As do_cdot_s, but N and M each bundle two complex int16 pairs
 * into a 64-bit unit.
 */
static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    int pair;

    for (pair = 0; pair < 2; pair++) {
        int shift = 32 * pair;
        int64_t real_n = (int16_t)(n >> shift);
        int64_t imag_n = (int16_t)(n >> (shift + 16));
        int64_t m_for_real = (int16_t)(m >> (shift + 16 * sel_a));
        int64_t m_for_imag = (int16_t)(m >> (shift + 16 * sel_b));

        a += real_n * m_for_real + imag_n * m_for_imag * sub_i;
    }
    return a;
}
1406
/* CDOT: complex dot product, 8-bit elements into 32-bit accumulators.
 * ROT selects the conjugation/rotation via sel_a/sel_b/sub_i.
 */
void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 4; e++) {
        d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}

/* As above, 16-bit elements into 64-bit accumulators. */
void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 8; e++) {
        d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}

/* Indexed CDOT: one element of M, selected by IDX, is shared by all
 * four accumulators of each 128-bit segment.
 */
void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int seg = 0; seg < opr_sz / 4; seg += 4) {
        uint32_t seg_m = m[seg + idx];
        for (int e = 0; e < 4; e++) {
            d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}

void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int seg, opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (seg = 0; seg < opr_sz / 8; seg += 2) {
        uint64_t seg_m = m[seg + idx];
        for (int e = 0; e < 2; e++) {
            d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}
1476
/* Indexed three-operand accumulate: the M element selected by IDX is
 * shared across each 128-bit segment of N/A.
 */
#define DO_ZZXZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
    intptr_t i, j, idx = simd_data(desc);                               \
    TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
        TYPE mm = m[i];                                                 \
        for (j = 0; j < segment; j++) {                                 \
            d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
        }                                                               \
    }                                                                   \
}

/* Saturating rounding doubling multiply-accumulate (SQRDMLAH),
 * discarding the per-element saturation flag for h/s.
 */
#define DO_SQRDMLAH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)

DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)

/* As above with subtraction (SQRDMLSH). */
#define DO_SQRDMLSH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)

DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)

#undef DO_ZZXZ
1512
/* Indexed widening multiply-accumulate: the narrow M element selected
 * by IDX is shared across each 128-byte-aligned 16-byte segment; SEL
 * picks the bottom/top narrow half of N.
 */
#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);    \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) {                                      \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                           \
        for (j = 0; j < 16; j += sizeof(TYPEW)) {                          \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                   \
            TYPEW aa = *(TYPEW *)(va + HW(i + j));                         \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                   \
        }                                                                  \
    }                                                                      \
}

#define DO_MLA(N, M, A)  (A + N * M)

DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)

#define DO_MLS(N, M, A)  (A - N * M)

DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)

/* Saturating doubling multiply-accumulate/-subtract variants. */
#define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)

#define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZZXW
1558
/* Indexed widening multiply (no accumulator): the narrow M element
 * selected by IDX is shared across each 16-byte segment.
 */
#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)             \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);    \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) {                                      \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                           \
        for (j = 0; j < 16; j += sizeof(TYPEW)) {                          \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                   \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                       \
        }                                                                  \
    }                                                                      \
}

DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#undef DO_ZZX
1584
/* Bit-permute expander: OP receives the element pair and the element
 * width in bits (for BEXT/BDEP/BGRP below).
 */
#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                \
        TYPE nn = *(TYPE *)(vn + i);                            \
        TYPE mm = *(TYPE *)(vm + i);                            \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);       \
    }                                                           \
}
1595
/*
 * BEXT: gather the DATA bits selected by MASK into a contiguous run
 * at the bottom of the result, preserving their relative order.
 * N is the element width in bits (8..64).
 */
static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t out = 0;
    int src, dst = 0;

    for (src = 0; src < n; ++src) {
        if (mask & (1ull << src)) {
            out |= ((data >> src) & 1) << dst++;
        }
    }
    return out;
}
1609
/* BEXT for each element size. */
DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1614
/*
 * BDEP: scatter the low-order bits of DATA into the bit positions
 * selected by MASK, in ascending order.  N is the element width.
 */
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
{
    uint64_t out = 0;
    int dst, src = 0;

    for (dst = 0; dst < n; ++dst) {
        if (mask & (1ull << dst)) {
            out |= ((data >> src++) & 1) << dst;
        }
    }
    return out;
}
1628
/* BDEP for each element size. */
DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1633
/*
 * BGRP: group the DATA bits selected by MASK at the bottom of the
 * result, with the unselected bits packed immediately above them;
 * relative order is preserved within each group.  N is the element
 * width in bits (8..64).
 */
static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
{
    uint64_t resm = 0, resu = 0;
    int db, rbm = 0, rbu = 0;

    for (db = 0; db < n; ++db) {
        uint64_t val = (data >> db) & 1;
        if ((mask >> db) & 1) {
            resm |= val << rbm++;
        } else {
            resu |= val << rbu++;
        }
    }

    /*
     * When every bit position of a 64-bit element is selected by the
     * mask, rbm == 64 and the shift below would be undefined behavior
     * (C11 6.5.7p3).  In that case resu is necessarily 0, so the
     * result is just resm.
     */
    if (rbm >= 64) {
        return resm;
    }
    return resm | (resu << rbm);
}
1650
/* BGRP for each element size. */
DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)

#undef DO_BITPERM
1657
/* Complex add with 90/270 degree rotation over (real, imag) pairs:
 * rot=270 (sub_r set) computes (r + i', i - r'); rot=90 computes
 * (r - i', i + r').  The saturating forms pass saturating add/sub ops.
 */
#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int sub_r = simd_data(desc);                                \
    if (sub_r) {                                                \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = ADD_OP(acc_r, el2_i);                       \
            acc_i = SUB_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    } else {                                                    \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = SUB_OP(acc_r, el2_i);                       \
            acc_i = ADD_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    }                                                           \
}

DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)

DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)

#undef DO_CADD
1699
/* Shift-left long: widen the narrow half selected by desc bit 0,
 * then shift left by the immediate in the remaining desc bits.
 */
#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
    int shift = simd_data(desc) >> 1;                          \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
        *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
    }                                                          \
}

DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)

DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)

#undef DO_ZZI_SHLL
1721
/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
/*
 * The predicate carries one bit per byte of the vector, so each
 * 16-bit predicate chunk governs 16 bytes of vector data; advancing
 * pg by sizeof(TYPEELT) steps to the bit for the next element.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc);                 \
    TYPERED ret = INIT;                                    \
    for (i = 0; i < opr_sz; ) {                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
                ret = OP(ret, nn);                         \
            }                                              \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
        } while (i & 15);                                  \
    }                                                      \
    return (TYPERET)ret;                                   \
}

/*
 * As above, specialized for 64-bit elements: one predicate bit
 * (the low bit of each predicate byte) per element, no H macro needed.
 */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
    TYPEE *n = vn;                                         \
    uint8_t *pg = vg;                                      \
    TYPER ret = INIT;                                      \
    for (i = 0; i < opr_sz; i += 1) {                      \
        if (pg[H1(i)] & 1) {                               \
            TYPEE nn = n[i];                               \
            ret = OP(ret, nn);                             \
        }                                                  \
    }                                                      \
    return ret;                                            \
}
1764
/* ORV: bitwise OR reduction; identity is 0. */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

/* EORV: bitwise XOR reduction; identity is 0. */
DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

/* ANDV: bitwise AND reduction; identity is all-ones. */
DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

/*
 * SADDV: signed elements widened into a 64-bit accumulator.
 * No 64-bit variant here; presumably at 64 bits the signed sum is
 * identical to uaddv_d -- TODO confirm against the translator.
 */
DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

/* UADDV: unsigned elements widened into a 64-bit accumulator. */
DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

/* SMAXV: signed max reduction; identity is the type's minimum. */
DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

/* UMAXV: unsigned max reduction; identity is 0. */
DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

/* SMINV: signed min reduction; identity is the type's maximum. */
DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

/* UMINV: unsigned min reduction; identity is all-ones. */
DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D
1811
/* Two vector operand, one scalar operand, unpredicated.
 * Note that the scalar is truncated to the element type before use.
 */
#define DO_ZZI(NAME, TYPE, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);      \
    TYPE s = s64, *d = vd, *n = vn;                            \
    for (i = 0; i < opr_sz; ++i) {                             \
        d[i] = OP(n[i], s);                                    \
    }                                                          \
}

/* Reversed subtract, for SUBR: result = scalar - element. */
#define DO_SUBR(X, Y)   (Y - X)

DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

/* Done with the element-wise operator macros defined earlier in the file. */
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
#undef DO_SUBR
1867
1868 /* Similar to the ARM LastActiveElement pseudocode function, except the
1869 result is multiplied by the element size. This includes the not found
1870 indication; e.g. not found for esz=3 is -8. */
1871 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1872 {
1873 uint64_t mask = pred_esz_masks[esz];
1874 intptr_t i = words;
1875
1876 do {
1877 uint64_t this_g = g[--i] & mask;
1878 if (this_g) {
1879 return i * 64 + (63 - clz64(this_g));
1880 }
1881 } while (i > 0);
1882 return (intptr_t)-1 << esz;
1883 }
1884
/*
 * PFIRST: if PD has no active element, set the first element that is
 * active in PG.  Returns the NZCV flags of a PredTest over the result
 * (see iter_predtest_fwd above).  Assumes at least one predicate word.
 */
uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            /*
             * Flag bit 2 is set by iter_predtest_fwd once the first
             * governing bit has been processed, so this branch is
             * taken only for the first word with any G bit set.
             */
            if (!(flags & 4)) {
                /* Set in D the first bit of G. */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}
1908
/*
 * PNEXT: find the first element active in PG whose (scaled) bit index
 * is strictly greater than the last active element of PD, then rewrite
 * PD to contain exactly that one element (or none, if no such element
 * exists).  Returns the NZCV flags of a PredTest over the result.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    /* First candidate bit: one element past the last active bit of PD
       (last_active_element returns -(1 << esz) when PD is empty). */
    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit. */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (next & 63) {
            /* Partial first word: ignore bits below the candidate. */
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    /* Store the single found element (none if next ran off the end),
       accumulating the PredTest flags word by word. */
    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
1952
1953 /*
1954 * Copy Zn into Zd, and store zero into inactive elements.
1955 * If inv, store zeros into the active elements.
1956 */
HELPER(sve_movz_b)1957 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1958 {
1959 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1960 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1961 uint64_t *d = vd, *n = vn;
1962 uint8_t *pg = vg;
1963
1964 for (i = 0; i < opr_sz; i += 1) {
1965 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1966 }
1967 }
1968
HELPER(sve_movz_h)1969 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1970 {
1971 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1972 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1973 uint64_t *d = vd, *n = vn;
1974 uint8_t *pg = vg;
1975
1976 for (i = 0; i < opr_sz; i += 1) {
1977 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1978 }
1979 }
1980
HELPER(sve_movz_s)1981 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1982 {
1983 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1984 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1985 uint64_t *d = vd, *n = vn;
1986 uint8_t *pg = vg;
1987
1988 for (i = 0; i < opr_sz; i += 1) {
1989 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1990 }
1991 }
1992
HELPER(sve_movz_d)1993 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1994 {
1995 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1996 uint64_t *d = vd, *n = vn;
1997 uint8_t *pg = vg;
1998 uint8_t inv = simd_data(desc);
1999
2000 for (i = 0; i < opr_sz; i += 1) {
2001 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2002 }
2003 }
2004
/* Three-operand expander, immediate operand, controlled by a predicate.
 * One predicate bit per byte of the vector: each 16-bit predicate
 * chunk governs 16 bytes, hence the inner (i & 15) loop.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE imm = simd_data(desc);                                 \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn, imm);                                 \
        }                                                       \
    }                                                           \
}

/* Plain shifts; >> is arithmetic or logical per the element type. */
#define DO_SHR(N, M)   (N >> M)
#define DO_SHL(N, M)   (N << M)

/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.  */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
2047
/*
 * Unsigned rounding shift right: shift x right by sh, rounding half up.
 * Shift counts of 64 keep only the rounding carry from bit 63; larger
 * counts produce 0.  Callers appear to pass sh >= 1 (sh == 0 would
 * shift by sh-1 = UINT_MAX below) -- TODO confirm at call sites.
 */
static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (sh >= 64) {
        return sh == 64 ? x >> 63 : 0;
    }
    return (x >> sh) + ((x >> (sh - 1)) & 1);
}
2058
/*
 * Signed rounding shift right: shift x right by sh, rounding half up.
 * Relies on >> of a negative value being an arithmetic shift, as QEMU
 * does throughout.  Callers appear to pass sh >= 1 -- TODO confirm.
 */
static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (sh >= 64) {
        /* Rounding the sign bit always produces 0. */
        return 0;
    }
    return (x >> sh) + ((x >> (sh - 1)) & 1);
}
2068
/* Predicated shift right/left by immediate. */
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

/* ASRD: arithmetic shift right rounding toward zero (signed divide). */
DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

/* SVE2 bitwise shift by immediate */
DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)

DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)

DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)

DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)

/*
 * Wrappers discarding the saturation flag.  Note that do_suqrshl_d
 * is NOT recursive: a function-like macro is not re-expanded within
 * its own expansion (C11 6.10.3.4), so the inner call resolves to
 * the like-named four-argument function (presumably vec_internal.h).
 */
#define do_suqrshl_b(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_suqrshl_h(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_suqrshl_s(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
#define do_suqrshl_d(n, m) \
   ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })

DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)

#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
2127
/*
 * Narrowing shift right, bottom half: the narrow result is stored as
 * a full wide element, so the odd (top) halves are zeroed as a side
 * effect of the (TYPEN) truncation and wide store.
 */
#define DO_SHRNB(NAME, TYPEW, TYPEN, OP)                          \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    int shift = simd_data(desc);                                  \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
        TYPEW nn = *(TYPEW *)(vn + i);                            \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);                \
    }                                                             \
}

/*
 * Narrowing shift right, top half: writes only the odd narrow
 * elements, leaving the even (bottom) halves of vd untouched.
 */
#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    int shift = simd_data(desc);                                  \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
    }                                                             \
}

/* SHRNB/SHRNT: plain truncating narrowing shifts. */
DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)

DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)

/* RSHRNB/RSHRNT: rounding variants. */
DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)

DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2165
/* SQSHRUN[BT]: signed input, shift, saturate to the unsigned range.
   The _D form clamps the shift at 63 since the input is only 64 bits. */
#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
#define DO_SQSHRUN_D(x, sh) \
    do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)

DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)

DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)

/* SQRSHRUN[BT]: as above, with rounding before the saturation. */
#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)

DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)

DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)

/* SQSHRN[BT]: signed input, shift, saturate to the signed narrow range. */
#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)

DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)

/* SQRSHRN[BT]: rounding variant of the above. */
#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)

DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)

/* UQSHRN[BT]: unsigned input, shift, saturate high; MIN suffices. */
#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)

DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)

DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)

/* UQRSHRN[BT]: rounding variant of the above. */
#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)

DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)

DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)

#undef DO_SHRNB
#undef DO_SHRNT
2241
/*
 * Narrowing binary operation, bottom half: the narrow result is
 * stored as a full wide element, zeroing the odd (top) halves.
 */
#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                 \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)    \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
        TYPEW nn = *(TYPEW *)(vn + i);                            \
        TYPEW mm = *(TYPEW *)(vm + i);                            \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);            \
    }                                                             \
}

/*
 * Narrowing binary operation, top half: writes only the odd narrow
 * elements, leaving the even (bottom) halves of vd untouched.
 */
#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)         \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)    \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
        TYPEW mm = *(TYPEW *)(vm + HW(i));                        \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
    }                                                             \
}

/* High-half of sum/difference; the R variants round by adding 2**(SH-1). */
#define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
#define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2268
/* ADDHN[BT], RADDHN[BT], SUBHN[BT], RSUBHN[BT] instantiations. */
DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)

DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)

DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)

DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)

DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)

DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)

DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)

DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2300
#undef DO_RSUBHN
#undef DO_SUBHN
#undef DO_RADDHN
#undef DO_ADDHN

/* Undefine both expanders; DO_BINOPNT was previously leaked. */
#undef DO_BINOPNB
#undef DO_BINOPNT
2307
/* Fully general four-operand expander, controlled by a predicate.
 * One predicate bit per byte of vector; each 16-bit predicate chunk
 * governs 16 bytes, hence the inner (i & 15) loop.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t i, opr_sz = simd_oprsz(desc);                    \
    for (i = 0; i < opr_sz; ) {                               \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
        do {                                                  \
            if (pg & 1) {                                     \
                TYPE nn = *(TYPE *)(vn + H(i));               \
                TYPE mm = *(TYPE *)(vm + H(i));               \
                TYPE aa = *(TYPE *)(va + H(i));               \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
            }                                                 \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
        } while (i & 15);                                     \
    }                                                         \
}

/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
    uint8_t *pg = vg;                                         \
    for (i = 0; i < opr_sz; i += 1) {                         \
        if (pg[H1(i)] & 1) {                                  \
            TYPE aa = a[i], nn = n[i], mm = m[i];             \
            d[i] = OP(aa, nn, mm);                            \
        }                                                     \
    }                                                         \
}
2344
/* Multiply-accumulate and multiply-subtract. */
#define DO_MLA(A, N, M)  (A + N * M)
#define DO_MLS(A, N, M)  (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
2364
2365 void HELPER(sve_index_b)(void *vd, uint32_t start,
2366 uint32_t incr, uint32_t desc)
2367 {
2368 intptr_t i, opr_sz = simd_oprsz(desc);
2369 uint8_t *d = vd;
2370 for (i = 0; i < opr_sz; i += 1) {
2371 d[H1(i)] = start + i * incr;
2372 }
2373 }
2374
HELPER(sve_index_h)2375 void HELPER(sve_index_h)(void *vd, uint32_t start,
2376 uint32_t incr, uint32_t desc)
2377 {
2378 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2379 uint16_t *d = vd;
2380 for (i = 0; i < opr_sz; i += 1) {
2381 d[H2(i)] = start + i * incr;
2382 }
2383 }
2384
HELPER(sve_index_s)2385 void HELPER(sve_index_s)(void *vd, uint32_t start,
2386 uint32_t incr, uint32_t desc)
2387 {
2388 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2389 uint32_t *d = vd;
2390 for (i = 0; i < opr_sz; i += 1) {
2391 d[H4(i)] = start + i * incr;
2392 }
2393 }
2394
HELPER(sve_index_d)2395 void HELPER(sve_index_d)(void *vd, uint64_t start,
2396 uint64_t incr, uint32_t desc)
2397 {
2398 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2399 uint64_t *d = vd;
2400 for (i = 0; i < opr_sz; i += 1) {
2401 d[i] = start + i * incr;
2402 }
2403 }
2404
HELPER(sve_adr_p32)2405 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2406 {
2407 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2408 uint32_t sh = simd_data(desc);
2409 uint32_t *d = vd, *n = vn, *m = vm;
2410 for (i = 0; i < opr_sz; i += 1) {
2411 d[i] = n[i] + (m[i] << sh);
2412 }
2413 }
2414
HELPER(sve_adr_p64)2415 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2416 {
2417 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2418 uint64_t sh = simd_data(desc);
2419 uint64_t *d = vd, *n = vn, *m = vm;
2420 for (i = 0; i < opr_sz; i += 1) {
2421 d[i] = n[i] + (m[i] << sh);
2422 }
2423 }
2424
HELPER(sve_adr_s32)2425 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2426 {
2427 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2428 uint64_t sh = simd_data(desc);
2429 uint64_t *d = vd, *n = vn, *m = vm;
2430 for (i = 0; i < opr_sz; i += 1) {
2431 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2432 }
2433 }
2434
HELPER(sve_adr_u32)2435 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2436 {
2437 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2438 uint64_t sh = simd_data(desc);
2439 uint64_t *d = vd, *n = vn, *m = vm;
2440 for (i = 0; i < opr_sz; i += 1) {
2441 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2442 }
2443 }
2444
/*
 * FEXPA: construct a floating-point value from each source element by
 * looking up the fraction bits in a 2**x table and splicing in the
 * exponent field taken from the source.  Low source bits index the
 * table; the next bits become the biased exponent.
 */
void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);
        uint16_t exp = extract32(nn, 5, 5);
        /* float16: 10 fraction bits, exponent at bit 10. */
        d[i] = coeff[idx] | (exp << 10);
    }
}

void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint32_t exp = extract32(nn, 6, 8);
        /* float32: 23 fraction bits, exponent at bit 23. */
        d[i] = coeff[idx] | (exp << 23);
    }
}

void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint64_t exp = extract32(nn, 6, 11);
        /* float64: 52 fraction bits, exponent at bit 52. */
        d[i] = coeff[idx] | (exp << 52);
    }
}
2534
HELPER(sve_ftssel_h)2535 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2536 {
2537 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2538 uint16_t *d = vd, *n = vn, *m = vm;
2539 for (i = 0; i < opr_sz; i += 1) {
2540 uint16_t nn = n[i];
2541 uint16_t mm = m[i];
2542 if (mm & 1) {
2543 nn = float16_one;
2544 }
2545 d[i] = nn ^ (mm & 2) << 14;
2546 }
2547 }
2548
HELPER(sve_ftssel_s)2549 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2550 {
2551 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2552 uint32_t *d = vd, *n = vn, *m = vm;
2553 for (i = 0; i < opr_sz; i += 1) {
2554 uint32_t nn = n[i];
2555 uint32_t mm = m[i];
2556 if (mm & 1) {
2557 nn = float32_one;
2558 }
2559 d[i] = nn ^ (mm & 2) << 30;
2560 }
2561 }
2562
HELPER(sve_ftssel_d)2563 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2564 {
2565 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2566 uint64_t *d = vd, *n = vn, *m = vm;
2567 for (i = 0; i < opr_sz; i += 1) {
2568 uint64_t nn = n[i];
2569 uint64_t mm = m[i];
2570 if (mm & 1) {
2571 nn = float64_one;
2572 }
2573 d[i] = nn ^ (mm & 2) << 62;
2574 }
2575 }
2576
2577 /*
2578 * Signed saturating addition with scalar operand.
2579 */
2580
HELPER(sve_sqaddi_b)2581 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2582 {
2583 intptr_t i, oprsz = simd_oprsz(desc);
2584
2585 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2586 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2587 }
2588 }
2589
HELPER(sve_sqaddi_h)2590 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2591 {
2592 intptr_t i, oprsz = simd_oprsz(desc);
2593
2594 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2595 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2596 }
2597 }
2598
HELPER(sve_sqaddi_s)2599 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2600 {
2601 intptr_t i, oprsz = simd_oprsz(desc);
2602
2603 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2604 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2605 }
2606 }
2607
HELPER(sve_sqaddi_d)2608 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2609 {
2610 intptr_t i, oprsz = simd_oprsz(desc);
2611
2612 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2613 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2614 }
2615 }
2616
2617 /*
2618 * Unsigned saturating addition with scalar operand.
2619 */
2620
HELPER(sve_uqaddi_b)2621 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2622 {
2623 intptr_t i, oprsz = simd_oprsz(desc);
2624
2625 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2626 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2627 }
2628 }
2629
HELPER(sve_uqaddi_h)2630 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2631 {
2632 intptr_t i, oprsz = simd_oprsz(desc);
2633
2634 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2635 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2636 }
2637 }
2638
HELPER(sve_uqaddi_s)2639 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2640 {
2641 intptr_t i, oprsz = simd_oprsz(desc);
2642
2643 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2644 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2645 }
2646 }
2647
HELPER(sve_uqaddi_d)2648 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2649 {
2650 intptr_t i, oprsz = simd_oprsz(desc);
2651
2652 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2653 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2654 }
2655 }
2656
HELPER(sve_uqsubi_d)2657 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2658 {
2659 intptr_t i, oprsz = simd_oprsz(desc);
2660
2661 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2662 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2663 }
2664 }
2665
2666 /* Two operand predicated copy immediate with merge. All valid immediates
2667 * can fit within 17 signed bits in the simd_data field.
2668 */
HELPER(sve_cpy_m_b)2669 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2670 uint64_t mm, uint32_t desc)
2671 {
2672 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2673 uint64_t *d = vd, *n = vn;
2674 uint8_t *pg = vg;
2675
2676 mm = dup_const(MO_8, mm);
2677 for (i = 0; i < opr_sz; i += 1) {
2678 uint64_t nn = n[i];
2679 uint64_t pp = expand_pred_b(pg[H1(i)]);
2680 d[i] = (mm & pp) | (nn & ~pp);
2681 }
2682 }
2683
HELPER(sve_cpy_m_h)2684 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2685 uint64_t mm, uint32_t desc)
2686 {
2687 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2688 uint64_t *d = vd, *n = vn;
2689 uint8_t *pg = vg;
2690
2691 mm = dup_const(MO_16, mm);
2692 for (i = 0; i < opr_sz; i += 1) {
2693 uint64_t nn = n[i];
2694 uint64_t pp = expand_pred_h(pg[H1(i)]);
2695 d[i] = (mm & pp) | (nn & ~pp);
2696 }
2697 }
2698
HELPER(sve_cpy_m_s)2699 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2700 uint64_t mm, uint32_t desc)
2701 {
2702 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2703 uint64_t *d = vd, *n = vn;
2704 uint8_t *pg = vg;
2705
2706 mm = dup_const(MO_32, mm);
2707 for (i = 0; i < opr_sz; i += 1) {
2708 uint64_t nn = n[i];
2709 uint64_t pp = expand_pred_s(pg[H1(i)]);
2710 d[i] = (mm & pp) | (nn & ~pp);
2711 }
2712 }
2713
HELPER(sve_cpy_m_d)2714 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2715 uint64_t mm, uint32_t desc)
2716 {
2717 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2718 uint64_t *d = vd, *n = vn;
2719 uint8_t *pg = vg;
2720
2721 for (i = 0; i < opr_sz; i += 1) {
2722 uint64_t nn = n[i];
2723 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2724 }
2725 }
2726
HELPER(sve_cpy_z_b)2727 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2728 {
2729 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2730 uint64_t *d = vd;
2731 uint8_t *pg = vg;
2732
2733 val = dup_const(MO_8, val);
2734 for (i = 0; i < opr_sz; i += 1) {
2735 d[i] = val & expand_pred_b(pg[H1(i)]);
2736 }
2737 }
2738
HELPER(sve_cpy_z_h)2739 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2740 {
2741 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2742 uint64_t *d = vd;
2743 uint8_t *pg = vg;
2744
2745 val = dup_const(MO_16, val);
2746 for (i = 0; i < opr_sz; i += 1) {
2747 d[i] = val & expand_pred_h(pg[H1(i)]);
2748 }
2749 }
2750
HELPER(sve_cpy_z_s)2751 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2752 {
2753 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2754 uint64_t *d = vd;
2755 uint8_t *pg = vg;
2756
2757 val = dup_const(MO_32, val);
2758 for (i = 0; i < opr_sz; i += 1) {
2759 d[i] = val & expand_pred_s(pg[H1(i)]);
2760 }
2761 }
2762
HELPER(sve_cpy_z_d)2763 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2764 {
2765 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2766 uint64_t *d = vd;
2767 uint8_t *pg = vg;
2768
2769 for (i = 0; i < opr_sz; i += 1) {
2770 d[i] = (pg[H1(i)] & 1 ? val : 0);
2771 }
2772 }
2773
2774 /* Big-endian hosts need to frob the byte indices. If the copy
2775 * happens to be 8-byte aligned, then no frobbing necessary.
2776 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;   /* combined misalignment of dst/src/len */
    size_t i;

#if !HOST_BIG_ENDIAN
    o = 0;                           /* LE hosts never need index frobbing */
#endif
    switch (o) {
    case 0:
        /* Everything 8-byte aligned: a plain memmove preserves layout. */
        memmove(vd, vs, n);
        break;

    case 4:
        /* 4-byte granules, with H1_4 adjusting each host address.
         * Copy forward when the regions do not overlap destructively,
         * otherwise copy backward.
         */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        /* 2-byte granules, same overlap handling as above. */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        /* Odd alignment: byte-by-byte, still overlap-safe. */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
2833
2834 /* Similarly for memset of 0. */
/* Similarly for memset of 0.  Zero N bytes starting at VD, applying
 * the same big-endian index frobbing as swap_memmove.
 */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t o = (d | n) & 7;       /* combined misalignment of dst/len */
    size_t i;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

#if !HOST_BIG_ENDIAN
    o = 0;                           /* LE hosts never need index frobbing */
#endif
    switch (o) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        /* 4-byte granules; H1_4 adjusts each host address. */
        for (i = 0; i < n; i += 4) {
            *(uint32_t *)H1_4(d + i) = 0;
        }
        break;

    case 2:
    case 6:
        for (i = 0; i < n; i += 2) {
            *(uint16_t *)H1_2(d + i) = 0;
        }
        break;

    default:
        for (i = 0; i < n; i++) {
            *(uint8_t *)H1(d + i) = 0;
        }
        break;
    }
}
2874
/* EXT: concatenate the high part of VN (from byte offset n_ofs) with
 * the low n_ofs bytes of VM.  The two moves are ordered so that no
 * source byte is overwritten before it has been read when the
 * destination aliases a source.
 */
void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);      /* first byte taken from VN */
    size_t n_siz = opr_sz - n_ofs;       /* number of bytes taken from VN */

    if (vd != vm) {
        /* VM is safe until the second move; VN first. */
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        /* vd == vm: move VM's bytes up before they are clobbered. */
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm.  Need temp space.  */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}
2895
/* INSR: shift the whole vector up by one element and insert VAL
 * into the lowest element.  H adjusts index 0 for host endianness.
 */
#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
{                                                                  \
    intptr_t opr_sz = simd_oprsz(desc);                            \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
    *(TYPE *)(vd + H(0)) = val;                                    \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, H1_8)

#undef DO_INSR
2910
2911 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2912 {
2913 intptr_t i, j, opr_sz = simd_oprsz(desc);
2914 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2915 uint64_t f = *(uint64_t *)(vn + i);
2916 uint64_t b = *(uint64_t *)(vn + j);
2917 *(uint64_t *)(vd + i) = bswap64(b);
2918 *(uint64_t *)(vd + j) = bswap64(f);
2919 }
2920 }
2921
HELPER(sve_rev_h)2922 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2923 {
2924 intptr_t i, j, opr_sz = simd_oprsz(desc);
2925 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2926 uint64_t f = *(uint64_t *)(vn + i);
2927 uint64_t b = *(uint64_t *)(vn + j);
2928 *(uint64_t *)(vd + i) = hswap64(b);
2929 *(uint64_t *)(vd + j) = hswap64(f);
2930 }
2931 }
2932
HELPER(sve_rev_s)2933 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2934 {
2935 intptr_t i, j, opr_sz = simd_oprsz(desc);
2936 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2937 uint64_t f = *(uint64_t *)(vn + i);
2938 uint64_t b = *(uint64_t *)(vn + j);
2939 *(uint64_t *)(vd + i) = rol64(b, 32);
2940 *(uint64_t *)(vd + j) = rol64(f, 32);
2941 }
2942 }
2943
HELPER(sve_rev_d)2944 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2945 {
2946 intptr_t i, j, opr_sz = simd_oprsz(desc);
2947 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2948 uint64_t f = *(uint64_t *)(vn + i);
2949 uint64_t b = *(uint64_t *)(vn + j);
2950 *(uint64_t *)(vd + i) = b;
2951 *(uint64_t *)(vd + j) = f;
2952 }
2953 }
2954
2955 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2956
do_tbl1(void * vd,void * vn,void * vm,uint32_t desc,bool is_tbx,tb_impl_fn * fn)2957 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2958 bool is_tbx, tb_impl_fn *fn)
2959 {
2960 ARMVectorReg scratch;
2961 uintptr_t oprsz = simd_oprsz(desc);
2962
2963 if (unlikely(vd == vn)) {
2964 vn = memcpy(&scratch, vn, oprsz);
2965 }
2966
2967 fn(vd, vn, NULL, vm, oprsz, is_tbx);
2968 }
2969
/* Invoke a two-table TBL implementation, copying whichever source
 * aliases the destination into scratch storage first.  Both sources
 * can share the single scratch buffer when they are the same register.
 */
static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
                           uint32_t desc, bool is_tbx, tb_impl_fn *fn)
{
    uintptr_t len = simd_oprsz(desc);
    ARMVectorReg tmp;

    if (unlikely(vd == vn0)) {
        vn0 = memcpy(&tmp, vn0, len);
        if (vd == vn1) {
            vn1 = vn0;
        }
    } else if (unlikely(vd == vn1)) {
        vn1 = memcpy(&tmp, vn1, len);
    }

    fn(vd, vn0, vn1, vm, len, is_tbx);
}
2987
/*
 * TBL/TBX: table lookup.  Each index element of VM selects an element
 * from the concatenation of one (tbl1 == NULL) or two table vectors.
 * An out-of-range index yields zero for TBL; for TBX the destination
 * element is left unchanged instead.
 *
 * NOTE(review): the index load uses H1(i) for every element width,
 * while loads/stores of the data use H(index)/H(i).  For the 16/32/64
 * bit variants H(i) would be expected here — verify behavior on a
 * big-endian host.
 */
#define DO_TB(SUFF, TYPE, H)                                            \
static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
                                void *vm, uintptr_t oprsz, bool is_tbx) \
{                                                                       \
    TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
    uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
    for (i = 0; i < nelem; ++i) {                                       \
        TYPE index = indexes[H1(i)], val = 0;                           \
        if (index < nelem) {                                            \
            val = tbl0[H(index)];                                       \
        } else {                                                        \
            index -= nelem;                                             \
            /* The second table, if present, covers the next NELEM. */  \
            if (tbl1 && index < nelem) {                                \
                val = tbl1[H(index)];                                   \
            } else if (is_tbx) {                                        \
                continue;                                               \
            }                                                           \
        }                                                               \
        d[H(i)] = val;                                                  \
    }                                                                   \
}                                                                       \
void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                                        \
    do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                      \
}                                                                        \
void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,             \
                             void *vm, uint32_t desc)                    \
{                                                                        \
    do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);                \
}                                                                        \
void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                                         \
    do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                        \
}

DO_TB(b, uint8_t, H1)
DO_TB(h, uint16_t, H2)
DO_TB(s, uint32_t, H4)
DO_TB(d, uint64_t, H8)

#undef DO_TB
3029
/* SUNPK/UUNPK: widen each narrow element of the low half of VN into
 * a full-width element of VD.  HD/HS are the host-endian index
 * adjusters for the wide and narrow element sizes respectively.
 */
#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    TYPED *d = vd;                                             \
    TYPES *n = vn;                                             \
    ARMVectorReg tmp;                                          \
    /* The input occupies only opr_sz / 2 bytes; copy it out   \
       first if it overlaps the destination. */                \
    if (unlikely(vn - vd < opr_sz)) {                          \
        n = memcpy(&tmp, n, opr_sz / 2);                       \
    }                                                          \
    for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
        d[HD(i)] = n[HS(i)];                                   \
    }                                                          \
}

DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)

DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)

#undef DO_UNPK
3054
3055 /* Mask of bits included in the even numbered predicates of width esz.
3056 * We also use this for expand_bits/compress_bits, and so extend the
3057 * same pattern out to 16-bit units.
3058 */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,   /* esz 0: every other bit */
    0x3333333333333333ull,   /* esz 1: every other 2-bit unit */
    0x0f0f0f0f0f0f0f0full,   /* esz 2: every other nibble */
    0x00ff00ff00ff00ffull,   /* esz 3: every other byte */
    0x0000ffff0000ffffull,   /* esz 4: every other 16-bit unit */
};
3066
3067 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3068 * For N==0, this corresponds to the operation that in qemu/bitops.h
3069 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3070 * section 7-2 Shuffling Bits.
3071 */
static uint64_t expand_bits(uint64_t x, int n)
{
    int i;

    /* Only the low 32 bits can survive expansion into 64 bits. */
    x &= 0xffffffffu;
    for (i = 4; i >= n; i--) {
        int sh = 1 << i;
        /* Spread units of 2**i bits apart by 2**i zero bits. */
        x = ((x << sh) | x) & even_bit_esz_masks[i];
    }
    return x;
}
3083
3084 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3085 * For N==0, this corresponds to the operation that in qemu/bitops.h
3086 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3087 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3088 */
static uint64_t compress_bits(uint64_t x, int n)
{
    int i;

    for (i = n; i <= 4; i++) {
        int sh = 1 << i;
        /* Keep the even units of 2**i bits, then close the gaps. */
        x &= even_bit_esz_masks[i];
        x = (x >> sh) | x;
    }
    /* Only the low 32 bits hold meaningful data after compression. */
    return x & 0xffffffffu;
}
3100
/* ZIP for predicates: interleave the element flags of the low or high
 * (DATA) halves of VN and VM, with element size 2**esz.
 */
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    int esize = 1 << esz;
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* The whole predicate fits in one word: extract the half
         * being zipped, spread its bits apart, and merge M into the
         * gaps ESIZE bits above N.
         */
        uint64_t nn = *(uint64_t *)vn;
        uint64_t mm = *(uint64_t *)vm;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        mm = extract64(mm, high * half, half);
        nn = expand_bits(nn, esz);
        mm = expand_bits(mm, esz);
        d[0] = nn | (mm << esize);
    } else {
        ARMPredicateReg tmp;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap. */
        if (vd == vn) {
            vn = memcpy(&tmp, vn, oprsz);
            if (vd == vm) {
                vm = vn;
            }
        } else if (vd == vm) {
            vm = memcpy(&tmp, vm, oprsz);
        }
        if (high) {
            high = oprsz >> 1;   /* byte offset of the high half */
        }

        if ((oprsz & 7) == 0) {
            uint32_t *n = vn, *m = vm;
            high >>= 2;          /* convert byte offset to uint32_t index */

            /* Expand one 32-bit unit from each source per 64-bit result. */
            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                uint64_t mm = m[H4(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d[i] = nn | (mm << esize);
            }
        } else {
            /* Odd sizes: expand one byte into one uint16_t at a time. */
            uint8_t *n = vn, *m = vm;
            uint16_t *d16 = vd;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                uint16_t mm = m[H1(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d16[H2(i)] = nn | (mm << esize);
            }
        }
    }
}
3164
/* UZP for predicates: concatenate the even (DATA == 0) or odd element
 * flags of VN and VM, with element size 2**esz.
 */
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        /* Each source contributes 4 * oprsz result bits. */
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = l | (h << (4 * oprsz));
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        /* Copy M out of the way if it overlaps the destination;
         * N is consumed before its bytes are overwritten.
         */
        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        /* Each pair of words from N compresses into one word of D. */
        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << 32);
        }

        /*
         * For VL which is not a multiple of 512, the results from M do not
         * align nicely with the uint64_t for D.  Put the aligned results
         * from M into TMP_M and then copy it into place afterward.
         */
        if (oprsz & 15) {
            int final_shift = (oprsz & 15) * 2;

            /* Final, partial pair of words from N. */
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << final_shift);

            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l | (h << 32);
            }
            /* Final, partial pair of words from M. */
            l = m[2 * i + 0];
            h = m[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            tmp_m.p[i] = l | (h << final_shift);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            /* Aligned case: M's results land directly in the top half. */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l | (h << 32);
            }
        }
    }
}
3233
/* TRN for predicates: interleave the even (DATA == 0) or odd element
 * flags of each pair from VN and VM, with element size 2**esz.
 */
void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t mask;
    int shr, shl;
    intptr_t i;

    /* Select the even or odd flag of each pair, and compute the shifts
     * that put N's flags in the even positions and M's flags in the
     * odd positions of the result.
     */
    shl = 1 << esz;
    shr = 0;
    mask = even_bit_esz_masks[esz];
    if (odd) {
        mask <<= shl;
        shr = shl;
        shl = 0;
    }

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
        uint64_t nn = (n[i] & mask) >> shr;
        uint64_t mm = (m[i] & mask) << shl;
        /* The two contributions occupy disjoint bit positions. */
        d[i] = nn + mm;
    }
}
3259
3260 /* Reverse units of 2**N bits. */
static uint64_t reverse_bits_64(uint64_t x, int n)
{
    int i, sh;

    /* Reverse the bytes, then re-order within ever smaller units
     * until the requested granule size 2**n is reached.
     */
    x = bswap64(x);
    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        uint64_t mask = even_bit_esz_masks[i];
        x = ((x & mask) << sh) | ((x >> sh) & mask);
    }
    return x;
}
3272
/* Reverse units of 2**N bits within a single byte.
 * N == 0 reverses the individual bits; N == 3 is the identity.
 */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int step;

    /* Swap progressively smaller halves, stopping at granule 2**n. */
    for (step = 2; step >= n; step--) {
        int sh = 1 << step;
        x = ((x & mask[step]) << sh) | ((x >> sh) & mask[step]);
    }
    return x;
}
3283
/* REV for predicates: reverse the order of the element flags,
 * in groups of 2**esz bits.
 */
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        /* Shift the significant bits to the top of the word, then
         * reverse the whole word back down.
         */
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        /* Multiple of 16 bytes: reverse and exchange 64-bit words
         * from both ends toward the middle.
         */
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        /* Other sizes: reverse and exchange byte by byte. */
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}
3313
/* PUNPKLO/PUNPKHI: widen the low or high (DATA) half of predicate VN,
 * spreading each flag bit apart by one zero bit.
 */
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* Whole predicate in one word: extract the half and expand. */
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap. */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if (high) {
            high = oprsz >> 1;   /* byte offset of the high half */
        }

        if ((oprsz & 7) == 0) {
            uint32_t *n = vn;
            high >>= 2;          /* convert byte offset to uint32_t index */

            /* Each 32-bit input unit expands to one 64-bit output. */
            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            /* Odd sizes: each input byte expands to one uint16_t. */
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}
3359
/* ZIP1/ZIP2: interleave alternate elements taken from the low
 * (odd_ofs == 0) or high halves of the two source vectors.
 */
#define DO_ZIP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
{                                                                    \
    intptr_t oprsz = simd_oprsz(desc);                               \
    intptr_t odd_ofs = simd_data(desc);                              \
    intptr_t i, oprsz_2 = oprsz / 2;                                 \
    ARMVectorReg tmp_n, tmp_m;                                       \
    /* We produce output faster than we consume input.               \
       Therefore we must be mindful of possible overlap. */          \
    if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
        vn = memcpy(&tmp_n, vn, oprsz);                              \
    }                                                                \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
        vm = memcpy(&tmp_m, vm, oprsz);                              \
    }                                                                \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
            *(TYPE *)(vm + odd_ofs + H(i));                          \
    }                                                                \
    /* With 16-byte elements, an oprsz that is an odd multiple of    \
       16 leaves a trailing granule with no complete pair; clear it. */ \
    if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
        memset(vd + oprsz - 16, 0, 16);                              \
    }                                                                \
}

DO_ZIP(sve_zip_b, uint8_t, H1)
DO_ZIP(sve_zip_h, uint16_t, H1_2)
DO_ZIP(sve_zip_s, uint32_t, H1_4)
DO_ZIP(sve_zip_d, uint64_t, H1_8)
DO_ZIP(sve2_zip_q, Int128, )
3390
/* UZP1/UZP2: concatenate the even-numbered (odd_ofs == 0) or
 * odd-numbered elements of the two source vectors.
 */
#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
{                                                                    \
    intptr_t oprsz = simd_oprsz(desc);                               \
    intptr_t odd_ofs = simd_data(desc);                              \
    intptr_t i, p;                                                   \
    ARMVectorReg tmp_m;                                              \
    /* VN is read ahead of the writes to VD, so only VM needs to be  \
       copied out of the way on overlap. */                          \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
        vm = memcpy(&tmp_m, vm, oprsz);                              \
    }                                                                \
    /* Low half of VD from every other element of VN... */           \
    i = 0, p = odd_ofs;                                              \
    do {                                                             \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                 \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE);                    \
    } while (p < oprsz);                                             \
    p -= oprsz;                                                      \
    /* ...high half of VD from every other element of VM. */         \
    do {                                                             \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                 \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE);                    \
    } while (p < oprsz);                                             \
    tcg_debug_assert(i == oprsz);                                    \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, H1_8)
DO_UZP(sve2_uzp_q, Int128, )
3419
/* TRN1/TRN2: interleave the even-numbered (odd_ofs == 0) or
 * odd-numbered elements of each pair from the two source vectors.
 */
#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
{                                                                    \
    intptr_t oprsz = simd_oprsz(desc);                               \
    intptr_t odd_ofs = simd_data(desc);                              \
    intptr_t i;                                                      \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                  \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                    \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                    \
        *(TYPE *)(vd + H(i + 0)) = ae;                               \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                    \
    }                                                                \
    /* With 16-byte elements, clear a trailing granule that holds    \
       no complete pair. */                                          \
    if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
        memset(vd + oprsz - 16, 0, 16);                              \
    }                                                                \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, H1_8)
DO_TRN(sve2_trn_q, Int128, )

#undef DO_ZIP
#undef DO_UZP
#undef DO_TRN
3446
3447 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3448 {
3449 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3450 uint32_t *d = vd, *n = vn;
3451 uint8_t *pg = vg;
3452
3453 for (i = j = 0; i < opr_sz; i++) {
3454 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3455 d[H4(j)] = n[H4(i)];
3456 j++;
3457 }
3458 }
3459 for (; j < opr_sz; j++) {
3460 d[H4(j)] = 0;
3461 }
3462 }
3463
HELPER(sve_compact_d)3464 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3465 {
3466 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3467 uint64_t *d = vd, *n = vn;
3468 uint8_t *pg = vg;
3469
3470 for (i = j = 0; i < opr_sz; i++) {
3471 if (pg[H1(i)] & 1) {
3472 d[j] = n[i];
3473 j++;
3474 }
3475 }
3476 for (; j < opr_sz; j++) {
3477 d[j] = 0;
3478 }
3479 }
3480
3481 /* Similar to the ARM LastActiveElement pseudocode function, except the
3482 * result is multiplied by the element size. This includes the not found
3483 * indication; e.g. not found for esz=3 is -8.
3484 */
HELPER(sve_last_active_element)3485 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3486 {
3487 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3488 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3489
3490 return last_active_element(vg, words, esz);
3491 }
3492
/* SPLICE: copy the segment of VN from the first through the last
 * active element to the low end of VD, then fill the remainder of
 * VD from the start of VM.
 */
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Find the extent of the active elements within VG. */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            /* Walking backward: the first non-zero word seen holds
             * the last active element; keep updating for the first.
             */
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        /* Convert word offset plus bit position into a byte offset
         * within the vector (one predicate bit per vector byte).
         */
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        /* Length in bytes of the active segment, inclusive. */
        len = last_i - first_i + (1 << esz);
        if (vd == vm) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    /* Fill the rest of the destination from the start of VM. */
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
3529
HELPER(sve_sel_zpzz_b)3530 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3531 void *vg, uint32_t desc)
3532 {
3533 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3534 uint64_t *d = vd, *n = vn, *m = vm;
3535 uint8_t *pg = vg;
3536
3537 for (i = 0; i < opr_sz; i += 1) {
3538 uint64_t nn = n[i], mm = m[i];
3539 uint64_t pp = expand_pred_b(pg[H1(i)]);
3540 d[i] = (nn & pp) | (mm & ~pp);
3541 }
3542 }
3543
HELPER(sve_sel_zpzz_h)3544 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3545 void *vg, uint32_t desc)
3546 {
3547 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3548 uint64_t *d = vd, *n = vn, *m = vm;
3549 uint8_t *pg = vg;
3550
3551 for (i = 0; i < opr_sz; i += 1) {
3552 uint64_t nn = n[i], mm = m[i];
3553 uint64_t pp = expand_pred_h(pg[H1(i)]);
3554 d[i] = (nn & pp) | (mm & ~pp);
3555 }
3556 }
3557
HELPER(sve_sel_zpzz_s)3558 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3559 void *vg, uint32_t desc)
3560 {
3561 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3562 uint64_t *d = vd, *n = vn, *m = vm;
3563 uint8_t *pg = vg;
3564
3565 for (i = 0; i < opr_sz; i += 1) {
3566 uint64_t nn = n[i], mm = m[i];
3567 uint64_t pp = expand_pred_s(pg[H1(i)]);
3568 d[i] = (nn & pp) | (mm & ~pp);
3569 }
3570 }
3571
HELPER(sve_sel_zpzz_d)3572 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3573 void *vg, uint32_t desc)
3574 {
3575 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3576 uint64_t *d = vd, *n = vn, *m = vm;
3577 uint8_t *pg = vg;
3578
3579 for (i = 0; i < opr_sz; i += 1) {
3580 uint64_t nn = n[i], mm = m[i];
3581 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3582 }
3583 }
3584
HELPER(sve_sel_zpzz_q)3585 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3586 void *vg, uint32_t desc)
3587 {
3588 intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3589 Int128 *d = vd, *n = vn, *m = vm;
3590 uint16_t *pg = vg;
3591
3592 for (i = 0; i < opr_sz; i += 1) {
3593 d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3594 }
3595 }
3596
3597 /* Two operand comparison controlled by a predicate.
3598 * ??? It is very tempting to want to be able to expand this inline
3599 * with x86 instructions, e.g.
3600 *
3601 * vcmpeqw zm, zn, %ymm0
3602 * vpmovmskb %ymm0, %eax
3603 * and $0x5555, %eax
3604 * and pg, %eax
3605 *
3606 * or even aarch64, e.g.
3607 *
3608 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3609 * cmeq v0.8h, zn, zm
3610 * and v0.8h, v0.8h, mask
3611 * addv h0, v0.8h
3612 * and v0.8b, pg
3613 *
3614 * However, coming up with an abstraction that allows vector inputs and
3615 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3616 * scalar outputs, is tricky.
3617 */
/* Element-by-element comparison of two vectors, producing a predicate
 * result in VD and NZCV flags per the PredTest pseudofunction.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    /* Walk backward, one 64-bit predicate word at a time. */                \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            /* One result bit per element, at the element's byte index. */   \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
            TYPE nn = *(TYPE *)(vn + H(i));                                  \
            TYPE mm = *(TYPE *)(vm + H(i));                                  \
            out |= nn OP mm;                                                 \
        } while (i & 63);                                                    \
        /* Predication: clear result bits for inactive elements. */          \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}

/* MASK selects the predicate bit that governs each element size. */
#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
#undef DO_CMP_PPZZ
3684
3685 /* Similar, but the second source is "wide". */
/* As DO_CMP_PPZZ, but the second source is "wide": one 64-bit element
 * of VM is compared against each of the narrow elements of VN that it
 * spans (the innermost loop covers one 8-byte chunk).
 */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                          \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
            do {                                                             \
                i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
                TYPE nn = *(TYPE *)(vn + H(i));                              \
                out |= nn OP mm;                                             \
            } while (i & 7);                                                 \
        } while (i & 63);                                                    \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}
3709
/* Wide compares exist only for byte/half/word narrow elements; the
 * wide operand is always a 64-bit element, signed or unsigned to match
 * the comparison.
 */
#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)

DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)

DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
3761
3762 /* Similar, but the second source is immediate. */
/* As DO_CMP_PPZZ, but the second source is an immediate, passed in via
 * the simd_data field of DESC and broadcast to every element.
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                                 \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)           \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    TYPE mm = simd_data(desc);                                               \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
            TYPE nn = *(TYPE *)(vn + H(i));                                  \
            out |= nn OP mm;                                                 \
        } while (i & 63);                                                    \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}
3784
/* Element-size wrappers and instantiations for compare-with-immediate;
 * the element type's signedness selects signed vs unsigned compares.
 */
#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
3849
3850 /* Similar to the ARM LastActive pseudocode function. */
/* Similar to the ARM LastActive pseudocode function: return whether the
 * destination bit selected by the highest set guard bit is true.
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t i;

    /* Scan 64-bit predicate words from the most significant downward. */
    for (i = ((oprsz + 7) & -8) - 8; i >= 0; i -= 8) {
        uint64_t pg = *(uint64_t *)(vg + i);

        if (pg != 0) {
            /* Isolate the highest set guard bit (pow2floor). */
            uint64_t hi = pg;
            hi |= hi >> 1;
            hi |= hi >> 2;
            hi |= hi >> 4;
            hi |= hi >> 8;
            hi |= hi >> 16;
            hi |= hi >> 32;
            hi -= hi >> 1;
            return (*(uint64_t *)(vd + i) & hi) != 0;
        }
    }
    return false;
}
3863
3864 /* Compute a mask into RETB that is true for all G, up to and including
3865 * (if after) or excluding (if !after) the first G & N.
3866 * Return true if BRK found.
3867 */
/* Compute a mask into RETB that is true for all G, up to and including
 * (if AFTER) or excluding (if !AFTER) the first G & N.
 * Return true if a break was found.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t first;

    if (brk) {
        /* A break already occurred in an earlier word: all-false. */
        *retb = 0;
        return true;
    }

    first = g & n;
    if (first == 0) {
        /* No guarded bit of N is set: pass the guard through whole. */
        *retb = g;
        return false;
    }

    /* A break lies somewhere in this word: isolate the lowest G & N bit
     * and turn it into a mask of everything at or below it.
     */
    first &= -first;
    *retb = after ? (first | (first - 1)) : (first - 1);
    return true;
}
3893
3894 /* Compute a zeroing BRK. */
/* Compute a zeroing BRK over the whole predicate: inactive (guard
 * clear) bits of D become zero.
 */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;

    for (intptr_t i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t b, guard = g[i];

        brk = compute_brk(&b, n[i], guard, brk, after);
        d[i] = b & guard;
    }
}
3908
3909 /* Likewise, but also compute flags. */
compute_brks_z(uint64_t * d,uint64_t * n,uint64_t * g,intptr_t oprsz,bool after)3910 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3911 intptr_t oprsz, bool after)
3912 {
3913 uint32_t flags = PREDTEST_INIT;
3914 bool brk = false;
3915 intptr_t i;
3916
3917 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3918 uint64_t this_b, this_d, this_g = g[i];
3919
3920 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3921 d[i] = this_d = this_b & this_g;
3922 flags = iter_predtest_fwd(this_d, this_g, flags);
3923 }
3924 return flags;
3925 }
3926
3927 /* Compute a merging BRK. */
/* Compute a merging BRK: inactive (guard clear) bits of D keep their
 * previous value.
 */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;

    for (intptr_t i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t b, guard = g[i];

        brk = compute_brk(&b, n[i], guard, brk, after);
        d[i] = (b & guard) | (d[i] & ~guard);
    }
}
3941
3942 /* Likewise, but also compute flags. */
compute_brks_m(uint64_t * d,uint64_t * n,uint64_t * g,intptr_t oprsz,bool after)3943 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3944 intptr_t oprsz, bool after)
3945 {
3946 uint32_t flags = PREDTEST_INIT;
3947 bool brk = false;
3948 intptr_t i;
3949
3950 for (i = 0; i < oprsz / 8; ++i) {
3951 uint64_t this_b, this_d = d[i], this_g = g[i];
3952
3953 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3954 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3955 flags = iter_predtest_fwd(this_d, this_g, flags);
3956 }
3957 return flags;
3958 }
3959
/* Zero the entire predicate register and return the NZCV flags that
 * PredTest produces for an all-false predicate (PREDTEST_INIT).
 */
static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
{
    /* It is quicker to zero the whole predicate than loop on OPRSZ.
     * The compiler should turn this into 4 64-bit integer stores.
     */
    memset(d, 0, sizeof(ARMPredicateReg));
    return PREDTEST_INIT;
}
3968
HELPER(sve_brkpa)3969 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3970 uint32_t pred_desc)
3971 {
3972 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3973 if (last_active_pred(vn, vg, oprsz)) {
3974 compute_brk_z(vd, vm, vg, oprsz, true);
3975 } else {
3976 do_zero(vd, oprsz);
3977 }
3978 }
3979
HELPER(sve_brkpas)3980 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3981 uint32_t pred_desc)
3982 {
3983 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3984 if (last_active_pred(vn, vg, oprsz)) {
3985 return compute_brks_z(vd, vm, vg, oprsz, true);
3986 } else {
3987 return do_zero(vd, oprsz);
3988 }
3989 }
3990
HELPER(sve_brkpb)3991 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3992 uint32_t pred_desc)
3993 {
3994 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3995 if (last_active_pred(vn, vg, oprsz)) {
3996 compute_brk_z(vd, vm, vg, oprsz, false);
3997 } else {
3998 do_zero(vd, oprsz);
3999 }
4000 }
4001
HELPER(sve_brkpbs)4002 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4003 uint32_t pred_desc)
4004 {
4005 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4006 if (last_active_pred(vn, vg, oprsz)) {
4007 return compute_brks_z(vd, vm, vg, oprsz, false);
4008 } else {
4009 return do_zero(vd, oprsz);
4010 }
4011 }
4012
/* BRKA (zeroing): break after the first active true element of Pn. */
void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, true);
}
4018
/* As sve_brka_z, but also return the PredTest NZCV flags. */
uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, true);
}
4024
/* BRKB (zeroing): break before the first active true element of Pn. */
void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, false);
}
4030
/* As sve_brkb_z, but also return the PredTest NZCV flags. */
uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, false);
}
4036
/* BRKA (merging): as sve_brka_z, but inactive Pd bits are preserved. */
void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, true);
}
4042
/* As sve_brka_m, but also return the PredTest NZCV flags. */
uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, true);
}
4048
/* BRKB (merging): as sve_brkb_z, but inactive Pd bits are preserved. */
void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, false);
}
4054
/* As sve_brkb_m, but also return the PredTest NZCV flags. */
uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, false);
}
4060
HELPER(sve_brkn)4061 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4062 {
4063 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4064 if (!last_active_pred(vn, vg, oprsz)) {
4065 do_zero(vd, oprsz);
4066 }
4067 }
4068
4069 /* As if PredTest(Ones(PL), D, esz). */
/* As if PredTest(Ones(PL), D, esz): iterate the PredTest accumulation
 * over D with an all-true guard (restricted to ESZ_MASK), handling a
 * partial final word when OPRSZ is not a multiple of 8.
 */
static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
                              uint64_t esz_mask)
{
    intptr_t full = oprsz / 8, rem = oprsz & 7;
    uint32_t flags = PREDTEST_INIT;
    intptr_t i;

    for (i = 0; i < full; i++) {
        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
    }
    if (rem) {
        uint64_t tail = MAKE_64BIT_MASK(0, rem * 8);
        flags = iter_predtest_fwd(d->p[i], esz_mask & tail, flags);
    }
    return flags;
}
4085
HELPER(sve_brkns)4086 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4087 {
4088 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4089 if (last_active_pred(vn, vg, oprsz)) {
4090 return predtest_ones(vd, oprsz, -1);
4091 } else {
4092 return do_zero(vd, oprsz);
4093 }
4094 }
4095
HELPER(sve_cntp)4096 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4097 {
4098 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4099 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4100 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4101 intptr_t i;
4102
4103 for (i = 0; i < words; ++i) {
4104 uint64_t t = n[i] & g[i] & mask;
4105 sum += ctpop64(t);
4106 }
4107 return sum;
4108 }
4109
/* Construct a predicate with the low COUNT bit positions true (filtered
 * by the element-size mask) and everything above false, returning the
 * PredTest NZCV flags for the result.
 */
uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    uint32_t flags;
    intptr_t i;

    /* Begin with a zero predicate register. */
    flags = do_zero(d, oprsz);
    if (count == 0) {
        return flags;
    }

    /* Set all of the requested bits. */
    for (i = 0; i < count / 64; ++i) {
        d->p[i] = esz_mask;
    }
    /* Partial final word, if COUNT is not a multiple of 64. */
    if (count & 63) {
        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
4135
/* As sve_whilel, but the COUNT active elements occupy the HIGH end of
 * the predicate: bits [oprbits - count, oprbits) are set (filtered by
 * the element-size mask) and everything below is cleared.
 */
uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    intptr_t i, invcount, oprbits;
    uint64_t bits;

    if (count == 0) {
        return do_zero(d, oprsz);
    }

    oprbits = oprsz * 8;
    tcg_debug_assert(count <= oprbits);

    bits = esz_mask;
    /* The topmost word may be only partially within the register. */
    if (oprbits & 63) {
        bits &= MAKE_64BIT_MASK(0, oprbits & 63);
    }

    /* Fill, top down, the words lying wholly within the active region. */
    invcount = oprbits - count;
    for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
        d->p[i] = bits;
        bits = esz_mask;
    }

    /* Boundary word: clear the bits below the first active element. */
    d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);

    /* Words wholly below the active region are all-false. */
    while (--i >= 0) {
        d->p[i] = 0;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
4171
4172 /* Recursive reduction on a function;
4173 * C.f. the ARM ARM function ReducePredicated.
4174 *
4175 * While it would be possible to write this without the DATA temporary,
4176 * it is much simpler to process the predicate register this way.
4177 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4178 * little to gain with a more complex non-recursive form.
4179 */
/* DATA is the predicated copy of VN -- inactive elements replaced with
 * IDENT -- padded out to MAXSZ with IDENT, so that the reduction tree
 * always operates on the same element count regardless of predicate.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                     \
    if (n == 1) {                                                     \
        return *data;                                                 \
    } else {                                                          \
        uintptr_t half = n / 2;                                       \
        TYPE lo = NAME##_reduce(data, status, half);                  \
        TYPE hi = NAME##_reduce(data + half, status, half);           \
        return TYPE##_##FUNC(lo, hi, status);                         \
    }                                                                 \
}                                                                     \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
{                                                                     \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
    for (i = 0; i < oprsz; ) {                                        \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
        do {                                                          \
            TYPE nn = *(TYPE *)(vn + H(i));                           \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
        } while (i & 15);                                             \
    }                                                                 \
    for (; i < maxsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)((void *)data + i) = IDENT;                          \
    }                                                                 \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
}
4209
/* FP reductions: the identity element is chosen so that padded or
 * inactive slots cannot affect the result (0 for add, +/-inf for
 * min/max, default NaN for minnum/maxnum).
 */
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)

/* Identity is floatN_default_nan, without the function call. */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)

DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))

#undef DO_REDUCE
4232
4233 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4234 void *status, uint32_t desc)
4235 {
4236 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4237 float16 result = nn;
4238
4239 do {
4240 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4241 do {
4242 if (pg & 1) {
4243 float16 mm = *(float16 *)(vm + H1_2(i));
4244 result = float16_add(result, mm, status);
4245 }
4246 i += sizeof(float16), pg >>= sizeof(float16);
4247 } while (i & 15);
4248 } while (i < opr_sz);
4249
4250 return result;
4251 }
4252
HELPER(sve_fadda_s)4253 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4254 void *status, uint32_t desc)
4255 {
4256 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4257 float32 result = nn;
4258
4259 do {
4260 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4261 do {
4262 if (pg & 1) {
4263 float32 mm = *(float32 *)(vm + H1_2(i));
4264 result = float32_add(result, mm, status);
4265 }
4266 i += sizeof(float32), pg >>= sizeof(float32);
4267 } while (i & 15);
4268 } while (i < opr_sz);
4269
4270 return result;
4271 }
4272
HELPER(sve_fadda_d)4273 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4274 void *status, uint32_t desc)
4275 {
4276 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4277 uint64_t *m = vm;
4278 uint8_t *pg = vg;
4279
4280 for (i = 0; i < opr_sz; i++) {
4281 if (pg[H1(i)] & 1) {
4282 nn = float64_add(nn, m[i], status);
4283 }
4284 }
4285
4286 return nn;
4287 }
4288
4289 /* Fully general three-operand expander, controlled by a predicate,
4290 * With the extra float_status parameter.
4291 */
/* Walk backward one 64-bit predicate word at a time; elements whose
 * predicate bit (indexed by the element's first byte) is clear are
 * skipped entirely, leaving VD unchanged (merging semantics).
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,             \
                  void *status, uint32_t desc)                        \
{                                                                     \
    intptr_t i = simd_oprsz(desc);                                    \
    uint64_t *g = vg;                                                 \
    do {                                                              \
        uint64_t pg = g[(i - 1) >> 6];                                \
        do {                                                          \
            i -= sizeof(TYPE);                                        \
            if (likely((pg >> (i & 63)) & 1)) {                       \
                TYPE nn = *(TYPE *)(vn + H(i));                       \
                TYPE mm = *(TYPE *)(vm + H(i));                       \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);            \
            }                                                         \
        } while (i & 63);                                             \
    } while (i != 0);                                                 \
}
4310
/* Predicated FP arithmetic, one instantiation per element size. */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)

DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)

DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4342
4343 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4344 {
4345 return float16_abs(float16_sub(a, b, s));
4346 }
4347
abd_s(float32 a,float32 b,float_status * s)4348 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4349 {
4350 return float32_abs(float32_sub(a, b, s));
4351 }
4352
abd_d(float64 a,float64 b,float_status * s)4353 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4354 {
4355 return float64_abs(float64_sub(a, b, s));
4356 }
4357
DO_ZPZZ_FP(sve_fabd_h,uint16_t,H1_2,abd_h)4358 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4359 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4360 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4361
4362 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4363 {
4364 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4365 return float64_scalbn(a, b_int, s);
4366 }
4367
/* FSCALE (scale by signed integer exponent) and FMULX. */
DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)

DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)

#undef DO_ZPZZ_FP
4377
4378 /* Three-operand expander, with one scalar operand, controlled by
4379 * a predicate, with the extra float_status parameter.
4380 */
/* As DO_ZPZZ_FP, but the second operand is the same scalar for every
 * element; inactive elements leave VD unchanged (merging semantics).
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,      \
                  void *status, uint32_t desc)                        \
{                                                                     \
    intptr_t i = simd_oprsz(desc);                                    \
    uint64_t *g = vg;                                                 \
    TYPE mm = scalar;                                                 \
    do {                                                              \
        uint64_t pg = g[(i - 1) >> 6];                                \
        do {                                                          \
            i -= sizeof(TYPE);                                        \
            if (likely((pg >> (i & 63)) & 1)) {                       \
                TYPE nn = *(TYPE *)(vn + H(i));                       \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);            \
            }                                                         \
        } while (i & 63);                                             \
    } while (i != 0);                                                 \
}
4399
/* Predicated FP arithmetic with a scalar second operand. */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4411
/* Reversed subtraction for FSUBR: computes b - a. */
static inline float16 subr_h(float16 a, float16 b, float_status *s)
{
    return float16_sub(b, a, s);
}
4416
/* Reversed subtraction for FSUBR: computes b - a. */
static inline float32 subr_s(float32 a, float32 b, float_status *s)
{
    return float32_sub(b, a, s);
}
4421
/* Reversed subtraction for FSUBR: computes b - a. */
static inline float64 subr_d(float64 a, float64 b, float_status *s)
{
    return float64_sub(b, a, s);
}
4426
/* Reverse-subtract, min/max and minnum/maxnum with scalar operand. */
DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)

DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)

DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4446
4447 /* Fully general two-operand expander, controlled by a predicate,
4448 * With the extra float_status parameter.
4449 */
/* Unary predicated FP operation; inactive elements leave VD unchanged
 * (merging semantics), walking backward one predicate word at a time.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPE);                                                \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPE nn = *(TYPE *)(vn + H(i));                               \
                *(TYPE *)(vd + H(i)) = OP(nn, status);                        \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}
4466
4467 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4468 * FZ16. When converting from fp16, this affects flushing input denormals;
4469 * when converting to fp16, this affects flushing output denormals.
4470 */
/* f16 -> f32, ignoring FZ16 per the comment above: input-denormal
 * flushing is disabled around the conversion and then restored.
 */
static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float32 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float32(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}
4481
/* f16 -> f64, likewise with input-denormal flushing suppressed. */
static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float64 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float64(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}
4492
/* f32 -> f16: converting TO fp16 affects output denormals instead, so
 * flush-to-zero (not flush-inputs) is suppressed around the conversion.
 */
static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float32_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
4503
/* f64 -> f16, likewise with output flush-to-zero suppressed. */
static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float64_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
4514
/* FCVTZS: a NaN input raises Invalid and converts to zero. */
static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
{
    if (!float16_is_any_nan(f)) {
        return float16_to_int16_round_to_zero(f, s);
    }
    float_raise(float_flag_invalid, s);
    return 0;
}
4523
/* FCVTZS: a NaN input raises Invalid and converts to zero. */
static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
{
    if (!float16_is_any_nan(f)) {
        return float16_to_int64_round_to_zero(f, s);
    }
    float_raise(float_flag_invalid, s);
    return 0;
}
4532
/* FCVTZS: a NaN input raises Invalid and converts to zero. */
static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
{
    if (!float32_is_any_nan(f)) {
        return float32_to_int64_round_to_zero(f, s);
    }
    float_raise(float_flag_invalid, s);
    return 0;
}
4541
/* FCVTZS: a NaN input raises Invalid and converts to zero. */
static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
{
    if (!float64_is_any_nan(f)) {
        return float64_to_int64_round_to_zero(f, s);
    }
    float_raise(float_flag_invalid, s);
    return 0;
}
4550
/* FCVTZU: a NaN input raises Invalid and converts to zero. */
static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
{
    if (!float16_is_any_nan(f)) {
        return float16_to_uint16_round_to_zero(f, s);
    }
    float_raise(float_flag_invalid, s);
    return 0;
}
4559
/* FCVTZU: a NaN input raises Invalid and converts to zero. */
static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
{
    if (!float16_is_any_nan(f)) {
        return float16_to_uint64_round_to_zero(f, s);
    }
    float_raise(float_flag_invalid, s);
    return 0;
}
4568
/* FCVTZU: a NaN input raises Invalid and converts to zero. */
static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
{
    if (!float32_is_any_nan(f)) {
        return float32_to_uint64_round_to_zero(f, s);
    }
    float_raise(float_flag_invalid, s);
    return 0;
}
4577
/* FCVTZU: a NaN input raises Invalid and converts to zero. */
static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
{
    if (!float64_is_any_nan(f)) {
        return float64_to_uint64_round_to_zero(f, s);
    }
    float_raise(float_flag_invalid, s);
    return 0;
}
4586
/*
 * Expansions of the predicated unary-FP macro: each line instantiates
 * one helper.  TYPE/H give the element width and host-endian swizzle;
 * the last argument is the per-element operation.
 */

/* FCVT: float-to-float precision conversions (including bfloat16). */
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)

/* FCVTZS: float to signed integer, round toward zero. */
DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)

/* FCVTZU: float to unsigned integer, round toward zero. */
DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)

/* FRINT[NPMZAI]: round to integral, mode taken from float_status. */
DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)

/* FRINTX: round to integral, raising Inexact. */
DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)

/* FRECPX: reciprocal exponent estimate. */
DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)

/* FSQRT. */
DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)

/* SCVTF: signed integer to float. */
DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)

/* UCVTF: unsigned integer to float. */
DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4642
/*
 * FLOGB core for float16: return the unbiased exponent of @a as a
 * signed integer.  Zero and NaN raise Invalid and return INT16_MIN;
 * infinity returns INT16_MAX.  Denormals are honoured unless the
 * status flushes inputs to zero, in which case they raise
 * InputDenormal and are then treated as zero.
 */
static int16_t do_float16_logb_as_int(float16 a, float_status *s)
{
    /* Extract frac to the top of the uint32_t. */
    uint32_t frac = (uint32_t)a << (16 + 6);
    int16_t exp = extract32(a, 10, 5);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: bias - fractional_zeros */
                return -15 - clz32(frac);
            }
            /* flush to zero */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0x1f)) {
        if (frac == 0) {
            return INT16_MAX; /* infinity */
        }
    } else {
        /* normal: exp - bias */
        return exp - 15;
    }
    /* nan or zero */
    float_raise(float_flag_invalid, s);
    return INT16_MIN;
}
4670
/*
 * FLOGB core for float32; same contract as the float16 version,
 * with single-precision bias (127) and saturation to INT32_MIN/MAX.
 */
static int32_t do_float32_logb_as_int(float32 a, float_status *s)
{
    /* Extract frac to the top of the uint32_t. */
    uint32_t frac = a << 9;
    int32_t exp = extract32(a, 23, 8);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: bias - fractional_zeros */
                return -127 - clz32(frac);
            }
            /* flush to zero */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0xff)) {
        if (frac == 0) {
            return INT32_MAX; /* infinity */
        }
    } else {
        /* normal: exp - bias */
        return exp - 127;
    }
    /* nan or zero */
    float_raise(float_flag_invalid, s);
    return INT32_MIN;
}
4698
/*
 * FLOGB core for float64; same contract as the float16 version,
 * with double-precision bias (1023) and saturation to INT64_MIN/MAX.
 */
static int64_t do_float64_logb_as_int(float64 a, float_status *s)
{
    /* Extract frac to the top of the uint64_t. */
    uint64_t frac = a << 12;
    int64_t exp = extract64(a, 52, 11);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: bias - fractional_zeros */
                return -1023 - clz64(frac);
            }
            /* flush to zero */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0x7ff)) {
        if (frac == 0) {
            return INT64_MAX; /* infinity */
        }
    } else {
        /* normal: exp - bias */
        return exp - 1023;
    }
    /* nan or zero */
    float_raise(float_flag_invalid, s);
    return INT64_MIN;
}
4726
/* FLOGB: per-element exponent extraction, via the helpers above. */
DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)

#undef DO_ZPZ_FP
4732
/*
 * Predicated fused multiply-add core for 16-bit elements:
 * Zd = (Zn ^ neg1) * Zm + (Za ^ neg3).
 * @neg1/@neg3 are XOR masks (0 or 0x8000) that flip the sign bit of
 * the first and third operands, so the one core implements
 * FMLA/FMLS/FNMLA/FNMLS.  Iterates from the top of the vector down,
 * one 64-bit predicate word at a time; inactive lanes are untouched.
 */
static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint16_t neg1, uint16_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 2;
            if (likely((pg >> (i & 63)) & 1)) {
                float16 e1, e2, e3, r;

                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
                e2 = *(uint16_t *)(vm + H1_2(i));
                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
                r = float16_muladd(e1, e2, e3, 0, status);
                *(uint16_t *)(vd + H1_2(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4756
/* FMLA: no sign flips. */
void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: flip the sign of Zn. */
void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
}

/* FNMLA: flip the signs of both Zn and Za. */
void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
}

/* FNMLS: flip the sign of Za. */
void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
}
4780
/*
 * Predicated fused multiply-add core for 32-bit elements; see
 * do_fmla_zpzzz_h.  The XOR masks are 0 or 0x80000000.
 */
static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint32_t neg1, uint32_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 4;
            if (likely((pg >> (i & 63)) & 1)) {
                float32 e1, e2, e3, r;

                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
                e2 = *(uint32_t *)(vm + H1_4(i));
                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
                r = float32_muladd(e1, e2, e3, 0, status);
                *(uint32_t *)(vd + H1_4(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4804
/* FMLA: no sign flips. */
void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: flip the sign of Zn. */
void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
}

/* FNMLA: flip the signs of both Zn and Za. */
void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
}

/* FNMLS: flip the sign of Za. */
void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
}
4828
/*
 * Predicated fused multiply-add core for 64-bit elements; see
 * do_fmla_zpzzz_h.  The XOR masks are 0 or INT64_MIN (the sign bit).
 * 64-bit elements need no host-endian swizzle, hence the plain
 * offsets.
 */
static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint64_t neg1, uint64_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 8;
            if (likely((pg >> (i & 63)) & 1)) {
                float64 e1, e2, e3, r;

                e1 = *(uint64_t *)(vn + i) ^ neg1;
                e2 = *(uint64_t *)(vm + i);
                e3 = *(uint64_t *)(va + i) ^ neg3;
                r = float64_muladd(e1, e2, e3, 0, status);
                *(uint64_t *)(vd + i) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4852
/* FMLA: no sign flips. */
void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: flip the sign of Zn. */
void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
}

/* FNMLA: flip the signs of both Zn and Za. */
void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
}

/* FNMLS: flip the sign of Za. */
void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
}
4876
4877 /* Two operand floating-point comparison controlled by a predicate.
4878 * Unlike the integer version, we are not allowed to optimistically
4879 * compare operands, since the comparison may have side effects wrt
4880 * the FPSR.
4881 */
/*
 * Expansion: walk the vector from the top down, 64 predicate bits at
 * a time, shifting one result bit per element byte into OUT.  Inactive
 * elements contribute 0.  OP evaluates one element pair to a boolean.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}
4901
/* Instantiate NAME for each of the three element sizes. */
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)

/*
 * Per-element compare predicates.  GE/GT/LE/LT and the absolute
 * compares use the signalling compare (NaN raises Invalid); EQ/NE/UO
 * use the quiet compare.  GE/GT are phrased as reversed LE/LT so
 * that an unordered result (positive relation) compares false.
 */
#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
#define DO_FCMUO(TYPE, X, Y, ST)  \
    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
#define DO_FACGE(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
#define DO_FACGT(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4926
/* FCMxx / FACxx (vector vs vector), all element sizes. */
DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
4940
4941 /* One operand floating-point comparison against zero, controlled
4942 * by a predicate.
4943 */
/*
 * Same expansion shape as DO_FPCMP_PPZZ, but the second operand is
 * the constant zero of the element type.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg,            \
                  void *status, uint32_t desc)                        \
{                                                                     \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                  \
    uint64_t *d = vd, *g = vg;                                        \
    do {                                                              \
        uint64_t out = 0, pg = g[j];                                  \
        do {                                                          \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                  \
            if ((pg >> (i & 63)) & 1) {                               \
                TYPE nn = *(TYPE *)(vn + H(i));                       \
                out |= OP(TYPE, nn, 0, status);                       \
            }                                                         \
        } while (i & 63);                                             \
        d[j--] = out;                                                 \
    } while (i > 0);                                                  \
}
4962
/* Instantiate NAME for each of the three element sizes. */
#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)

/* FCMxx (vector vs zero), all element sizes. */
DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4981
4982 /* FP Trig Multiply-Add. */
4983
/*
 * FTMAD, 16-bit: d[i] = n[i] * |m[i]| + coeff[x'], where x' is the
 * immediate (simd_data) plus 8 when m[i] is negative.  The two banks
 * of 8 constants are the trig series coefficients from the Arm
 * FPTrigMAdd pseudocode (presumably sin in the first bank, cos in the
 * second — verify against the Arm ARM); unused slots are zero.
 * Unpredicated.
 */
void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float16 coeff[16] = {
        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
    intptr_t x = simd_data(desc);
    float16 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float16 mm = m[i];
        intptr_t xx = x;
        if (float16_is_neg(mm)) {
            /* Negative m selects the second coefficient bank. */
            mm = float16_abs(mm);
            xx += 8;
        }
        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5003
/* FTMAD, 32-bit: see sve_ftmad_h for the coefficient-bank scheme. */
void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float32 coeff[16] = {
        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
    intptr_t x = simd_data(desc);
    float32 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float32 mm = m[i];
        intptr_t xx = x;
        if (float32_is_neg(mm)) {
            /* Negative m selects the second coefficient bank. */
            mm = float32_abs(mm);
            xx += 8;
        }
        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5025
/* FTMAD, 64-bit: see sve_ftmad_h for the coefficient-bank scheme. */
void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float64 coeff[16] = {
        0x3ff0000000000000ull, 0xbfc5555555555543ull,
        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
        0x3de5d8408868552full, 0x0000000000000000ull,
        0x3ff0000000000000ull, 0xbfe0000000000000ull,
        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
    intptr_t x = simd_data(desc);
    float64 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float64 mm = m[i];
        intptr_t xx = x;
        if (float64_is_neg(mm)) {
            /* Negative m selects the second coefficient bank. */
            mm = float64_abs(mm);
            xx += 8;
        }
        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5051
5052 /*
5053 * FP Complex Add
5054 */
5055
/*
 * FCADD, 16-bit.  Elements are (real, imag) pairs; simd_data holds the
 * rotation select: it is the sign XORed into the imaginary-lane addend
 * (m.real), and the opposite sign goes into the real-lane addend
 * (m.imag).  So:  d.real = n.real +/- m.imag,  d.imag = n.imag -/+ m.real.
 * Each half of the pair is predicated independently.
 */
void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float16 neg_imag = float16_set_sign(0, simd_data(desc));
    float16 neg_real = float16_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            e0 = *(float16 *)(vn + H1_2(i));
            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float16 *)(vn + H1_2(j));
            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
5087
HELPER(sve_fcadd_s)5088 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5089 void *vs, uint32_t desc)
5090 {
5091 intptr_t j, i = simd_oprsz(desc);
5092 uint64_t *g = vg;
5093 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5094 float32 neg_real = float32_chs(neg_imag);
5095
5096 do {
5097 uint64_t pg = g[(i - 1) >> 6];
5098 do {
5099 float32 e0, e1, e2, e3;
5100
5101 /* I holds the real index; J holds the imag index. */
5102 j = i - sizeof(float32);
5103 i -= 2 * sizeof(float32);
5104
5105 e0 = *(float32 *)(vn + H1_2(i));
5106 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5107 e2 = *(float32 *)(vn + H1_2(j));
5108 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5109
5110 if (likely((pg >> (i & 63)) & 1)) {
5111 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5112 }
5113 if (likely((pg >> (j & 63)) & 1)) {
5114 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5115 }
5116 } while (i & 63);
5117 } while (i != 0);
5118 }
5119
HELPER(sve_fcadd_d)5120 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5121 void *vs, uint32_t desc)
5122 {
5123 intptr_t j, i = simd_oprsz(desc);
5124 uint64_t *g = vg;
5125 float64 neg_imag = float64_set_sign(0, simd_data(desc));
5126 float64 neg_real = float64_chs(neg_imag);
5127
5128 do {
5129 uint64_t pg = g[(i - 1) >> 6];
5130 do {
5131 float64 e0, e1, e2, e3;
5132
5133 /* I holds the real index; J holds the imag index. */
5134 j = i - sizeof(float64);
5135 i -= 2 * sizeof(float64);
5136
5137 e0 = *(float64 *)(vn + H1_2(i));
5138 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5139 e2 = *(float64 *)(vn + H1_2(j));
5140 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5141
5142 if (likely((pg >> (i & 63)) & 1)) {
5143 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5144 }
5145 if (likely((pg >> (j & 63)) & 1)) {
5146 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5147 }
5148 } while (i & 63);
5149 } while (i != 0);
5150 }
5151
5152 /*
5153 * FP Complex Multiply
5154 */
5155
/*
 * FCMLA, 16-bit.  simd_data holds the 2-bit rotation: bit 0 (flip)
 * selects whether the multiplicand comes from n's imaginary or real
 * lane, and the neg_real/neg_imag sign masks encode the 90-degree
 * multiples.  Each half of the (real, imag) pair accumulates into va
 * and is predicated independently.
 */
void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float16 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float16_set_sign(0, (rot & 2) != 0);
    neg_real = float16_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            nr = *(float16 *)(vn + H1_2(i));
            ni = *(float16 *)(vn + H1_2(j));
            mr = *(float16 *)(vm + H1_2(i));
            mi = *(float16 *)(vm + H1_2(j));

            /* d.real += e2 * e1; d.imag += e4 * e3. */
            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(i));
                d = float16_muladd(e2, e1, d, 0, status);
                *(float16 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(j));
                d = float16_muladd(e4, e3, d, 0, status);
                *(float16 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
5200
HELPER(sve_fcmla_zpzzz_s)5201 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5202 void *vg, void *status, uint32_t desc)
5203 {
5204 intptr_t j, i = simd_oprsz(desc);
5205 unsigned rot = simd_data(desc);
5206 bool flip = rot & 1;
5207 float32 neg_imag, neg_real;
5208 uint64_t *g = vg;
5209
5210 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5211 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5212
5213 do {
5214 uint64_t pg = g[(i - 1) >> 6];
5215 do {
5216 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5217
5218 /* I holds the real index; J holds the imag index. */
5219 j = i - sizeof(float32);
5220 i -= 2 * sizeof(float32);
5221
5222 nr = *(float32 *)(vn + H1_2(i));
5223 ni = *(float32 *)(vn + H1_2(j));
5224 mr = *(float32 *)(vm + H1_2(i));
5225 mi = *(float32 *)(vm + H1_2(j));
5226
5227 e2 = (flip ? ni : nr);
5228 e1 = (flip ? mi : mr) ^ neg_real;
5229 e4 = e2;
5230 e3 = (flip ? mr : mi) ^ neg_imag;
5231
5232 if (likely((pg >> (i & 63)) & 1)) {
5233 d = *(float32 *)(va + H1_2(i));
5234 d = float32_muladd(e2, e1, d, 0, status);
5235 *(float32 *)(vd + H1_2(i)) = d;
5236 }
5237 if (likely((pg >> (j & 63)) & 1)) {
5238 d = *(float32 *)(va + H1_2(j));
5239 d = float32_muladd(e4, e3, d, 0, status);
5240 *(float32 *)(vd + H1_2(j)) = d;
5241 }
5242 } while (i & 63);
5243 } while (i != 0);
5244 }
5245
HELPER(sve_fcmla_zpzzz_d)5246 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5247 void *vg, void *status, uint32_t desc)
5248 {
5249 intptr_t j, i = simd_oprsz(desc);
5250 unsigned rot = simd_data(desc);
5251 bool flip = rot & 1;
5252 float64 neg_imag, neg_real;
5253 uint64_t *g = vg;
5254
5255 neg_imag = float64_set_sign(0, (rot & 2) != 0);
5256 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5257
5258 do {
5259 uint64_t pg = g[(i - 1) >> 6];
5260 do {
5261 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5262
5263 /* I holds the real index; J holds the imag index. */
5264 j = i - sizeof(float64);
5265 i -= 2 * sizeof(float64);
5266
5267 nr = *(float64 *)(vn + H1_2(i));
5268 ni = *(float64 *)(vn + H1_2(j));
5269 mr = *(float64 *)(vm + H1_2(i));
5270 mi = *(float64 *)(vm + H1_2(j));
5271
5272 e2 = (flip ? ni : nr);
5273 e1 = (flip ? mi : mr) ^ neg_real;
5274 e4 = e2;
5275 e3 = (flip ? mr : mi) ^ neg_imag;
5276
5277 if (likely((pg >> (i & 63)) & 1)) {
5278 d = *(float64 *)(va + H1_2(i));
5279 d = float64_muladd(e2, e1, d, 0, status);
5280 *(float64 *)(vd + H1_2(i)) = d;
5281 }
5282 if (likely((pg >> (j & 63)) & 1)) {
5283 d = *(float64 *)(va + H1_2(j));
5284 d = float64_muladd(e4, e3, d, 0, status);
5285 *(float64 *)(vd + H1_2(j)) = d;
5286 }
5287 } while (i & 63);
5288 } while (i != 0);
5289 }
5290
5291 /*
5292 * Load contiguous data, protected by a governing predicate.
5293 */
5294
5295 /*
5296 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5297 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
5298 * element >= @reg_off, or @reg_max if there were no active elements at all.
5299 */
static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
                                 intptr_t reg_max, int esz)
{
    /* Mask of predicate bits valid for this element size. */
    uint64_t pg_mask = pred_esz_masks[esz];
    uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);

    /* In normal usage, the first element is active. */
    if (likely(pg & 1)) {
        return reg_off;
    }

    if (pg == 0) {
        /* Nothing left in this word: scan whole words from the next one. */
        reg_off &= -64;
        do {
            reg_off += 64;
            if (unlikely(reg_off >= reg_max)) {
                /* The entire predicate was false. */
                return reg_max;
            }
            pg = vg[reg_off >> 6] & pg_mask;
        } while (pg == 0);
    }
    /* Lowest set bit of pg is the next active element. */
    reg_off += ctz64(pg);

    /* We should never see an out of range predicate bit set. */
    tcg_debug_assert(reg_off < reg_max);
    return reg_off;
}
5328
5329 /*
5330 * Resolve the guest virtual address to info->host and info->flags.
5331 * If @nofault, return false if the page is invalid, otherwise
5332 * exit via page fault exception.
5333 */
5334
/*
 * Probe the page containing @addr + @mem_off for @access_type and fill
 * in @info (host pointer, TLB flags, attrs, MTE-tagged flag).  Returns
 * false only when @nofault and the page is invalid; otherwise a bad
 * page raises the fault via the probe with @retaddr for unwinding.
 * On success, info->host is adjusted so that indexing by the original
 * mem_off (relative to @addr) is correct.
 */
bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
                    target_ulong addr, int mem_off, MMUAccessType access_type,
                    int mmu_idx, uintptr_t retaddr)
{
    int flags;

    addr += mem_off;

    /*
     * User-only currently always issues with TBI. See the comment
     * above useronly_clean_ptr. Usually we clean this top byte away
     * during translation, but we can't do that for e.g. vector + imm
     * addressing modes.
     *
     * We currently always enable TBI for user-only, and do not provide
     * a way to turn it off. So clean the pointer unconditionally here,
     * rather than look it up here, or pass it down from above.
     */
    addr = useronly_clean_ptr(addr);

#ifdef CONFIG_USER_ONLY
    flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
                               &info->host, retaddr);
#else
    CPUTLBEntryFull *full;
    flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
                              &info->host, &full, retaddr);
#endif
    info->flags = flags;

    if (flags & TLB_INVALID_MASK) {
        /* With !nofault the probe above would have raised the fault. */
        g_assert(nofault);
        return false;
    }

#ifdef CONFIG_USER_ONLY
    memset(&info->attrs, 0, sizeof(info->attrs));
    /* Require both ANON and MTE; see allocation_tag_mem(). */
    info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
#else
    info->attrs = full->attrs;
    /* 0xf0 is the attribute used for MTE tagged memory — TODO confirm
       against the producer of pte_attrs. */
    info->tagged = full->extra.arm.pte_attrs == 0xf0;
#endif

    /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
    info->host -= mem_off;
    return true;
}
5383
5384 /*
5385 * Find first active element on each page, and a loose bound for the
5386 * final element on each page. Identify any single element that spans
5387 * the page boundary. Return true if there are any active elements.
5388 */
bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
                            intptr_t reg_max, int esz, int msize)
{
    const int esize = 1 << esz;
    const uint64_t pg_mask = pred_esz_masks[esz];
    intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
    intptr_t mem_off_last, mem_off_split;
    intptr_t page_split, elt_split;
    intptr_t i;

    /* Set all of the element indices to -1, and the TLB data to 0. */
    memset(info, -1, offsetof(SVEContLdSt, page));
    memset(info->page, 0, sizeof(info->page));

    /* Gross scan over the entire predicate to find bounds. */
    i = 0;
    do {
        uint64_t pg = vg[i] & pg_mask;
        if (pg) {
            /* Highest and lowest set bits bound the active elements. */
            reg_off_last = i * 64 + 63 - clz64(pg);
            if (reg_off_first < 0) {
                reg_off_first = i * 64 + ctz64(pg);
            }
        }
    } while (++i * 64 < reg_max);

    if (unlikely(reg_off_first < 0)) {
        /* No active elements, no pages touched. */
        return false;
    }
    tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);

    info->reg_off_first[0] = reg_off_first;
    info->mem_off_first[0] = (reg_off_first >> esz) * msize;
    mem_off_last = (reg_off_last >> esz) * msize;

    /* Bytes remaining on the first page starting from addr. */
    page_split = -(addr | TARGET_PAGE_MASK);
    if (likely(mem_off_last + msize <= page_split)) {
        /* The entire operation fits within a single page. */
        info->reg_off_last[0] = reg_off_last;
        return true;
    }

    info->page_split = page_split;
    elt_split = page_split / msize;
    reg_off_split = elt_split << esz;
    mem_off_split = elt_split * msize;

    /*
     * This is the last full element on the first page, but it is not
     * necessarily active.  If there is no full element, i.e. the first
     * active element is the one that's split, this value remains -1.
     * It is useful as iteration bounds.
     */
    if (elt_split != 0) {
        info->reg_off_last[0] = reg_off_split - esize;
    }

    /* Determine if an unaligned element spans the pages. */
    if (page_split % msize != 0) {
        /* It is helpful to know if the split element is active. */
        if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
            info->reg_off_split = reg_off_split;
            info->mem_off_split = mem_off_split;

            if (reg_off_split == reg_off_last) {
                /* The page crossing element is last. */
                return true;
            }
        }
        /* Advance past the split element to the second page proper. */
        reg_off_split += esize;
        mem_off_split += msize;
    }

    /*
     * We do want the first active element on the second page, because
     * this may affect the address reported in an exception.
     */
    reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
    tcg_debug_assert(reg_off_split <= reg_off_last);
    info->reg_off_first[1] = reg_off_split;
    info->mem_off_first[1] = (reg_off_split >> esz) * msize;
    info->reg_off_last[1] = reg_off_last;
    return true;
}
5474
5475 /*
5476 * Resolve the guest virtual addresses to info->page[].
5477 * Control the generation of page faults with @fault. Return false if
5478 * there is no work to do, which can only happen with @fault == FAULT_NO.
5479 */
bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
                         CPUARMState *env, target_ulong addr,
                         MMUAccessType access_type, uintptr_t retaddr)
{
    int mmu_idx = cpu_mmu_index(env, false);
    int mem_off = info->mem_off_first[0];
    bool nofault = fault == FAULT_NO;
    bool have_work = true;

    /* Probe the page holding the first active element. */
    if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
                        access_type, mmu_idx, retaddr)) {
        /* No work to be done. */
        return false;
    }

    if (likely(info->page_split < 0)) {
        /* The entire operation was on the one page. */
        return true;
    }

    /*
     * If the second page is invalid, then we want the fault address to be
     * the first byte on that page which is accessed.
     */
    if (info->mem_off_split >= 0) {
        /*
         * There is an element split across the pages.  The fault address
         * should be the first byte of the second page.
         */
        mem_off = info->page_split;
        /*
         * If the split element is also the first active element
         * of the vector, then:  For first-fault we should continue
         * to generate faults for the second page.  For no-fault,
         * we have work only if the second page is valid.
         */
        if (info->mem_off_first[0] < info->mem_off_split) {
            /*
             * NOTE(review): this assigns the enum value FAULT_FIRST to
             * a bool (i.e. nofault = true); verify that a nofault probe
             * is really intended here, given the comment above.
             */
            nofault = FAULT_FIRST;
            have_work = false;
        }
    } else {
        /*
         * There is no element split across the pages.  The fault address
         * should be the first active element on the second page.
         */
        mem_off = info->mem_off_first[1];
        /*
         * There must have been one active element on the first page,
         * so we're out of first-fault territory.
         */
        nofault = fault != FAULT_ALL;
    }

    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
                                access_type, mmu_idx, retaddr);
    return have_work;
}
5537
5538 #ifndef CONFIG_USER_ONLY
/*
 * Check watchpoints for all active elements of the contiguous operation
 * described by @info.  @esize/@msize are the register-element and
 * memory-element sizes in bytes; @wp_access is BP_MEM_READ or
 * BP_MEM_WRITE.  May raise a debug exception via cpu_check_watchpoint,
 * unwinding through @retaddr.
 */
void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
                               uint64_t *vg, target_ulong addr,
                               int esize, int msize, int wp_access,
                               uintptr_t retaddr)
{
    intptr_t mem_off, reg_off, reg_last;
    int flags0 = info->page[0].flags;
    int flags1 = info->page[1].flags;

    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
        return;
    }

    /* Indicate that watchpoints are handled. */
    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;

    if (flags0 & TLB_WATCHPOINT) {
        /* Walk the active elements that lie wholly within the first page. */
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_last[0];

        while (reg_off <= reg_last) {
            /* One predicate word covers 64 single-byte element positions. */
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[0].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        }
    }

    /* An element split across the page boundary is checked separately. */
    mem_off = info->mem_off_split;
    if (mem_off >= 0) {
        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
                             info->page[0].attrs, wp_access, retaddr);
    }

    mem_off = info->mem_off_first[1];
    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
        /* Walk the active elements on the second page. */
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[1].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5600 #endif
5601
/*
 * Perform MTE tag checks for all active elements of the contiguous
 * operation described by @info, for pages whose MemAttr == Tagged.
 * @esize/@msize are the register-element and memory-element sizes in
 * bytes; @mtedesc is the packed MTE descriptor.  A tag mismatch raises
 * via mte_check, unwinding through @ra.
 *
 * Fix: each element must be checked at its own address, addr + mem_off.
 * The original passed the bare @addr, checking the base address for
 * every element while mem_off was advanced but never used.
 */
void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
                             uint64_t *vg, target_ulong addr, int esize,
                             int msize, uint32_t mtedesc, uintptr_t ra)
{
    intptr_t mem_off, reg_off, reg_last;

    /* Process the page only if MemAttr == Tagged. */
    if (info->page[0].tagged) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        /* Stop at the split element, if any, else the last on the page. */
        reg_last = info->reg_off_split;
        if (reg_last < 0) {
            reg_last = info->reg_off_last[0];
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    /* Check the tag of this element's own address. */
                    mte_check(env, mtedesc, addr + mem_off, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        } while (reg_off <= reg_last);
    }

    mem_off = info->mem_off_first[1];
    if (mem_off >= 0 && info->page[1].tagged) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    mte_check(env, mtedesc, addr + mem_off, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5646
5647 /*
5648 * Common helper for all contiguous 1,2,3,4-register predicated stores.
5649 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    /* Destination register number and vector length in bytes. */
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int flags, i;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no load occurs. */
        for (i = 0; i < N; ++i) {
            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
        }
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_READ, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end.
         */
        ARMVectorReg scratch[4] = { };

        /* Find the last active element anywhere in the operation. */
        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &scratch[i], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        /* All loads succeeded; commit the scratch data to the registers. */
        for (i = 0; i < N; ++i) {
            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
        }
        return;
    }

    /* The entire operation is in RAM, on valid pages. */

    /* Inactive elements load zero; clear the registers up front. */
    for (i = 0; i < N; ++i) {
        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
    }

    /* Fast path: direct host accesses for elements on the first page. */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Finish with any active elements wholly on the second page. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5788
5789 static inline QEMU_ALWAYS_INLINE
sve_ldN_r_mte(CPUARMState * env,uint64_t * vg,target_ulong addr,uint32_t desc,const uintptr_t ra,const int esz,const int msz,const int N,sve_ldst1_host_fn * host_fn,sve_ldst1_tlb_fn * tlb_fn)5790 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5791 uint32_t desc, const uintptr_t ra,
5792 const int esz, const int msz, const int N,
5793 sve_ldst1_host_fn *host_fn,
5794 sve_ldst1_tlb_fn *tlb_fn)
5795 {
5796 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5797 int bit55 = extract64(addr, 55, 1);
5798
5799 /* Remove mtedesc from the normal sve descriptor. */
5800 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5801
5802 /* Perform gross MTE suppression early. */
5803 if (!tbi_check(mtedesc, bit55) ||
5804 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
5805 mtedesc = 0;
5806 }
5807
5808 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5809 }
5810
/*
 * Expand the contiguous single-register LD1 helpers for a one-byte
 * memory element (no endian variants needed): one normal and one
 * MTE-checked entry point per NAME, both with N=1.
 */
#define DO_LD1_1(NAME, ESZ) \
void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
                            target_ulong addr, uint32_t desc)           \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
              sve_##NAME##_host, sve_##NAME##_tlb);                     \
}                                                                       \
void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
                                target_ulong addr, uint32_t desc)       \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
                  sve_##NAME##_host, sve_##NAME##_tlb);                 \
}

/*
 * Expand the contiguous single-register LD1 helpers for multi-byte
 * memory elements: little- and big-endian variants, each with and
 * without MTE checking.
 */
#define DO_LD1_2(NAME, ESZ, MSZ) \
void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
                               target_ulong addr, uint32_t desc)        \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
              sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
}                                                                       \
void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
                               target_ulong addr, uint32_t desc)        \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
              sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
}                                                                       \
void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
}                                                                       \
void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
}

/*
 * Instantiate LD1 helpers.  Names encode <mem elt><reg elt><extend>:
 * e.g. ld1bhu = load byte, zero-extend into halfword elements.
 */
DO_LD1_1(ld1bb, MO_8)
DO_LD1_1(ld1bhu, MO_16)
DO_LD1_1(ld1bhs, MO_16)
DO_LD1_1(ld1bsu, MO_32)
DO_LD1_1(ld1bss, MO_32)
DO_LD1_1(ld1bdu, MO_64)
DO_LD1_1(ld1bds, MO_64)

DO_LD1_2(ld1hh, MO_16, MO_16)
DO_LD1_2(ld1hsu, MO_32, MO_16)
DO_LD1_2(ld1hss, MO_32, MO_16)
DO_LD1_2(ld1hdu, MO_64, MO_16)
DO_LD1_2(ld1hds, MO_64, MO_16)

DO_LD1_2(ld1ss, MO_32, MO_32)
DO_LD1_2(ld1sdu, MO_64, MO_32)
DO_LD1_2(ld1sds, MO_64, MO_32)

DO_LD1_2(ld1dd, MO_64, MO_64)

#undef DO_LD1_1
#undef DO_LD1_2
5873
/*
 * Expand the contiguous LD[234] multi-register helpers for byte
 * elements: normal and MTE-checked entry points, N interleaved
 * registers per structure.
 */
#define DO_LDN_1(N) \
void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
                             target_ulong addr, uint32_t desc)          \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
              sve_ld1bb_host, sve_ld1bb_tlb);                           \
}                                                                       \
void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
                  sve_ld1bb_host, sve_ld1bb_tlb);                       \
}

/*
 * Expand the contiguous LD[234] multi-register helpers for multi-byte
 * elements: little/big endian, each with and without MTE checking.
 */
#define DO_LDN_2(N, SUFF, ESZ) \
void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
              sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
              sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
                  sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
                  sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
}

/* Instantiate LD2/LD3/LD4 helpers for each element size. */
DO_LDN_1(2)
DO_LDN_1(3)
DO_LDN_1(4)

DO_LDN_2(2, hh, MO_16)
DO_LDN_2(3, hh, MO_16)
DO_LDN_2(4, hh, MO_16)

DO_LDN_2(2, ss, MO_32)
DO_LDN_2(3, ss, MO_32)
DO_LDN_2(4, ss, MO_32)

DO_LDN_2(2, dd, MO_64)
DO_LDN_2(3, dd, MO_64)
DO_LDN_2(4, dd, MO_64)

#undef DO_LDN_1
#undef DO_LDN_2
5932
5933 /*
5934 * Load contiguous data, first-fault and no-fault.
5935 *
5936 * For user-only, one could argue that we should hold the mmap_lock during
5937 * the operation so that there is no race between page_check_range and the
5938 * load operation. However, unmapping pages out from under a running thread
5939 * is extraordinarily unlikely. This theoretical race condition also affects
5940 * linux-user/ in its get_user/put_user macros.
5941 *
5942 * TODO: Construct some helpers, written in assembly, that interact with
5943 * host_signal_handler to produce memory ops which can properly report errors
5944 * without racing.
5945 */
5946
5947 /* Fault on byte I. All bits in FFR from I are cleared. The vector
5948 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5949 * option, which leaves subsequent data unchanged.
5950 */
5951 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5952 {
5953 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5954
5955 if (i & 63) {
5956 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5957 i = ROUND_UP(i, 64);
5958 }
5959 for (; i < oprsz; i += 64) {
5960 ffr[i / 64] = 0;
5961 }
5962 }
5963
5964 /*
5965 * Common helper for all contiguous no-fault and first-fault loads.
5966 */
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
                   uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
                   const int esz, const int msz, const SVEContFault fault,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    /* Destination register and vector length in bytes. */
    const unsigned rd = simd_data(desc);
    void *vd = &env->vfp.zregs[rd];
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, mem_off, reg_last;
    SVEContLdSt info;
    int flags;
    void *host;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
        /* The entire predicate was false; no load occurs. */
        memset(vd, 0, reg_max);
        return;
    }
    reg_off = info.reg_off_first[0];

    /* Probe the page(s). */
    if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
        /* Fault on first element. */
        tcg_debug_assert(fault == FAULT_NO);
        memset(vd, 0, reg_max);
        goto do_fault;
    }

    mem_off = info.mem_off_first[0];
    flags = info.page[0].flags;

    /*
     * Disable MTE checking if the Tagged bit is not set.  Since TBI must
     * be set within MTEDESC for MTE, !mtedesc => !mte_active.
     */
    if (!info.page[0].tagged) {
        mtedesc = 0;
    }

    if (fault == FAULT_FIRST) {
        /* Trapping mte check for the first-fault element. */
        if (mtedesc) {
            mte_check(env, mtedesc, addr + mem_off, retaddr);
        }

        /*
         * Special handling of the first active element,
         * if it crosses a page boundary or is MMIO.
         */
        bool is_split = mem_off == info.mem_off_split;
        if (unlikely(flags != 0) || unlikely(is_split)) {
            /*
             * Use the slow path for cross-page handling.
             * Might trap for MMIO or watchpoints.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);

            /* After any fault, zero the other elements. */
            swap_memzero(vd, reg_off);
            reg_off += 1 << esz;
            mem_off += 1 << msz;
            swap_memzero(vd + reg_off, reg_max - reg_off);

            if (is_split) {
                goto second_page;
            }
        } else {
            memset(vd, 0, reg_max);
        }
    } else {
        memset(vd, 0, reg_max);
        if (unlikely(mem_off == info.mem_off_split)) {
            /* The first active element crosses a page boundary. */
            flags |= info.page[1].flags;
            if (unlikely(flags & TLB_MMIO)) {
                /* Some page is MMIO, see below. */
                goto do_fault;
            }
            if (unlikely(flags & TLB_WATCHPOINT) &&
                (cpu_watchpoint_address_matches
                 (env_cpu(env), addr + mem_off, 1 << msz)
                 & BP_MEM_READ)) {
                /* Watchpoint hit, see below. */
                goto do_fault;
            }
            if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                goto do_fault;
            }
            /*
             * Use the slow path for cross-page handling.
             * This is RAM, without a watchpoint, and will not trap.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
            goto second_page;
        }
    }

    /*
     * From this point on, all memory operations are MemSingleNF.
     *
     * Per the MemSingleNF pseudocode, a no-fault load from Device memory
     * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
     *
     * Unfortunately we do not have access to the memory attributes from the
     * PTE to tell Device memory from Normal memory.  So we make a mostly
     * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
     * This gives the right answer for the common cases of "Normal memory,
     * backed by host RAM" and "Device memory, backed by MMIO".
     * The architecture allows us to suppress an NF load and return
     * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
     * case of "Normal memory, backed by MMIO" is permitted.  The case we
     * get wrong is "Device memory, backed by host RAM", for which we
     * should return (UNKNOWN, FAULT) for but do not.
     *
     * Similarly, CPU_BP breakpoints would raise exceptions, and so
     * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
     * architectural breakpoints the same.
     */
    if (unlikely(flags & TLB_MMIO)) {
        goto do_fault;
    }

    /* Non-faulting walk of the remaining elements on the first page. */
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    do {
        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                if (unlikely(flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr + mem_off, 1 << msz)
                     & BP_MEM_READ)) {
                    goto do_fault;
                }
                if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                    goto do_fault;
                }
                host_fn(vd, reg_off, host + mem_off);
            }
            reg_off += 1 << esz;
            mem_off += 1 << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    } while (reg_off <= reg_last);

    /*
     * MemSingleNF is allowed to fail for any reason.  We have special
     * code above to handle the first element crossing a page boundary.
     * As an implementation choice, decline to handle a cross-page element
     * in any other position.
     */
    reg_off = info.reg_off_split;
    if (reg_off >= 0) {
        goto do_fault;
    }

 second_page:
    reg_off = info.reg_off_first[1];
    if (likely(reg_off < 0)) {
        /* No active elements on the second page.  All done. */
        return;
    }

    /*
     * MemSingleNF is allowed to fail for any reason.  As an implementation
     * choice, decline to handle elements on the second page.  This should
     * be low frequency as the guest walks through memory -- the next
     * iteration of the guest's loop should be aligned on the page boundary,
     * and then all following iterations will stay aligned.
     */

 do_fault:
    record_fault(env, reg_off, reg_max);
}
6144
6145 static inline QEMU_ALWAYS_INLINE
sve_ldnfff1_r_mte(CPUARMState * env,void * vg,target_ulong addr,uint32_t desc,const uintptr_t retaddr,const int esz,const int msz,const SVEContFault fault,sve_ldst1_host_fn * host_fn,sve_ldst1_tlb_fn * tlb_fn)6146 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6147 uint32_t desc, const uintptr_t retaddr,
6148 const int esz, const int msz, const SVEContFault fault,
6149 sve_ldst1_host_fn *host_fn,
6150 sve_ldst1_tlb_fn *tlb_fn)
6151 {
6152 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6153 int bit55 = extract64(addr, 55, 1);
6154
6155 /* Remove mtedesc from the normal sve descriptor. */
6156 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6157
6158 /* Perform gross MTE suppression early. */
6159 if (!tbi_check(mtedesc, bit55) ||
6160 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6161 mtedesc = 0;
6162 }
6163
6164 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6165 esz, msz, fault, host_fn, tlb_fn);
6166 }
6167
/*
 * Expand first-fault (LDFF1) and no-fault (LDNF1) helpers for a
 * one-byte memory element: normal and MTE-checked entry points.
 */
#define DO_LDFF1_LDNF1_1(PART, ESZ) \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
}                                                                       \
void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
}

/*
 * Expand first-fault and no-fault helpers for multi-byte memory
 * elements: little/big endian, each with and without MTE checking.
 */
#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}                                                                       \
void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}

/* Instantiate LDFF1/LDNF1 helpers for each element/extension combination. */
DO_LDFF1_LDNF1_1(bb, MO_8)
DO_LDFF1_LDNF1_1(bhu, MO_16)
DO_LDFF1_LDNF1_1(bhs, MO_16)
DO_LDFF1_LDNF1_1(bsu, MO_32)
DO_LDFF1_LDNF1_1(bss, MO_32)
DO_LDFF1_LDNF1_1(bdu, MO_64)
DO_LDFF1_LDNF1_1(bds, MO_64)

DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)

#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
6266
6267 /*
6268 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6269 */
6270
static inline QEMU_ALWAYS_INLINE
void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    /* Source register number and vector length in bytes. */
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int i, flags;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no store occurs. */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_WRITE, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        /* Find the last active element anywhere in the operation. */
        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    /* Fast path: direct host accesses for elements on the first page. */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Finish with any active elements wholly on the second page. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
6398
6399 static inline QEMU_ALWAYS_INLINE
sve_stN_r_mte(CPUARMState * env,uint64_t * vg,target_ulong addr,uint32_t desc,const uintptr_t ra,const int esz,const int msz,const int N,sve_ldst1_host_fn * host_fn,sve_ldst1_tlb_fn * tlb_fn)6400 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6401 uint32_t desc, const uintptr_t ra,
6402 const int esz, const int msz, const int N,
6403 sve_ldst1_host_fn *host_fn,
6404 sve_ldst1_tlb_fn *tlb_fn)
6405 {
6406 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6407 int bit55 = extract64(addr, 55, 1);
6408
6409 /* Remove mtedesc from the normal sve descriptor. */
6410 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6411
6412 /* Perform gross MTE suppression early. */
6413 if (!tbi_check(mtedesc, bit55) ||
6414 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6415 mtedesc = 0;
6416 }
6417
6418 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6419 }
6420
6421 #define DO_STN_1(N, NAME, ESZ) \
6422 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6423 target_ulong addr, uint32_t desc) \
6424 { \
6425 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6426 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6427 } \
6428 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6429 target_ulong addr, uint32_t desc) \
6430 { \
6431 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6432 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6433 }
6434
/*
 * Expand contiguous-store helpers for N interleaved registers of
 * ESZ-sized elements stored as MSZ-sized memory operations.
 * Produces little-endian and big-endian variants, each with a plain
 * and an MTE-checked (_mte) form.
 */
#define DO_STN_2(N, NAME, ESZ, MSZ) \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,     \
                                    target_ulong addr, uint32_t desc) \
{                                                                   \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,         \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
}                                                                   \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,     \
                                    target_ulong addr, uint32_t desc) \
{                                                                   \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,         \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
}                                                                   \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{                                                                   \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,        \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
}                                                                   \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{                                                                   \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,        \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
}
6460
/*
 * Instantiate ST1..ST4 contiguous-store helpers.  The first letter of
 * NAME is the memory size, the second the element size (e.g. "bs" is
 * byte-in-memory, 32-bit element).  DO_STN_1 covers byte memory ops
 * (no endianness); DO_STN_2 covers wider ops in both endiannesses.
 */
DO_STN_1(1, bb, MO_8)
DO_STN_1(1, bh, MO_16)
DO_STN_1(1, bs, MO_32)
DO_STN_1(1, bd, MO_64)
DO_STN_1(2, bb, MO_8)
DO_STN_1(3, bb, MO_8)
DO_STN_1(4, bb, MO_8)

DO_STN_2(1, hh, MO_16, MO_16)
DO_STN_2(1, hs, MO_32, MO_16)
DO_STN_2(1, hd, MO_64, MO_16)
DO_STN_2(2, hh, MO_16, MO_16)
DO_STN_2(3, hh, MO_16, MO_16)
DO_STN_2(4, hh, MO_16, MO_16)

DO_STN_2(1, ss, MO_32, MO_32)
DO_STN_2(1, sd, MO_64, MO_32)
DO_STN_2(2, ss, MO_32, MO_32)
DO_STN_2(3, ss, MO_32, MO_32)
DO_STN_2(4, ss, MO_32, MO_32)

DO_STN_2(1, dd, MO_64, MO_64)
DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)

#undef DO_STN_1
#undef DO_STN_2
6489
6490 /*
6491 * Loads with a vector index.
6492 */
6493
/*
 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
 * These are the per-element offset extractors used by gather/scatter:
 * zsu = 32-bit unsigned, zss = 32-bit signed, zd = 64-bit.
 */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);

/* 32-bit unsigned offset element; H1_4 adjusts for host byte order. */
static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
{
    return *(uint32_t *)(reg + H1_4(reg_ofs));
}
6503
off_zss_s(void * reg,intptr_t reg_ofs)6504 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6505 {
6506 return *(int32_t *)(reg + H1_4(reg_ofs));
6507 }
6508
off_zsu_d(void * reg,intptr_t reg_ofs)6509 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6510 {
6511 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6512 }
6513
off_zss_d(void * reg,intptr_t reg_ofs)6514 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6515 {
6516 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6517 }
6518
off_zd_d(void * reg,intptr_t reg_ofs)6519 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6520 {
6521 return *(uint64_t *)(reg + reg_ofs);
6522 }
6523
/*
 * Common helper for all gather loads (scalar base plus vector of offsets).
 * Each active element is probed and loaded individually into a scratch
 * register so that all faults (MMU, watchpoint, MTE) are raised before
 * any writeback; the destination is updated only after every element
 * has succeeded.
 *
 * @mtedesc: MTE check descriptor, 0 to disable tag checking.
 * @esize/@msize: element and memory-access sizes in bytes.
 * @off_fn: extracts the per-element offset from @vm.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    ARMVectorReg scratch;
    intptr_t reg_off;
    SVEHostPage info, info2;

    /* Inactive elements produce zero in the result. */
    memset(&scratch, 0, reg_max);
    reg_off = 0;
    do {
        /* One predicate bit per byte; reg_off is a byte offset. */
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely(pg & 1)) {
                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
                /* Bytes remaining before the end of this target page. */
                target_ulong in_page = -(addr | TARGET_PAGE_MASK);

                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);

                if (likely(in_page >= msize)) {
                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr, msize,
                                             info.attrs, BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && info.tagged) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    if (unlikely(info.flags & TLB_MMIO)) {
                        tlb_fn(env, &scratch, reg_off, addr, retaddr);
                    } else {
                        host_fn(&scratch, reg_off, info.host);
                    }
                } else {
                    /* Element crosses the page boundary. */
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_LOAD, mmu_idx, retaddr);
                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr,
                                             msize, info.attrs,
                                             BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && info.tagged) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    /* Cross-page elements always take the slow path. */
                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
                }
            }
            reg_off += esize;
            pg >>= esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /* Wait until all exceptions have been raised to write back.  */
    memcpy(vd, &scratch, reg_max);
}
6587
/*
 * MTE entry point for gather loads: split the combined descriptor into
 * its MTE and SVE halves, then defer to the common helper above.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
6608
/*
 * Expand gather-load helpers for 32-bit (_S) and 64-bit (_D) elements.
 * MEM names the memory access (size, sign, endianness), OFS the offset
 * extraction function.  Note that esize/msize are passed as byte counts
 * here (4 or 8, and 1 << MSZ).
 */
#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
}                                                                            \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc)                             \
{                                                                            \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
}

#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
}                                                                            \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc)                             \
{                                                                            \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
}
6636
/*
 * Instantiate all gather-load combinations of memory type (unsigned or
 * signed byte/half/word/double, with endianness where applicable) and
 * offset extraction (zsu/zss/zd).
 */
DO_LD1_ZPZ_S(bsu, zsu, MO_8)
DO_LD1_ZPZ_S(bsu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zsu, MO_8)
DO_LD1_ZPZ_D(bdu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zd, MO_8)

DO_LD1_ZPZ_S(bss, zsu, MO_8)
DO_LD1_ZPZ_S(bss, zss, MO_8)
DO_LD1_ZPZ_D(bds, zsu, MO_8)
DO_LD1_ZPZ_D(bds, zss, MO_8)
DO_LD1_ZPZ_D(bds, zd, MO_8)

DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zd, MO_16)

DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zd, MO_16)

DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
DO_LD1_ZPZ_S(hss_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
DO_LD1_ZPZ_D(hds_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zd, MO_16)

DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
DO_LD1_ZPZ_S(hss_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
DO_LD1_ZPZ_D(hds_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zd, MO_16)

DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
DO_LD1_ZPZ_S(ss_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zd, MO_32)

DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
DO_LD1_ZPZ_S(ss_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zd, MO_32)

DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
DO_LD1_ZPZ_D(sds_le, zss, MO_32)
DO_LD1_ZPZ_D(sds_le, zd, MO_32)

DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
DO_LD1_ZPZ_D(sds_be, zss, MO_32)
DO_LD1_ZPZ_D(sds_be, zd, MO_32)

DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
DO_LD1_ZPZ_D(dd_le, zss, MO_64)
DO_LD1_ZPZ_D(dd_le, zd, MO_64)

DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
6703
6704 /* First fault loads with a vector index. */
6705
6706 /*
6707 * Common helpers for all gather first-faulting loads.
6708 */
6709
/*
 * Common helper for all gather first-faulting loads.
 *
 * The first active element is loaded with faults permitted.  All
 * subsequent active elements are probed without faulting: on any
 * would-be fault (page miss, MMIO, watchpoint hit, MTE tag mismatch,
 * page-crossing element) we stop and call record_fault(), which
 * presumably updates the first-fault register state for the elements
 * from reg_off onward -- see its definition elsewhere in this file.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                 target_ulong base, uint32_t desc, uintptr_t retaddr,
                 uint32_t mtedesc, const int esz, const int msz,
                 zreg_off_fn *off_fn,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    const int esize = 1 << esz;
    const int msize = 1 << msz;
    intptr_t reg_off;
    SVEHostPage info;
    target_ulong addr, in_page;
    ARMVectorReg scratch;

    /* Skip to the first true predicate.  */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off >= reg_max)) {
        /* The entire predicate was false; no load occurs.  */
        memset(vd, 0, reg_max);
        return;
    }

    /* Protect against overlap between vd and vm.  */
    if (unlikely(vd == vm)) {
        vm = memcpy(&scratch, vm, reg_max);
    }

    /*
     * Probe the first element, allowing faults.
     */
    addr = base + (off_fn(vm, reg_off) << scale);
    if (mtedesc) {
        mte_check(env, mtedesc, addr, retaddr);
    }
    tlb_fn(env, vd, reg_off, addr, retaddr);

    /* After any fault, zero the other elements. */
    swap_memzero(vd, reg_off);
    reg_off += esize;
    swap_memzero(vd + reg_off, reg_max - reg_off);

    /*
     * Probe the remaining elements, not allowing faults.
     */
    while (reg_off < reg_max) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely((pg >> (reg_off & 63)) & 1)) {
                addr = base + (off_fn(vm, reg_off) << scale);
                in_page = -(addr | TARGET_PAGE_MASK);

                if (unlikely(in_page < msize)) {
                    /* Stop if the element crosses a page boundary. */
                    goto fault;
                }

                /* Nonfault probe: must not raise an exception. */
                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);
                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
                    goto fault;
                }
                if (unlikely(info.flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
                    goto fault;
                }
                if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
                    goto fault;
                }

                host_fn(vd, reg_off, info.host);
            }
            reg_off += esize;
        } while (reg_off & 63);
    }
    return;

 fault:
    record_fault(env, reg_off, reg_max);
}
6794
/*
 * MTE entry point for gather first-fault loads: split the combined
 * descriptor into its MTE and SVE halves, then defer to sve_ldff1_z().
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                     target_ulong base, uint32_t desc, uintptr_t retaddr,
                     const int esz, const int msz,
                     zreg_off_fn *off_fn,
                     sve_ldst1_host_fn *host_fn,
                     sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
                esz, msz, off_fn, host_fn, tlb_fn);
}
6816
/*
 * Expand first-fault gather-load helpers for 32-bit (_S) and 64-bit (_D)
 * elements.  Unlike DO_LD1_ZPZ_*, the element and memory sizes are
 * passed as MemOp values (MO_32/MO_64 and MSZ), not byte counts.
 */
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
6848
/*
 * Instantiate all first-fault gather-load combinations; same naming
 * scheme as the DO_LD1_ZPZ_* instantiations above.
 */
DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6912
6913 /* Stores with a vector index. */
6914
/*
 * Common helper for all scatter stores (scalar base plus vector of
 * offsets).  Two passes: first probe every active element so that all
 * exceptions (MMU, watchpoint, MTE) are raised before any memory is
 * modified; then perform the stores, fast-pathing elements whose host
 * address was recorded during the probe pass.
 */
static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    /* One host pointer per element; sized for the maximum vector. */
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Probe all of the elements for host addresses and flags.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    if (!(info.flags & TLB_MMIO)) {
                        host[i] = info.host;
                    }
                } else {
                    /*
                     * Element crosses the page boundary.
                     * Probe both pages, but do not record the host address,
                     * so that we use the slow path.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && info.tagged) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Now that we have recognized all exceptions except SyncExternal
     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
     *
     * Note for the common case of an element in RAM, not crossing a page
     * boundary, we have stored the host address in host[].  This doubles
     * as a first-level check against the predicate, since only enabled
     * elements have non-null host addresses.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            /* Active element without a fast-path host address: slow path. */
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
6997
/*
 * MTE entry point for scatter stores: split the combined descriptor
 * into its MTE and SVE halves, then defer to sve_st1_z().
 */
static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
7018
/*
 * Expand scatter-store helpers for 32-bit (_S) and 64-bit (_D) elements.
 * Like DO_LD1_ZPZ_*, esize/msize are byte counts (4 or 8, and 1 << MSZ).
 */
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);       \
}                                                                            \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc)                             \
{                                                                            \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);   \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);       \
}                                                                            \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc)                             \
{                                                                            \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);   \
}
7046
/*
 * Instantiate all scatter-store combinations of memory type and
 * offset extraction (zsu/zss/zd).
 */
DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
7085
7086 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7087 {
7088 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7089 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7090
7091 for (i = 0; i < opr_sz; ++i) {
7092 d[i] = n[i] ^ m[i] ^ k[i];
7093 }
7094 }
7095
HELPER(sve2_bcax)7096 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7097 {
7098 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7099 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7100
7101 for (i = 0; i < opr_sz; ++i) {
7102 d[i] = n[i] ^ (m[i] & ~k[i]);
7103 }
7104 }
7105
HELPER(sve2_bsl1n)7106 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7107 {
7108 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7109 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7110
7111 for (i = 0; i < opr_sz; ++i) {
7112 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7113 }
7114 }
7115
HELPER(sve2_bsl2n)7116 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7117 {
7118 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7119 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7120
7121 for (i = 0; i < opr_sz; ++i) {
7122 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7123 }
7124 }
7125
HELPER(sve2_nbsl)7126 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7127 {
7128 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7129 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7130
7131 for (i = 0; i < opr_sz; ++i) {
7132 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7133 }
7134 }
7135
7136 /*
7137 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7138 * See hasless(v,1) from
7139 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7140 */
/*
 * Returns nonzero iff any lane of m0 or m1 equals the low esz-sized
 * element of n (lane width is 8 << esz bits).  Uses the classic
 * "haszero" trick: for an XOR-difference x, (x - 1) & ~x has the lane
 * sign bit set iff x == 0 in that lane.
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);    /* sign bit of each lane */
    uint64_t cmp0, cmp1;

    /* Replicate n into every lane; XOR makes matching lanes zero. */
    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
7155
/*
 * Common implementation of SVE2 MATCH/NMATCH.  For each active element
 * of Zn, set the corresponding Pd bit when the element occurs (MATCH)
 * or does not occur (NMATCH, @nmatch true) anywhere in the same 16-byte
 * segment of Zm.  Returns NZCV flags per the PredTest pseudofunction.
 */
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    /* Mask of valid predicate bits for this element size. */
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    /* Process one 16-byte segment per outer iteration. */
    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            /* One predicate bit per byte; step by the element size. */
            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}
7185
/* Expand MATCH/NMATCH helpers; INV selects the inverted (NMATCH) form. */
#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                             \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH
7199
/*
 * SVE2 HISTCNT, 32-bit elements: for each active element Zn[i], count
 * the active elements Zm[j] with j <= i that equal it; inactive
 * elements produce zero.  O(n^2) by construction, matching the
 * pseudocode element-by-element definition.
 */
void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    /* Copy sources that alias the destination, since d is written early. */
    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        /* Predicate bit for the element at byte offset i. */
        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}
7236
/*
 * SVE2 HISTCNT, 64-bit elements: same algorithm as sve2_histcnt_s,
 * indexing whole uint64_t elements (one predicate byte per element).
 */
void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    /* Copy sources that alias the destination, since d is written early. */
    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}
7268
7269 /*
7270 * Returns the number of bytes in m0 and m1 that match n.
7271 * Unlike do_match2 we don't just need true/false, we need an exact count.
7272 * This requires two extra logical operations.
7273 */
/*
 * Returns the number of bytes in @m0 and @m1 (16 bytes total) that
 * equal @n.  Unlike do_match2 we don't just need true/false, we need
 * an exact count; this requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    /* Replicate n into every byte; XOR makes matching bytes zero. */
    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
7305
/*
 * SVE2 HISTSEG: within each 16-byte segment, each result byte holds the
 * number of bytes of Zm (in the same segment) equal to the corresponding
 * byte of Zn.  Counts are at most 16, so the per-byte sums cannot carry
 * into the neighbouring byte when accumulated with |= below.
 */
void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        /* One byte of n0/n1 per iteration, counted against all of m0|m1. */
        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}
7330
/*
 * SVE2 XAR, byte elements: EOR then rotate each byte right by shr
 * (1..8).  The per-lane rotate is built from two 64-bit shifts; mask
 * selects, in each byte, the bits that legitimately came from the
 * right shift, discarding bits shifted in from the neighbouring lane.
 */
void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}
7344
/*
 * SVE2 XAR, halfword elements: EOR then rotate each 16-bit lane right
 * by shr (1..16), using the same masked 64-bit shift technique as
 * sve2_xar_b above.
 */
void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}
7358
HELPER(sve2_xar_s)7359 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7360 {
7361 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7362 int shr = simd_data(desc);
7363 uint32_t *d = vd, *n = vn, *m = vm;
7364
7365 for (i = 0; i < opr_sz; ++i) {
7366 d[i] = ror32(n[i] ^ m[i], shr);
7367 }
7368 }
7369
/*
 * FMMLA, single precision: for each 4-element segment, treat n, m and a
 * as 2x2 row-major matrices and compute d = a + n * m^T.  The softfloat
 * call sequence fixes the rounding/accumulation order: each product
 * pair is summed first, then added to the accumulator.
 */
void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}
7407
HELPER(fmmla_d)7408 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7409 void *status, uint32_t desc)
7410 {
7411 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7412
7413 for (s = 0; s < opr_sz; ++s) {
7414 float64 *n = vn + s * sizeof(float64) * 4;
7415 float64 *m = vm + s * sizeof(float64) * 4;
7416 float64 *a = va + s * sizeof(float64) * 4;
7417 float64 *d = vd + s * sizeof(float64) * 4;
7418 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7419 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7420 float64 p0, p1;
7421
7422 /* i = 0, j = 0 */
7423 p0 = float64_mul(n00, m00, status);
7424 p1 = float64_mul(n01, m01, status);
7425 d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7426
7427 /* i = 0, j = 1 */
7428 p0 = float64_mul(n00, m10, status);
7429 p1 = float64_mul(n01, m11, status);
7430 d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7431
7432 /* i = 1, j = 0 */
7433 p0 = float64_mul(n10, m00, status);
7434 p1 = float64_mul(n11, m01, status);
7435 d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7436
7437 /* i = 1, j = 1 */
7438 p0 = float64_mul(n10, m10, status);
7439 p1 = float64_mul(n11, m11, status);
7440 d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7441 }
7442 }
7443
/*
 * FCVTNT (float convert narrow, top): for each active wide element,
 * convert it to the narrow type via OP and store the result into the
 * high ("top") narrow half of the corresponding wide element of vd.
 * Inactive lanes and the low narrow halves of vd are not written, so
 * they keep their previous contents.
 *
 * The vector is walked backwards in 64-bit chunks so that one predicate
 * word, fetched once per chunk, covers every element handled by the
 * inner loop.  'i' is a byte offset throughout; SVE predicates carry
 * one bit per vector byte, and the bit governing a wide element is the
 * one at the element's lowest-addressed byte, i.e. bit (i & 63) of pg.
 */
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPEW); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPEW nn = *(TYPEW *)(vn + HW(i)); \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}

/* Narrowing conversions: f32->bf16, f32->f16, f64->f32. */
DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7464
/*
 * FCVTLT (float convert long, top): for each active wide element, read
 * the narrow value held in the high ("top") narrow half of the source
 * wide element and widen it via OP into the full wide destination
 * element.  This is the inverse data movement of FCVTNT above: the
 * source comes from the odd narrow lane and the whole wide lane of vd
 * is written (inactive lanes are left untouched).
 *
 * Same backward, one-predicate-word-per-64-bit-chunk walk as FCVTNT:
 * 'i' is a byte offset, and bit (i & 63) of pg is the predicate bit for
 * the wide element at byte offset i.
 */
#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPEW); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}

/* Widening conversions: f16->f32 and f32->f64. */
DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT
7487