1 /*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/page-protection.h"
24 #include "exec/helper-proto.h"
25 #include "exec/target_page.h"
26 #include "exec/tlb-flags.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "fpu/softfloat.h"
29 #include "tcg/tcg.h"
30 #include "vec_internal.h"
31 #include "sve_ldst_internal.h"
32 #include "accel/tcg/cpu-ldst.h"
33 #include "accel/tcg/helper-retaddr.h"
34 #include "accel/tcg/cpu-ops.h"
35 #include "accel/tcg/probe.h"
36 #ifdef CONFIG_USER_ONLY
37 #include "user/page-protection.h"
38 #endif
39
40
41 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
42 *
43 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
44 * and bit 0 set if C is set. Compare the definitions of these variables
45 * within CPUARMState.
46 */
47
48 /* For no G bits set, NZCV = C. */
49 #define PREDTEST_INIT 1
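/*
 * Worked example (illustrative, not part of the original source):
 * iter_predtest_fwd(0x01, 0x11, PREDTEST_INIT) returns 0x80000007.
 * Bit 31 (N) is set because the first active element of D is set,
 * bit 1 is set because some active element is set (so Z is clear),
 * bit 0 (C) is set because the last active element is clear, and
 * bit 2 is the internal "first G bit seen" marker described below.
 */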
50
51 /* This is an iterative function, called for each Pd and Pg word
52 * moving forward.
53 */
54 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
55 {
56 if (likely(g)) {
57 /* Compute N from first D & G.
58 Use bit 2 to signal first G bit seen. */
59 if (!(flags & 4)) {
60 flags |= ((d & (g & -g)) != 0) << 31;
61 flags |= 4;
62 }
63
64 /* Accumulate Z from each D & G. */
65 flags |= ((d & g) != 0) << 1;
66
67 /* Compute C from last !(D & G). Replace previous. */
68 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
69 }
70 return flags;
71 }
72
73 /* This is an iterative function, called for each Pd and Pg word
74 * moving backward.
75 */
76 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
77 {
78 if (likely(g)) {
79 /* Compute C from first (i.e. last) !(D & G).
80 Use bit 2 to signal first G bit seen. */
81 if (!(flags & 4)) {
82 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
83 flags |= (d & pow2floor(g)) == 0;
84 }
85
86 /* Accumulate Z from each D & G. */
87 flags |= ((d & g) != 0) << 1;
88
89 /* Compute N from last (i.e. first) D & G. Replace previous. */
90 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
91 }
92 return flags;
93 }
94
95 /* The same for a single word predicate. */
96 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
97 {
98 return iter_predtest_fwd(d, g, PREDTEST_INIT);
99 }
100
101 /* The same for a multi-word predicate. */
102 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
103 {
104 uint32_t flags = PREDTEST_INIT;
105 uint64_t *d = vd, *g = vg;
106 uintptr_t i = 0;
107
108 do {
109 flags = iter_predtest_fwd(d[i], g[i], flags);
110 } while (++i < words);
111
112 return flags;
113 }
114
115 /* Similarly for single word elements. */
116 static inline uint64_t expand_pred_s(uint8_t byte)
117 {
118 static const uint64_t word[] = {
119 [0x01] = 0x00000000ffffffffull,
120 [0x10] = 0xffffffff00000000ull,
121 [0x11] = 0xffffffffffffffffull,
122 };
123 return word[byte & 0x11];
124 }
125
126 static inline uint64_t expand_pred_d(uint8_t byte)
127 {
128 return -(uint64_t)(byte & 1);
129 }
130
131 #define LOGICAL_PPPP(NAME, FUNC) \
132 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
133 { \
134 uintptr_t opr_sz = simd_oprsz(desc); \
135 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
136 uintptr_t i; \
137 for (i = 0; i < opr_sz / 8; ++i) { \
138 d[i] = FUNC(n[i], m[i], g[i]); \
139 } \
140 }
141
142 #define DO_AND(N, M, G) (((N) & (M)) & (G))
143 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
144 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
145 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
146 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
147 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
148 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
149 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
150
151 LOGICAL_PPPP(sve_and_pppp, DO_AND)
152 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
153 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
154 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
155 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
156 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
157 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
158 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
159
160 #undef DO_AND
161 #undef DO_BIC
162 #undef DO_EOR
163 #undef DO_ORR
164 #undef DO_ORN
165 #undef DO_NOR
166 #undef DO_NAND
167 #undef DO_SEL
168 #undef LOGICAL_PPPP
169
170 /* Fully general three-operand expander, controlled by a predicate.
171 * This is complicated by the host-endian storage of the register file.
172 */
173 /* ??? I don't expect the compiler could ever vectorize this itself.
174 * With some tables we can convert bit masks to byte masks, and with
175 * extra care wrt byte/word ordering we could use gcc generic vectors
176 * and do 16 bytes at a time.
177 */
178 #define DO_ZPZZ(NAME, TYPE, H, OP) \
179 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
180 { \
181 intptr_t i, opr_sz = simd_oprsz(desc); \
182 for (i = 0; i < opr_sz; ) { \
183 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
184 do { \
185 if (pg & 1) { \
186 TYPE nn = *(TYPE *)(vn + H(i)); \
187 TYPE mm = *(TYPE *)(vm + H(i)); \
188 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
189 } \
190 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
191 } while (i & 15); \
192 } \
193 }
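/*
 * Illustrative note (not in the original source): SVE predicates hold one
 * bit per byte of the vector, so an element of size sizeof(TYPE) owns
 * sizeof(TYPE) predicate bits of which only the lowest is significant;
 * hence the (pg & 1) test followed by pg >>= sizeof(TYPE).  For example,
 * with 32-bit elements a 16-bit predicate chunk of 0x0011 enables the
 * first two elements of that 16-byte segment (bits 0 and 4).
 */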
194
195 /* Similarly, specialized for 64-bit operands. */
196 #define DO_ZPZZ_D(NAME, TYPE, OP) \
197 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
198 { \
199 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
200 TYPE *d = vd, *n = vn, *m = vm; \
201 uint8_t *pg = vg; \
202 for (i = 0; i < opr_sz; i += 1) { \
203 if (pg[H1(i)] & 1) { \
204 TYPE nn = n[i], mm = m[i]; \
205 d[i] = OP(nn, mm); \
206 } \
207 } \
208 }
209
210 #define DO_AND(N, M) (N & M)
211 #define DO_EOR(N, M) (N ^ M)
212 #define DO_ORR(N, M) (N | M)
213 #define DO_BIC(N, M) (N & ~M)
214 #define DO_ORC(N, M) (N | ~M)
215 #define DO_ADD(N, M) (N + M)
216 #define DO_SUB(N, M) (N - M)
217 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
218 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
219 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
220 #define DO_MUL(N, M) (N * M)
221
222
223 /*
224 * We must avoid the C undefined behaviour cases: division by
225 * zero and signed division of INT_MIN by -1. Both of these
226 * have architecturally defined required results for Arm.
227 * We special case all signed divisions by -1 to avoid having
228 * to deduce the minimum integer for the type involved.
229 */
230 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
231 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
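/*
 * Illustrative corner cases (not in the original source): the architecture
 * defines x / 0 == 0 and, for SDIV, INT_MIN / -1 == INT_MIN.  DO_SDIV(x, 0)
 * and DO_UDIV(x, 0) return 0, and DO_SDIV(INT32_MIN, -1) evaluates -N,
 * which wraps back to INT32_MIN under the two's-complement arithmetic QEMU
 * assumes, all without evaluating the C-undefined divisions.
 */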
232
233 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
234 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
235 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
236 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
237
238 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
239 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
240 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
241 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
242
243 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
244 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
245 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
246 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
247
248 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
249 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
250 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
251 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
252
253 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
254 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
255 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
256 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
257
258 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
259 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
260 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
261 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
262
263 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
264 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
265 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
266 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
267
268 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
269 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
270 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
271 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
272
273 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
274 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
275 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
276 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
277
278 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
279 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
280 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
281 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
282
283 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
284 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
285 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
286 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
287
288 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
289 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
290 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
291 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
292
293 /* Because the computation type is at least twice as large as required,
294 these work for both signed and unsigned source types. */
295 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
296 {
297 return (n * m) >> 8;
298 }
299
300 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
301 {
302 return (n * m) >> 16;
303 }
304
305 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
306 {
307 return (n * m) >> 32;
308 }
309
310 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
311 {
312 uint64_t lo, hi;
313 muls64(&lo, &hi, n, m);
314 return hi;
315 }
316
317 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
318 {
319 uint64_t lo, hi;
320 mulu64(&lo, &hi, n, m);
321 return hi;
322 }
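/*
 * Worked example (illustrative): for int8_t inputs -1 * -1 the expander
 * sign-extends both to -1, the product is 1 and do_mulh_b returns 0x00;
 * for uint8_t inputs 0xff * 0xff it zero-extends to 255, the product is
 * 0xfe01 and the returned high byte is 0xfe.  The wider intermediate type
 * holds the exact product either way, which is why the same helpers serve
 * both smulh and umulh for the narrow element sizes.
 */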
323
324 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
325 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
326 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
327 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
328
329 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
330 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
331 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
332 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
333
334 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
335 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
336 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
337 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
338
339 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
340 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
341
342 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
343 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
344
345 /* Note that all bits of the shift are significant
346 and not modulo the element size. */
347 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
348 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
349 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
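/*
 * Illustrative examples: DO_ASR((int8_t)-2, 200) clamps the shift count to
 * 7 and yields -1, while DO_LSR((uint8_t)0x80, 8) and DO_LSL((uint8_t)1, 8)
 * both yield 0, since a shift of the full element width or more clears an
 * unsigned value entirely.
 */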
350
351 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
352 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
353 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
354
355 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
356 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
357 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
358
359 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
360 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
361 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
362
363 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
364 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
365 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
366
367 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
368 {
369 int8_t n1 = n, n2 = n >> 8;
370 return m + n1 + n2;
371 }
372
373 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
374 {
375 int16_t n1 = n, n2 = n >> 16;
376 return m + n1 + n2;
377 }
378
379 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
380 {
381 int32_t n1 = n, n2 = n >> 32;
382 return m + n1 + n2;
383 }
384
385 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
386 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
387 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
388
389 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
390 {
391 uint8_t n1 = n, n2 = n >> 8;
392 return m + n1 + n2;
393 }
394
395 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
396 {
397 uint16_t n1 = n, n2 = n >> 16;
398 return m + n1 + n2;
399 }
400
401 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
402 {
403 uint32_t n1 = n, n2 = n >> 32;
404 return m + n1 + n2;
405 }
406
407 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
408 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
409 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
410
411 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
412 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
413 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
414 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
415
416 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
417 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
418 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
419 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
420
421 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
422 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
423 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
424 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
425
426 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
427 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
428 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
429 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
430
431 /*
432 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
433 * We pass in a pointer to a dummy saturation field to trigger
434 * the saturating arithmetic but discard the information about
435 * whether it has occurred.
436 */
437 #define do_sqshl_b(n, m) \
438 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
439 #define do_sqshl_h(n, m) \
440 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
441 #define do_sqshl_s(n, m) \
442 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
443 #define do_sqshl_d(n, m) \
444 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
445
446 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
447 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
448 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
449 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
450
451 #define do_uqshl_b(n, m) \
452 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
453 #define do_uqshl_h(n, m) \
454 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
455 #define do_uqshl_s(n, m) \
456 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
457 #define do_uqshl_d(n, m) \
458 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
459
460 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
461 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
462 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
463 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
464
465 #define do_sqrshl_b(n, m) \
466 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
467 #define do_sqrshl_h(n, m) \
468 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
469 #define do_sqrshl_s(n, m) \
470 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
471 #define do_sqrshl_d(n, m) \
472 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
473
474 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
475 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
476 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
477 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
478
479 #undef do_sqrshl_d
480
481 #define do_uqrshl_b(n, m) \
482 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
483 #define do_uqrshl_h(n, m) \
484 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
485 #define do_uqrshl_s(n, m) \
486 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
487 #define do_uqrshl_d(n, m) \
488 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
489
490 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
491 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
492 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
493 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
494
495 #undef do_uqrshl_d
496
497 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
498 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
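/*
 * Worked example (illustrative): the 64-bit form cannot widen, so it halves
 * each operand and re-adds the carry the two low bits would have produced:
 * for n = 5 and m = 3, (5 >> 1) + (3 >> 1) + (5 & 3 & 1) = 2 + 1 + 1 = 4,
 * which equals (5 + 3) >> 1.
 */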
499
500 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
501 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
502 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
503 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
504
505 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
506 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
507 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
508 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
509
510 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
511 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
512
513 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
514 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
515 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
516 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
517
518 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
519 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
520 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
521 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
522
523 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
524 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
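/*
 * Similarly (illustrative): (~n & m & 1) supplies the borrow lost by
 * halving each operand, e.g. n = 2, m = 1 gives 1 - 0 - 1 = 0, matching
 * (2 - 1) >> 1.
 */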
525
526 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
527 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
528 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
529 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
530
531 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
532 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
533 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
534 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
535
536 #define DO_SQADD_B(n, m) do_ssat_b((int64_t)n + m)
537 #define DO_SQADD_H(n, m) do_ssat_h((int64_t)n + m)
538 #define DO_SQADD_S(n, m) do_ssat_s((int64_t)n + m)
539
540 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
541 {
542 int64_t r = n + m;
543 if (((r ^ n) & ~(n ^ m)) < 0) {
544 /* Signed overflow. */
545 return r < 0 ? INT64_MAX : INT64_MIN;
546 }
547 return r;
548 }
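/*
 * Note (illustrative): ((r ^ n) & ~(n ^ m)) < 0 tests "the operands have
 * the same sign but the result's sign differs", i.e. signed overflow.
 * E.g. n = m = INT64_MAX: r wraps negative, n ^ m has a clear sign bit,
 * r ^ n has it set, so the expression is negative and INT64_MAX is
 * returned.
 */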
549
550 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
551 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
552 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
553 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
554
555 #define DO_UQADD_B(n, m) do_usat_b((int64_t)n + m)
556 #define DO_UQADD_H(n, m) do_usat_h((int64_t)n + m)
557 #define DO_UQADD_S(n, m) do_usat_s((int64_t)n + m)
558
559 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
560 {
561 uint64_t r = n + m;
562 return r < n ? UINT64_MAX : r;
563 }
564
565 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
566 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
567 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
568 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
569
570 #define DO_SQSUB_B(n, m) do_ssat_b((int64_t)n - m)
571 #define DO_SQSUB_H(n, m) do_ssat_h((int64_t)n - m)
572 #define DO_SQSUB_S(n, m) do_ssat_s((int64_t)n - m)
573
574 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
575 {
576 int64_t r = n - m;
577 if (((r ^ n) & (n ^ m)) < 0) {
578 /* Signed overflow. */
579 return r < 0 ? INT64_MAX : INT64_MIN;
580 }
581 return r;
582 }
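/*
 * Analogously (illustrative): for subtraction, overflow is only possible
 * when the operands' signs differ, hence the (n ^ m) term here is not
 * inverted.
 */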
583
584 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
585 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
586 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
587 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
588
589 #define DO_UQSUB_B(n, m) do_usat_b((int64_t)n - m)
590 #define DO_UQSUB_H(n, m) do_usat_h((int64_t)n - m)
591 #define DO_UQSUB_S(n, m) do_usat_s((int64_t)n - m)
592
593 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
594 {
595 return n > m ? n - m : 0;
596 }
597
598 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
599 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
600 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
601 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
602
603 #define DO_SUQADD_B(n, m) do_ssat_b((int64_t)(int8_t)n + m)
604 #define DO_SUQADD_H(n, m) do_ssat_h((int64_t)(int16_t)n + m)
605 #define DO_SUQADD_S(n, m) do_ssat_s((int64_t)(int32_t)n + m)
606
607 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
608 {
609 uint64_t r = n + m;
610
611 if (n < 0) {
612 /* Note that m - abs(n) cannot underflow. */
613 if (r > INT64_MAX) {
614 /* Result is either very large positive or negative. */
615 if (m > -n) {
616 /* m > abs(n), so r is a very large positive. */
617 return INT64_MAX;
618 }
619 /* Result is negative. */
620 }
621 } else {
622 /* Both inputs are positive: check for overflow. */
623 if (r < m || r > INT64_MAX) {
624 return INT64_MAX;
625 }
626 }
627 return r;
628 }
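/*
 * Examples (illustrative): n = -1, m = UINT64_MAX gives r > INT64_MAX and
 * m > -n, so the true sum is a huge positive value and saturates to
 * INT64_MAX; n = -1, m = 0 also gives r > INT64_MAX as an unsigned value,
 * but m <= -n means the true result is simply negative, so r is returned
 * and reinterpreted as -1.
 */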
629
630 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
631 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
632 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
633 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
634
635 #define DO_USQADD_B(n, m) do_usat_b((int64_t)n + (int8_t)m)
636 #define DO_USQADD_H(n, m) do_usat_h((int64_t)n + (int16_t)m)
637 #define DO_USQADD_S(n, m) do_usat_s((int64_t)n + (int32_t)m)
638
639 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
640 {
641 uint64_t r = n + m;
642
643 if (m < 0) {
644 return n < -m ? 0 : r;
645 }
646 return r < n ? UINT64_MAX : r;
647 }
648
649 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
650 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
651 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
652 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
653
654 #undef DO_ZPZZ
655 #undef DO_ZPZZ_D
656
657 /*
658 * Three operand expander, operating on element pairs.
659 * If the slot I is even, the elements come from VN {I, I+1}.
660 * If the slot I is odd, the elements come from VM {I-1, I}.
661 * Load all of the input elements in each pair before overwriting output.
662 */
663 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
664 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
665 { \
666 intptr_t i, opr_sz = simd_oprsz(desc); \
667 for (i = 0; i < opr_sz; ) { \
668 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
669 do { \
670 TYPE n0 = *(TYPE *)(vn + H(i)); \
671 TYPE m0 = *(TYPE *)(vm + H(i)); \
672 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
673 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
674 if (pg & 1) { \
675 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
676 } \
677 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
678 if (pg & 1) { \
679 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
680 } \
681 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
682 } while (i & 15); \
683 } \
684 }
685
686 /* Similarly, specialized for 64-bit operands. */
687 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
688 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
689 { \
690 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
691 TYPE *d = vd, *n = vn, *m = vm; \
692 uint8_t *pg = vg; \
693 for (i = 0; i < opr_sz; i += 2) { \
694 TYPE n0 = n[i], n1 = n[i + 1]; \
695 TYPE m0 = m[i], m1 = m[i + 1]; \
696 if (pg[H1(i)] & 1) { \
697 d[i] = OP(n0, n1); \
698 } \
699 if (pg[H1(i + 1)] & 1) { \
700 d[i + 1] = OP(m0, m1); \
701 } \
702 } \
703 }
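/*
 * Illustrative layout (not in the original source): for .d elements with
 * all predicate bits set, d[0] = OP(n[0], n[1]) and d[1] = OP(m[0], m[1]):
 * even result slots pair adjacent elements of N, odd result slots pair the
 * corresponding elements of M, and every input of a pair is read before
 * either result slot is written.
 */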
704
705 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
706 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
707 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
708 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
709
710 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
711 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
712 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
713 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
714
715 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
716 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
717 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
718 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
719
720 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
721 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
722 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
723 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
724
725 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
726 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
727 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
728 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
729
730 #undef DO_ZPZZ_PAIR
731 #undef DO_ZPZZ_PAIR_D
732
733 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
734 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
735 float_status *status, uint32_t desc) \
736 { \
737 intptr_t i, opr_sz = simd_oprsz(desc); \
738 for (i = 0; i < opr_sz; ) { \
739 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
740 do { \
741 TYPE n0 = *(TYPE *)(vn + H(i)); \
742 TYPE m0 = *(TYPE *)(vm + H(i)); \
743 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
744 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
745 if (pg & 1) { \
746 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
747 } \
748 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
749 if (pg & 1) { \
750 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
751 } \
752 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
753 } while (i & 15); \
754 } \
755 }
756
757 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
758 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
759 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
760
761 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
762 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
763 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
764
765 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
766 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
767 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
768
769 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
770 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
771 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
772
773 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
774 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
775 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
776
777 #undef DO_ZPZZ_PAIR_FP
778
779 /* Three-operand expander, controlled by a predicate, in which the
780 * third operand is "wide". That is, for D = N op M, the same 64-bit
781 * value of M is used with all of the narrower values of N.
782 */
783 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
784 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
785 { \
786 intptr_t i, opr_sz = simd_oprsz(desc); \
787 for (i = 0; i < opr_sz; ) { \
788 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
789 TYPEW mm = *(TYPEW *)(vm + i); \
790 do { \
791 if (pg & 1) { \
792 TYPE nn = *(TYPE *)(vn + H(i)); \
793 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
794 } \
795 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
796 } while (i & 7); \
797 } \
798 }
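/*
 * Illustrative example: for sve_lsr_zpzw_b, every byte within a 64-bit
 * lane is shifted by the same 64-bit count taken from that lane of ZM,
 * which is why mm is loaded once per lane and the inner loop runs while
 * (i & 7).
 */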
799
800 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
801 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
802 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
803
804 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
805 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
806 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
807
808 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
809 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
810 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
811
812 #undef DO_ZPZW
813
814 /* Fully general two-operand expander, controlled by a predicate.
815 */
816 #define DO_ZPZ(NAME, TYPE, H, OP) \
817 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
818 { \
819 intptr_t i, opr_sz = simd_oprsz(desc); \
820 for (i = 0; i < opr_sz; ) { \
821 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
822 do { \
823 if (pg & 1) { \
824 TYPE nn = *(TYPE *)(vn + H(i)); \
825 *(TYPE *)(vd + H(i)) = OP(nn); \
826 } \
827 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
828 } while (i & 15); \
829 } \
830 }
831
832 /* Similarly, specialized for 64-bit operands. */
833 #define DO_ZPZ_D(NAME, TYPE, OP) \
834 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
835 { \
836 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
837 TYPE *d = vd, *n = vn; \
838 uint8_t *pg = vg; \
839 for (i = 0; i < opr_sz; i += 1) { \
840 if (pg[H1(i)] & 1) { \
841 TYPE nn = n[i]; \
842 d[i] = OP(nn); \
843 } \
844 } \
845 }
846
847 #define DO_CLS_B(N) (clrsb32(N) - 24)
848 #define DO_CLS_H(N) (clrsb32(N) - 16)
849
850 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
851 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
852 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
853 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
854
855 #define DO_CLZ_B(N) (clz32(N) - 24)
856 #define DO_CLZ_H(N) (clz32(N) - 16)
857
858 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
859 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
860 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
861 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
862
863 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
864 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
865 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
866 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
867
868 #define DO_CNOT(N) (N == 0)
869
870 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
871 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
872 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
873 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
874
875 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
876
877 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
878 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
879 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
880
881 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N))
882 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N))
883 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N))
884
885 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
886 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
887 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)
888
889 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
890
891 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
892 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
893 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
894
895 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
896 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
897 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N))
898
899 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
900 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
901 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)
902
903 #define DO_NOT(N) (~N)
904
905 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
906 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
907 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
908 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
909
910 #define DO_SXTB(N) ((int8_t)N)
911 #define DO_SXTH(N) ((int16_t)N)
912 #define DO_SXTS(N) ((int32_t)N)
913 #define DO_UXTB(N) ((uint8_t)N)
914 #define DO_UXTH(N) ((uint16_t)N)
915 #define DO_UXTS(N) ((uint32_t)N)
916
917 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
918 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
919 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
920 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
921 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
922 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
923
924 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
925 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
926 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
927 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
928 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
929 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
930
931 #define DO_ABS(N) (N < 0 ? -N : N)
932
933 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
934 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
935 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
936 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
937
938 #define DO_NEG(N) (-N)
939
940 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
941 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
942 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
943 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
944
945 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
946 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
947 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
948
949 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
950 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
951
952 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
953
954 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
955 {
956 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
957 uint64_t *d = vd, *n = vn;
958 uint8_t *pg = vg;
959
960 for (i = 0; i < opr_sz; i += 2) {
961 if (pg[H1(i)] & 1) {
962 uint64_t n0 = n[i + 0];
963 uint64_t n1 = n[i + 1];
964 d[i + 0] = n1;
965 d[i + 1] = n0;
966 }
967 }
968 }
969
970 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
971 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
972 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
973 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
974
975 #define DO_SQABS(X) \
976 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
977 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
978
979 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
980 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
981 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
982 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
983
984 #define DO_SQNEG(X) \
985 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
986 x_ == min_ ? -min_ - 1 : -x_; })
987
988 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
989 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
990 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
991 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
992
993 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
994 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
995
996 /* Three-operand expander, unpredicated, in which the third operand is "wide".
997 */
998 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
999 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1000 { \
1001 intptr_t i, opr_sz = simd_oprsz(desc); \
1002 for (i = 0; i < opr_sz; ) { \
1003 TYPEW mm = *(TYPEW *)(vm + i); \
1004 do { \
1005 TYPE nn = *(TYPE *)(vn + H(i)); \
1006 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1007 i += sizeof(TYPE); \
1008 } while (i & 7); \
1009 } \
1010 }
1011
1012 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1013 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1014 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1015
1016 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1017 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1018 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1019
1020 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1021 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1022 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1023
1024 #undef DO_ZZW
1025
1026 #undef DO_CLS_B
1027 #undef DO_CLS_H
1028 #undef DO_CLZ_B
1029 #undef DO_CLZ_H
1030 #undef DO_CNOT
1031 #undef DO_FABS
1032 #undef DO_FNEG
1033 #undef DO_ABS
1034 #undef DO_NEG
1035 #undef DO_ZPZ
1036 #undef DO_ZPZ_D
1037
1038 /*
1039 * Three-operand expander, unpredicated, in which the two inputs are
1040 * selected from the top or bottom half of the wide column.
1041 */
1042 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1043 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1044 { \
1045 intptr_t i, opr_sz = simd_oprsz(desc); \
1046 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1047 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1048 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1049 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1050 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1051 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1052 } \
1053 }
1054
1055 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1056 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1057 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1058
1059 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1060 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1061 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1062
1063 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1064 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1065 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1066
1067 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1068 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1069 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1070
1071 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1072 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1073 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1074
1075 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1076 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1077 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1078
1079 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1080 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1081 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1082
1083 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1084 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1085 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1086
1087 /* Note that the multiply cannot overflow, but the doubling can. */
1088 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1089 {
1090 int16_t val = n * m;
1091 return DO_SQADD_H(val, val);
1092 }
1093
1094 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1095 {
1096 int32_t val = n * m;
1097 return DO_SQADD_S(val, val);
1098 }
1099
1100 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1101 {
1102 int64_t val = n * m;
1103 return do_sqadd_d(val, val);
1104 }
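/*
 * Worked example (illustrative): do_sqdmull_h(-128, -128) forms the exact
 * product 16384, but doubling it to 32768 overflows int16_t, so DO_SQADD_H
 * saturates the result to INT16_MAX.
 */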
1105
1106 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1107 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1108 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1109
1110 #undef DO_ZZZ_TB
1111
1112 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1113 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1114 { \
1115 intptr_t i, opr_sz = simd_oprsz(desc); \
1116 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1117 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1118 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1119 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1120 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1121 } \
1122 }
1123
1124 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1125 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1126 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1127
1128 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1129 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1130 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1131
1132 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1133 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1134 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1135
1136 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1137 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1138 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1139
1140 #undef DO_ZZZ_WTB
1141
1142 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1143 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1144 { \
1145 intptr_t i, opr_sz = simd_oprsz(desc); \
1146 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1147 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1148 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1149 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1150 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1151 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1152 } \
1153 }
1154
1155 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1156 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1157 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1158 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1159
1160 #undef DO_ZZZ_NTB
1161
1162 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1163 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1164 { \
1165 intptr_t i, opr_sz = simd_oprsz(desc); \
1166 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1167 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1168 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1169 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1170 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1171 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1172 } \
1173 }
1174
1175 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1176 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1177 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1178
1179 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1180 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1181 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1182
1183 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1184 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1185 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1186
1187 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1188 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1189 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1190
1191 #define DO_NMUL(N, M) -(N * M)
1192
1193 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1194 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1195 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1196
1197 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1198 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1199 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1200
1201 #undef DO_ZZZW_ACC
1202
1203 #define DO_XTNB(NAME, TYPE, OP) \
1204 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1205 { \
1206 intptr_t i, opr_sz = simd_oprsz(desc); \
1207 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1208 TYPE nn = *(TYPE *)(vn + i); \
1209 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1210 *(TYPE *)(vd + i) = nn; \
1211 } \
1212 }
1213
1214 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1215 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1216 { \
1217 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1218 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1219 TYPE nn = *(TYPE *)(vn + i); \
1220 *(TYPEN *)(vd + i + odd) = OP(nn); \
1221 } \
1222 }
1223
1224 DO_XTNB(sve2_sqxtnb_h, int16_t, do_ssat_b)
1225 DO_XTNB(sve2_sqxtnb_s, int32_t, do_ssat_h)
1226 DO_XTNB(sve2_sqxtnb_d, int64_t, do_ssat_s)
1227
1228 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, do_ssat_b)
1229 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, do_ssat_h)
1230 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, do_ssat_s)
1231
1232 DO_XTNB(sve2_uqxtnb_h, uint16_t, do_usat_b)
1233 DO_XTNB(sve2_uqxtnb_s, uint32_t, do_usat_h)
1234 DO_XTNB(sve2_uqxtnb_d, uint64_t, do_usat_s)
1235
1236 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, do_usat_b)
1237 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, do_usat_h)
1238 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, do_usat_s)
1239
1240 DO_XTNB(sve2_sqxtunb_h, int16_t, do_usat_b)
1241 DO_XTNB(sve2_sqxtunb_s, int32_t, do_usat_h)
1242 DO_XTNB(sve2_sqxtunb_d, int64_t, do_usat_s)
1243
1244 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, do_usat_b)
1245 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, do_usat_h)
1246 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, do_usat_s)
1247
1248 #undef DO_XTNB
1249 #undef DO_XTNT
1250
1251 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1252 {
1253 intptr_t i, opr_sz = simd_oprsz(desc);
1254 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1255 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1256 uint32_t *a = va, *n = vn;
1257 uint64_t *d = vd, *m = vm;
1258
1259 for (i = 0; i < opr_sz / 8; ++i) {
1260 uint32_t e1 = a[2 * i + H4(0)];
1261 uint32_t e2 = n[2 * i + sel] ^ inv;
1262 uint64_t c = extract64(m[i], 32, 1);
1263 /* Compute and store the entire 33-bit result at once. */
1264 d[i] = c + e1 + e2;
1265 }
1266 }
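/*
 * Note (illustrative): e1 + e2 + c is at most 2^33 - 1, so the whole sum
 * fits in the 64-bit destination element; bit 32 of d[i] is then the
 * carry-out, which a subsequent instruction in an add-with-carry chain
 * reads back as extract64(m[i], 32, 1).
 */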
1267
1268 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1269 {
1270 intptr_t i, opr_sz = simd_oprsz(desc);
1271 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1272 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1273 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1274
1275 for (i = 0; i < opr_sz / 8; i += 2) {
1276 Int128 e1 = int128_make64(a[i]);
1277 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1278 Int128 c = int128_make64(m[i + 1] & 1);
1279 Int128 r = int128_add(int128_add(e1, e2), c);
1280 d[i + 0] = int128_getlo(r);
1281 d[i + 1] = int128_gethi(r);
1282 }
1283 }
1284
1285 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1287 { \
1288 intptr_t i, opr_sz = simd_oprsz(desc); \
1289 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1290 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1291 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1292 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1293 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1294 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1295 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1296 } \
1297 }
1298
1299 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1300 do_sqdmull_h, DO_SQADD_H)
1301 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1302 do_sqdmull_s, DO_SQADD_S)
1303 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1304 do_sqdmull_d, do_sqadd_d)
1305
1306 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1307 do_sqdmull_h, DO_SQSUB_H)
1308 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1309 do_sqdmull_s, DO_SQSUB_S)
1310 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1311 do_sqdmull_d, do_sqsub_d)
1312
1313 #undef DO_SQDMLAL
1314
1315 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1316 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1317 { \
1318 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1319 int rot = simd_data(desc); \
1320 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1321 bool sub_r = rot == 1 || rot == 2; \
1322 bool sub_i = rot >= 2; \
1323 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1324 for (i = 0; i < opr_sz; i += 2) { \
1325 TYPE elt1_a = n[H(i + sel_a)]; \
1326 TYPE elt2_a = m[H(i + sel_a)]; \
1327 TYPE elt2_b = m[H(i + sel_b)]; \
1328 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1329 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1330 } \
1331 }
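/*
 * Rotation decode (illustrative): rot encodes 0, 90, 180 or 270 degrees.
 * sel_a/sel_b select the real or imaginary half of each complex pair, and
 * (sub_r, sub_i) is (false,false), (true,false), (true,true), (false,true)
 * for rot = 0..3, i.e. which of the two partial products are subtracted
 * rather than added.
 */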
1332
1333 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1334
1335 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1336 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1337 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1338 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1339
1340 #define DO_SQRDMLAH_B(N, M, A, S) \
1341 do_sqrdmlah_b(N, M, A, S, true)
1342 #define DO_SQRDMLAH_H(N, M, A, S) \
1343 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1344 #define DO_SQRDMLAH_S(N, M, A, S) \
1345 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1346 #define DO_SQRDMLAH_D(N, M, A, S) \
1347 do_sqrdmlah_d(N, M, A, S, true)
1348
1349 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1350 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1351 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1352 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1353
1354 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1355 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1356 { \
1357 intptr_t i, j, oprsz = simd_oprsz(desc); \
1358 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1359 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1360 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1361 bool sub_r = rot == 1 || rot == 2; \
1362 bool sub_i = rot >= 2; \
1363 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1364 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1365 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1366 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1367 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1368 TYPE elt1_a = n[H(i + j + sel_a)]; \
1369 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1370 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1371 } \
1372 } \
1373 }
1374
1375 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1376 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1377
1378 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1379 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1380
1381 #undef DO_CMLA
1382 #undef DO_CMLA_FUNC
1383 #undef DO_CMLA_IDX_FUNC
1384 #undef DO_SQRDMLAH_B
1385 #undef DO_SQRDMLAH_H
1386 #undef DO_SQRDMLAH_S
1387 #undef DO_SQRDMLAH_D
1388
1389 /* Note N and M are 4 elements bundled into one unit. */
1390 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1391 int sel_a, int sel_b, int sub_i)
1392 {
1393 for (int i = 0; i <= 1; i++) {
1394 int32_t elt1_r = (int8_t)(n >> (16 * i));
1395 int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1396 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1397 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1398
1399 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1400 }
1401 return a;
1402 }
1403
1404 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1405 int sel_a, int sel_b, int sub_i)
1406 {
1407 for (int i = 0; i <= 1; i++) {
1408 int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1409 int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1410 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1411 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1412
1413 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1414 }
1415 return a;
1416 }
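/*
 * Illustrative unpacking (not in the original source): each 32-bit N
 * element holds two complex values as four int8_t bytes {r0, i0, r1, i1};
 * sel_a/sel_b choose the real or imaginary byte of M and sub_i (+1 or -1)
 * applies the rotation, so one call accumulates both complex products into
 * the 32-bit sum (the 64-bit variant works likewise on int16_t parts).
 */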
1417
1418 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1419 void *va, uint32_t desc)
1420 {
1421 int opr_sz = simd_oprsz(desc);
1422 int rot = simd_data(desc);
1423 int sel_a = rot & 1;
1424 int sel_b = sel_a ^ 1;
1425 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1426 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1427
1428 for (int e = 0; e < opr_sz / 4; e++) {
1429 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1430 }
1431 }
1432
1433 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1434 void *va, uint32_t desc)
1435 {
1436 int opr_sz = simd_oprsz(desc);
1437 int rot = simd_data(desc);
1438 int sel_a = rot & 1;
1439 int sel_b = sel_a ^ 1;
1440 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1441 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1442
1443 for (int e = 0; e < opr_sz / 8; e++) {
1444 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1445 }
1446 }
1447
1448 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1449 void *va, uint32_t desc)
1450 {
1451 int opr_sz = simd_oprsz(desc);
1452 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1453 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1454 int sel_a = rot & 1;
1455 int sel_b = sel_a ^ 1;
1456 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1457 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1458
1459 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1460 uint32_t seg_m = m[seg + idx];
1461 for (int e = 0; e < 4; e++) {
1462 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1463 sel_a, sel_b, sub_i);
1464 }
1465 }
1466 }
1467
1468 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1469 void *va, uint32_t desc)
1470 {
1471 int seg, opr_sz = simd_oprsz(desc);
1472 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1473 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1474 int sel_a = rot & 1;
1475 int sel_b = sel_a ^ 1;
1476 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1477 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1478
1479 for (seg = 0; seg < opr_sz / 8; seg += 2) {
1480 uint64_t seg_m = m[seg + idx];
1481 for (int e = 0; e < 2; e++) {
1482 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1483 sel_a, sel_b, sub_i);
1484 }
1485 }
1486 }
1487
1488 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1489 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1490 { \
1491 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1492 intptr_t i, j, idx = simd_data(desc); \
1493 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1494 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1495 TYPE mm = m[i]; \
1496 for (j = 0; j < segment; j++) { \
1497 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1498 } \
1499 } \
1500 }
1501
1502 #define DO_SQRDMLAH_H(N, M, A) \
1503 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1504 #define DO_SQRDMLAH_S(N, M, A) \
1505 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1506 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1507
1508 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1509 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1510 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1511
1512 #define DO_SQRDMLSH_H(N, M, A) \
1513 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1514 #define DO_SQRDMLSH_S(N, M, A) \
1515 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1516 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1517
1518 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1519 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1520 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1521
1522 #undef DO_ZZXZ
1523
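/* Widening indexed multiply-accumulate expander: sel picks the even
 * (bottom) or odd (top) narrow elements of Zn, and idx selects the Zm
 * element within each 128-bit segment.
 */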
1524 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1525 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1526 { \
1527 intptr_t i, j, oprsz = simd_oprsz(desc); \
1528 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1529 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1530 for (i = 0; i < oprsz; i += 16) { \
1531 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1532 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1533 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1534 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1535 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1536 } \
1537 } \
1538 }
1539
1540 #define DO_MLA(N, M, A) (A + N * M)
1541
1542 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1543 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1544 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1545 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1546
1547 #define DO_MLS(N, M, A) (A - N * M)
1548
1549 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1550 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1551 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1552 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1553
1554 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1555 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1556
1557 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1558 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1559
1560 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1561 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1562
1563 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1564 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1565
1566 #undef DO_MLA
1567 #undef DO_MLS
1568 #undef DO_ZZXW
1569
1570 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1571 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1572 { \
1573 intptr_t i, j, oprsz = simd_oprsz(desc); \
1574 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1575 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1576 for (i = 0; i < oprsz; i += 16) { \
1577 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1578 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1579 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1580 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1581 } \
1582 } \
1583 }
1584
1585 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1586 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1587
1588 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1589 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1590
1591 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1592 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1593
1594 #undef DO_ZZX
1595
1596 #define DO_BITPERM(NAME, TYPE, OP) \
1597 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1598 { \
1599 intptr_t i, opr_sz = simd_oprsz(desc); \
1600 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1601 TYPE nn = *(TYPE *)(vn + i); \
1602 TYPE mm = *(TYPE *)(vm + i); \
1603 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1604 } \
1605 }
1606
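/* Pack the data bits selected by mask down towards the least significant
 * end, as for BEXT. E.g. bitextract(0xb6, 0xca, 8): mask bits 1, 3, 6, 7
 * select data bits 1, 0, 0, 1, giving 0x9.
 */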
1607 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1608 {
1609 uint64_t res = 0;
1610 int db, rb = 0;
1611
1612 for (db = 0; db < n; ++db) {
1613 if ((mask >> db) & 1) {
1614 res |= ((data >> db) & 1) << rb;
1615 ++rb;
1616 }
1617 }
1618 return res;
1619 }
1620
1621 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1622 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1623 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1624 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1625
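/* Scatter the low-order data bits to the bit positions selected by mask,
 * as for BDEP. E.g. bitdeposit(0x9, 0xca, 8): data bits 1, 0, 0, 1 land
 * in positions 1, 3, 6, 7, giving 0x82.
 */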
1626 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1627 {
1628 uint64_t res = 0;
1629 int rb, db = 0;
1630
1631 for (rb = 0; rb < n; ++rb) {
1632 if ((mask >> rb) & 1) {
1633 res |= ((data >> db) & 1) << rb;
1634 ++db;
1635 }
1636 }
1637 return res;
1638 }
1639
1640 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1641 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1642 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1643 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1644
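/* Group the bits selected by mask at the bottom of the result, with the
 * unselected bits packed directly above them, as for BGRP.
 * E.g. bitgroup(0xb6, 0xca, 8) == 0xe9: selected bits 0x9, others 0xe.
 */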
1645 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1646 {
1647 uint64_t resm = 0, resu = 0;
1648 int db, rbm = 0, rbu = 0;
1649
1650 for (db = 0; db < n; ++db) {
1651 uint64_t val = (data >> db) & 1;
1652 if ((mask >> db) & 1) {
1653 resm |= val << rbm++;
1654 } else {
1655 resu |= val << rbu++;
1656 }
1657 }
1658
1659 return resm | (resu << rbm);
1660 }
1661
1662 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1663 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1664 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1665 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1666
1667 #undef DO_BITPERM
1668
1669 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1670 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1671 { \
1672 intptr_t i, opr_sz = simd_oprsz(desc); \
1673 int sub_r = simd_data(desc); \
1674 if (sub_r) { \
1675 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1676 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1677 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1678 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1679 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1680 acc_r = ADD_OP(acc_r, el2_i); \
1681 acc_i = SUB_OP(acc_i, el2_r); \
1682 *(TYPE *)(vd + H(i)) = acc_r; \
1683 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1684 } \
1685 } else { \
1686 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1687 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1688 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1689 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1690 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1691 acc_r = SUB_OP(acc_r, el2_i); \
1692 acc_i = ADD_OP(acc_i, el2_r); \
1693 *(TYPE *)(vd + H(i)) = acc_r; \
1694 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1695 } \
1696 } \
1697 }
1698
1699 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1700 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1701 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1702 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1703
1704 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1705 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1706 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1707 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1708
1709 #undef DO_CADD
1710
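/* Widening shift-left-by-immediate: bit 0 of simd_data picks the even
 * (bottom) or odd (top) narrow elements; the remaining bits give the
 * shift applied after widening.
 */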
1711 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1712 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1713 { \
1714 intptr_t i, opr_sz = simd_oprsz(desc); \
1715 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1716 int shift = simd_data(desc) >> 1; \
1717 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1718 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1719 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1720 } \
1721 }
1722
1723 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1724 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1725 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1726
1727 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1728 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1729 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1730
1731 #undef DO_ZZI_SHLL
1732
1733 /* Two-operand reduction expander, controlled by a predicate.
1734 * The difference between TYPERED and TYPERET has to do with
1735 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1736 * but TYPERET must be unsigned so that e.g. a 32-bit value
1737 * is not sign-extended to the ABI uint64_t return type.
1738 */
1739 /* ??? If we were to vectorize this by hand the reduction ordering
1740 * would change. For integer operands, this is perfectly fine.
1741 */
1742 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1743 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1744 { \
1745 intptr_t i, opr_sz = simd_oprsz(desc); \
1746 TYPERED ret = INIT; \
1747 for (i = 0; i < opr_sz; ) { \
1748 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1749 do { \
1750 if (pg & 1) { \
1751 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1752 ret = OP(ret, nn); \
1753 } \
1754 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1755 } while (i & 15); \
1756 } \
1757 return (TYPERET)ret; \
1758 }
1759
1760 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1761 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1762 { \
1763 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1764 TYPEE *n = vn; \
1765 uint8_t *pg = vg; \
1766 TYPER ret = INIT; \
1767 for (i = 0; i < opr_sz; i += 1) { \
1768 if (pg[H1(i)] & 1) { \
1769 TYPEE nn = n[i]; \
1770 ret = OP(ret, nn); \
1771 } \
1772 } \
1773 return ret; \
1774 }
1775
1776 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1777 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1778 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1779 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1780
1781 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1782 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1783 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1784 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1785
1786 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1787 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1788 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1789 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1790
1791 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1792 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1793 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1794
1795 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1796 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1797 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1798 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1799
1800 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1801 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1802 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1803 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1804
1805 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1806 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1807 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1808 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1809
1810 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1811 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1812 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1813 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1814
1815 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1816 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1817 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1818 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1819
1820 #undef DO_VPZ
1821 #undef DO_VPZ_D
1822
1823 #define DO_VPQ(NAME, TYPE, H, INIT, OP) \
1824 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1825 { \
1826 TYPE tmp[16 / sizeof(TYPE)] = { [0 ... 16 / sizeof(TYPE) - 1] = INIT }; \
1827 TYPE *n = vn; uint16_t *g = vg; \
1828 uintptr_t oprsz = simd_oprsz(desc); \
1829 uintptr_t nseg = oprsz / 16, nsegelt = 16 / sizeof(TYPE); \
1830 for (uintptr_t s = 0; s < nseg; s++) { \
1831 uint16_t pg = g[H2(s)]; \
1832 for (uintptr_t e = 0; e < nsegelt; e++, pg >>= sizeof(TYPE)) { \
1833 if (pg & 1) { \
1834 tmp[e] = OP(tmp[H(e)], n[s * nsegelt + H(e)]); \
1835 } \
1836 } \
1837 } \
1838 memcpy(vd, tmp, 16); \
1839 clear_tail(vd, 16, simd_maxsz(desc)); \
1840 }
1841
1842 DO_VPQ(sve2p1_addqv_b, uint8_t, H1, 0, DO_ADD)
1843 DO_VPQ(sve2p1_addqv_h, uint16_t, H2, 0, DO_ADD)
1844 DO_VPQ(sve2p1_addqv_s, uint32_t, H4, 0, DO_ADD)
1845 DO_VPQ(sve2p1_addqv_d, uint64_t, H8, 0, DO_ADD)
1846
1847 DO_VPQ(sve2p1_smaxqv_b, int8_t, H1, INT8_MIN, DO_MAX)
1848 DO_VPQ(sve2p1_smaxqv_h, int16_t, H2, INT16_MIN, DO_MAX)
1849 DO_VPQ(sve2p1_smaxqv_s, int32_t, H4, INT32_MIN, DO_MAX)
1850 DO_VPQ(sve2p1_smaxqv_d, int64_t, H8, INT64_MIN, DO_MAX)
1851
1852 DO_VPQ(sve2p1_sminqv_b, int8_t, H1, INT8_MAX, DO_MIN)
1853 DO_VPQ(sve2p1_sminqv_h, int16_t, H2, INT16_MAX, DO_MIN)
1854 DO_VPQ(sve2p1_sminqv_s, int32_t, H4, INT32_MAX, DO_MIN)
1855 DO_VPQ(sve2p1_sminqv_d, int64_t, H8, INT64_MAX, DO_MIN)
1856
1857 DO_VPQ(sve2p1_umaxqv_b, uint8_t, H1, 0, DO_MAX)
1858 DO_VPQ(sve2p1_umaxqv_h, uint16_t, H2, 0, DO_MAX)
1859 DO_VPQ(sve2p1_umaxqv_s, uint32_t, H4, 0, DO_MAX)
1860 DO_VPQ(sve2p1_umaxqv_d, uint64_t, H8, 0, DO_MAX)
1861
1862 DO_VPQ(sve2p1_uminqv_b, uint8_t, H1, -1, DO_MIN)
1863 DO_VPQ(sve2p1_uminqv_h, uint16_t, H2, -1, DO_MIN)
1864 DO_VPQ(sve2p1_uminqv_s, uint32_t, H4, -1, DO_MIN)
1865 DO_VPQ(sve2p1_uminqv_d, uint64_t, H8, -1, DO_MIN)
1866
1867 #undef DO_VPQ
1868
1869 /* Two vector operand, one scalar operand, unpredicated. */
1870 #define DO_ZZI(NAME, TYPE, OP) \
1871 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1872 { \
1873 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1874 TYPE s = s64, *d = vd, *n = vn; \
1875 for (i = 0; i < opr_sz; ++i) { \
1876 d[i] = OP(n[i], s); \
1877 } \
1878 }
1879
1880 #define DO_SUBR(X, Y) (Y - X)
1881
1882 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1883 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1884 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1885 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1886
1887 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1888 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1889 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1890 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1891
1892 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1893 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1894 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1895 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1896
1897 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1898 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1899 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1900 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1901
1902 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1903 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1904 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1905 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1906
1907 #undef DO_ZZI
1908
1909 #define DO_LOGIC_QV(NAME, SUFF, INIT, VOP, POP) \
1910 void HELPER(NAME ## _ ## SUFF)(void *vd, void *vn, void *vg, uint32_t desc) \
1911 { \
1912 unsigned seg = simd_oprsz(desc) / 16; \
1913 uint64_t r0 = INIT, r1 = INIT; \
1914 for (unsigned s = 0; s < seg; s++) { \
1915 uint64_t p0 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2))); \
1916 uint64_t p1 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2 + 1))); \
1917 uint64_t v0 = *(uint64_t *)(vn + s * 16); \
1918 uint64_t v1 = *(uint64_t *)(vn + s * 16 + 8); \
1919 v0 = POP(v0, p0), v1 = POP(v1, p1); \
1920 r0 = VOP(r0, v0), r1 = VOP(r1, v1); \
1921 } \
1922 *(uint64_t *)(vd + 0) = r0; \
1923 *(uint64_t *)(vd + 8) = r1; \
1924 clear_tail(vd, 16, simd_maxsz(desc)); \
1925 }
1926
1927 DO_LOGIC_QV(sve2p1_orqv, b, 0, DO_ORR, DO_AND)
1928 DO_LOGIC_QV(sve2p1_orqv, h, 0, DO_ORR, DO_AND)
1929 DO_LOGIC_QV(sve2p1_orqv, s, 0, DO_ORR, DO_AND)
1930 DO_LOGIC_QV(sve2p1_orqv, d, 0, DO_ORR, DO_AND)
1931
1932 DO_LOGIC_QV(sve2p1_eorqv, b, 0, DO_EOR, DO_AND)
1933 DO_LOGIC_QV(sve2p1_eorqv, h, 0, DO_EOR, DO_AND)
1934 DO_LOGIC_QV(sve2p1_eorqv, s, 0, DO_EOR, DO_AND)
1935 DO_LOGIC_QV(sve2p1_eorqv, d, 0, DO_EOR, DO_AND)
1936
1937 DO_LOGIC_QV(sve2p1_andqv, b, -1, DO_AND, DO_ORC)
1938 DO_LOGIC_QV(sve2p1_andqv, h, -1, DO_AND, DO_ORC)
1939 DO_LOGIC_QV(sve2p1_andqv, s, -1, DO_AND, DO_ORC)
1940 DO_LOGIC_QV(sve2p1_andqv, d, -1, DO_AND, DO_ORC)
1941
1942 #undef DO_LOGIC_QV
1943
1944 #undef DO_AND
1945 #undef DO_ORR
1946 #undef DO_EOR
1947 #undef DO_BIC
1948 #undef DO_ORC
1949 #undef DO_ADD
1950 #undef DO_SUB
1951 #undef DO_MAX
1952 #undef DO_MIN
1953 #undef DO_ABD
1954 #undef DO_MUL
1955 #undef DO_DIV
1956 #undef DO_ASR
1957 #undef DO_LSR
1958 #undef DO_LSL
1959 #undef DO_SUBR
1960
1961 /* Similar to the ARM LastActiveElement pseudocode function, except the
1962 result is multiplied by the element size. This includes the not found
1963 indication; e.g. not found for esz=3 is -8. */
1964 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1965 {
1966 uint64_t mask = pred_esz_masks[esz];
1967 intptr_t i = words;
1968
1969 do {
1970 uint64_t this_g = g[--i] & mask;
1971 if (this_g) {
1972 return i * 64 + (63 - clz64(this_g));
1973 }
1974 } while (i > 0);
1975 return (intptr_t)-1 << esz;
1976 }
1977
1978 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1979 {
1980 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1981 uint32_t flags = PREDTEST_INIT;
1982 uint64_t *d = vd, *g = vg;
1983 intptr_t i = 0;
1984
1985 do {
1986 uint64_t this_d = d[i];
1987 uint64_t this_g = g[i];
1988
1989 if (this_g) {
1990 if (!(flags & 4)) {
1991 /* Set in D the first bit of G. */
1992 this_d |= this_g & -this_g;
1993 d[i] = this_d;
1994 }
1995 flags = iter_predtest_fwd(this_d, this_g, flags);
1996 }
1997 } while (++i < words);
1998
1999 return flags;
2000 }
2001
2002 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
2003 {
2004 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2005 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2006 uint32_t flags = PREDTEST_INIT;
2007 uint64_t *d = vd, *g = vg, esz_mask;
2008 intptr_t i, next;
2009
2010 next = last_active_element(vd, words, esz) + (1 << esz);
2011 esz_mask = pred_esz_masks[esz];
2012
2013 /* Similar to the pseudocode for pnext, but scaled by ESZ
2014 so that we find the correct bit. */
2015 if (next < words * 64) {
2016 uint64_t mask = -1;
2017
2018 if (next & 63) {
2019 mask = ~((1ull << (next & 63)) - 1);
2020 next &= -64;
2021 }
2022 do {
2023 uint64_t this_g = g[next / 64] & esz_mask & mask;
2024 if (this_g != 0) {
2025 next = (next & -64) + ctz64(this_g);
2026 break;
2027 }
2028 next += 64;
2029 mask = -1;
2030 } while (next < words * 64);
2031 }
2032
2033 i = 0;
2034 do {
2035 uint64_t this_d = 0;
2036 if (i == next / 64) {
2037 this_d = 1ull << (next & 63);
2038 }
2039 d[i] = this_d;
2040 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
2041 } while (++i < words);
2042
2043 return flags;
2044 }
2045
2046 /*
2047 * Copy Zn into Zd, and store zero into inactive elements.
2048 * If inv, store zeros into the active elements.
2049 */
2050 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
2051 {
2052 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2053 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2054 uint64_t *d = vd, *n = vn;
2055 uint8_t *pg = vg;
2056
2057 for (i = 0; i < opr_sz; i += 1) {
2058 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
2059 }
2060 }
2061
2062 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
2063 {
2064 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2065 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2066 uint64_t *d = vd, *n = vn;
2067 uint8_t *pg = vg;
2068
2069 for (i = 0; i < opr_sz; i += 1) {
2070 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
2071 }
2072 }
2073
2074 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2075 {
2076 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2077 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2078 uint64_t *d = vd, *n = vn;
2079 uint8_t *pg = vg;
2080
2081 for (i = 0; i < opr_sz; i += 1) {
2082 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2083 }
2084 }
2085
2086 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2087 {
2088 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2089 uint64_t *d = vd, *n = vn;
2090 uint8_t *pg = vg;
2091 uint8_t inv = simd_data(desc);
2092
2093 for (i = 0; i < opr_sz; i += 1) {
2094 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2095 }
2096 }
2097
2098 /* Three-operand expander, immediate operand, controlled by a predicate.
2099 */
2100 #define DO_ZPZI(NAME, TYPE, H, OP) \
2101 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2102 { \
2103 intptr_t i, opr_sz = simd_oprsz(desc); \
2104 TYPE imm = simd_data(desc); \
2105 for (i = 0; i < opr_sz; ) { \
2106 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2107 do { \
2108 if (pg & 1) { \
2109 TYPE nn = *(TYPE *)(vn + H(i)); \
2110 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
2111 } \
2112 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2113 } while (i & 15); \
2114 } \
2115 }
2116
2117 /* Similarly, specialized for 64-bit operands. */
2118 #define DO_ZPZI_D(NAME, TYPE, OP) \
2119 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2120 { \
2121 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2122 TYPE *d = vd, *n = vn; \
2123 TYPE imm = simd_data(desc); \
2124 uint8_t *pg = vg; \
2125 for (i = 0; i < opr_sz; i += 1) { \
2126 if (pg[H1(i)] & 1) { \
2127 TYPE nn = n[i]; \
2128 d[i] = OP(nn, imm); \
2129 } \
2130 } \
2131 }
2132
2133 #define DO_SHR(N, M) (N >> M)
2134 #define DO_SHL(N, M) (N << M)
2135
2136 /* Arithmetic shift right for division. This rounds negative numbers
2137 toward zero as per signed division. Therefore before shifting,
2138 when N is negative, add 2**M-1. */
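/* E.g. N = -7, M = 2: (-7 + 3) >> 2 = -1, matching -7 / 4 truncated
 * toward zero, where a plain arithmetic shift would give -2.
 */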
2139 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
2140
2141 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2142 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2143 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2144 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2145
2146 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2147 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2148 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2149 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2150
2151 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2152 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2153 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2154 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2155
2156 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2157 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2158 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2159 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2160
2161 /* SVE2 bitwise shift by immediate */
2162 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2163 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2164 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2165 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2166
2167 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2168 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2169 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2170 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2171
2172 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2173 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2174 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2175 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2176
2177 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2178 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2179 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2180 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2181
2182 #define do_suqrshl_b(n, m) \
2183 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2184 #define do_suqrshl_h(n, m) \
2185 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2186 #define do_suqrshl_s(n, m) \
2187 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2188 #define do_suqrshl_d(n, m) \
2189 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2190
2191 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2192 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2193 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2194 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2195
2196 #undef DO_ASRD
2197 #undef DO_ZPZI
2198 #undef DO_ZPZI_D
2199
2200 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2201 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2202 { \
2203 intptr_t i, opr_sz = simd_oprsz(desc); \
2204 int shift = simd_data(desc); \
2205 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2206 TYPEW nn = *(TYPEW *)(vn + i); \
2207 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2208 } \
2209 }
2210
2211 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2212 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2213 { \
2214 intptr_t i, opr_sz = simd_oprsz(desc); \
2215 int shift = simd_data(desc); \
2216 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2217 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2218 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2219 } \
2220 }
2221
2222 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2223 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2224 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2225
2226 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2227 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2228 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2229
2230 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2231 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2232 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2233
2234 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2235 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2236 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2237
2238 #define DO_SQSHRUN_H(x, sh) do_usat_b((int64_t)(x) >> sh)
2239 #define DO_SQSHRUN_S(x, sh) do_usat_h((int64_t)(x) >> sh)
2240 #define DO_SQSHRUN_D(x, sh) do_usat_s((int64_t)(x) >> (sh < 64 ? sh : 63))
2241
2242 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2243 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2244 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2245
2246 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2247 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2248 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2249
2250 #define DO_SQRSHRUN_H(x, sh) do_usat_b(do_srshr(x, sh))
2251 #define DO_SQRSHRUN_S(x, sh) do_usat_h(do_srshr(x, sh))
2252 #define DO_SQRSHRUN_D(x, sh) do_usat_s(do_srshr(x, sh))
2253
2254 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2255 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2256 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2257
2258 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2259 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2260 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2261
2262 #define DO_SQSHRN_H(x, sh) do_ssat_b(x >> sh)
2263 #define DO_SQSHRN_S(x, sh) do_ssat_h(x >> sh)
2264 #define DO_SQSHRN_D(x, sh) do_ssat_s(x >> sh)
2265
2266 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2267 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2268 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2269
2270 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2271 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2272 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2273
2274 #define DO_SQRSHRN_H(x, sh) do_ssat_b(do_srshr(x, sh))
2275 #define DO_SQRSHRN_S(x, sh) do_ssat_h(do_srshr(x, sh))
2276 #define DO_SQRSHRN_D(x, sh) do_ssat_s(do_srshr(x, sh))
2277
2278 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2279 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2280 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2281
2282 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2283 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2284 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2285
2286 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2287 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2288 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2289
2290 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2291 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2292 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2293
2294 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2295 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2296 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2297
2298 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2299 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2300 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2301
2302 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2303 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2304 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2305
2306 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2307 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2308 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2309
2310 #undef DO_SHRNB
2311 #undef DO_SHRNT
2312
2313 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2314 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2315 { \
2316 intptr_t i, opr_sz = simd_oprsz(desc); \
2317 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2318 TYPEW nn = *(TYPEW *)(vn + i); \
2319 TYPEW mm = *(TYPEW *)(vm + i); \
2320 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2321 } \
2322 }
2323
2324 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2325 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2326 { \
2327 intptr_t i, opr_sz = simd_oprsz(desc); \
2328 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2329 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2330 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2331 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2332 } \
2333 }
2334
2335 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2336 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2337 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2338 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
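/* The R* forms add 1 << (SH - 1) before the shift, so the narrowed
 * high half is rounded to nearest rather than truncated.
 */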
2339
2340 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2341 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2342 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2343
2344 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2345 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2346 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2347
2348 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2349 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2350 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2351
2352 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2353 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2354 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2355
2356 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2357 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2358 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2359
2360 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2361 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2362 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2363
2364 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2365 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2366 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2367
2368 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2369 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2370 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2371
2372 #undef DO_RSUBHN
2373 #undef DO_SUBHN
2374 #undef DO_RADDHN
2375 #undef DO_ADDHN
2376
2377 #undef DO_BINOPNB
2378
2379 /* Fully general four-operand expander, controlled by a predicate.
2380 */
2381 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2382 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2383 void *vg, uint32_t desc) \
2384 { \
2385 intptr_t i, opr_sz = simd_oprsz(desc); \
2386 for (i = 0; i < opr_sz; ) { \
2387 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2388 do { \
2389 if (pg & 1) { \
2390 TYPE nn = *(TYPE *)(vn + H(i)); \
2391 TYPE mm = *(TYPE *)(vm + H(i)); \
2392 TYPE aa = *(TYPE *)(va + H(i)); \
2393 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2394 } \
2395 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2396 } while (i & 15); \
2397 } \
2398 }
2399
2400 /* Similarly, specialized for 64-bit operands. */
2401 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2402 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2403 void *vg, uint32_t desc) \
2404 { \
2405 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2406 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2407 uint8_t *pg = vg; \
2408 for (i = 0; i < opr_sz; i += 1) { \
2409 if (pg[H1(i)] & 1) { \
2410 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2411 d[i] = OP(aa, nn, mm); \
2412 } \
2413 } \
2414 }
2415
2416 #define DO_MLA(A, N, M) (A + N * M)
2417 #define DO_MLS(A, N, M) (A - N * M)
2418
2419 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2420 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2421
2422 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2423 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2424
2425 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2426 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2427
2428 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2429 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2430
2431 #undef DO_MLA
2432 #undef DO_MLS
2433 #undef DO_ZPZZZ
2434 #undef DO_ZPZZZ_D
2435
2436 void HELPER(sve_index_b)(void *vd, uint32_t start,
2437 uint32_t incr, uint32_t desc)
2438 {
2439 intptr_t i, opr_sz = simd_oprsz(desc);
2440 uint8_t *d = vd;
2441 for (i = 0; i < opr_sz; i += 1) {
2442 d[H1(i)] = start + i * incr;
2443 }
2444 }
2445
2446 void HELPER(sve_index_h)(void *vd, uint32_t start,
2447 uint32_t incr, uint32_t desc)
2448 {
2449 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2450 uint16_t *d = vd;
2451 for (i = 0; i < opr_sz; i += 1) {
2452 d[H2(i)] = start + i * incr;
2453 }
2454 }
2455
2456 void HELPER(sve_index_s)(void *vd, uint32_t start,
2457 uint32_t incr, uint32_t desc)
2458 {
2459 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2460 uint32_t *d = vd;
2461 for (i = 0; i < opr_sz; i += 1) {
2462 d[H4(i)] = start + i * incr;
2463 }
2464 }
2465
2466 void HELPER(sve_index_d)(void *vd, uint64_t start,
2467 uint64_t incr, uint32_t desc)
2468 {
2469 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2470 uint64_t *d = vd;
2471 for (i = 0; i < opr_sz; i += 1) {
2472 d[i] = start + i * incr;
2473 }
2474 }
2475
2476 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2477 {
2478 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2479 uint32_t sh = simd_data(desc);
2480 uint32_t *d = vd, *n = vn, *m = vm;
2481 for (i = 0; i < opr_sz; i += 1) {
2482 d[i] = n[i] + (m[i] << sh);
2483 }
2484 }
2485
2486 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2487 {
2488 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2489 uint64_t sh = simd_data(desc);
2490 uint64_t *d = vd, *n = vn, *m = vm;
2491 for (i = 0; i < opr_sz; i += 1) {
2492 d[i] = n[i] + (m[i] << sh);
2493 }
2494 }
2495
2496 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2497 {
2498 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2499 uint64_t sh = simd_data(desc);
2500 uint64_t *d = vd, *n = vn, *m = vm;
2501 for (i = 0; i < opr_sz; i += 1) {
2502 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2503 }
2504 }
2505
2506 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2507 {
2508 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2509 uint64_t sh = simd_data(desc);
2510 uint64_t *d = vd, *n = vn, *m = vm;
2511 for (i = 0; i < opr_sz; i += 1) {
2512 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2513 }
2514 }
2515
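/* FEXPA: the low 5 (half) or 6 (single/double) bits of each element
 * index the coefficient table, which supplies the fraction field of
 * the result; the following bits are copied into the exponent field.
 */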
2516 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2517 {
2518 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2519 static const uint16_t coeff[] = {
2520 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2521 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2522 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2523 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2524 };
2525 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2526 uint16_t *d = vd, *n = vn;
2527
2528 for (i = 0; i < opr_sz; i++) {
2529 uint16_t nn = n[i];
2530 intptr_t idx = extract32(nn, 0, 5);
2531 uint16_t exp = extract32(nn, 5, 5);
2532 d[i] = coeff[idx] | (exp << 10);
2533 }
2534 }
2535
2536 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2537 {
2538 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2539 static const uint32_t coeff[] = {
2540 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2541 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2542 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2543 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2544 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2545 0x1ef532, 0x20b051, 0x227043, 0x243516,
2546 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2547 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2548 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2549 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2550 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2551 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2552 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2553 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2554 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2555 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2556 };
2557 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2558 uint32_t *d = vd, *n = vn;
2559
2560 for (i = 0; i < opr_sz; i++) {
2561 uint32_t nn = n[i];
2562 intptr_t idx = extract32(nn, 0, 6);
2563 uint32_t exp = extract32(nn, 6, 8);
2564 d[i] = coeff[idx] | (exp << 23);
2565 }
2566 }
2567
2568 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2569 {
2570 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2571 static const uint64_t coeff[] = {
2572 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2573 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2574 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2575 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2576 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2577 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2578 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2579 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2580 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2581 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2582 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2583 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2584 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2585 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2586 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2587 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2588 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2589 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2590 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2591 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2592 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2593 0xFA7C1819E90D8ull,
2594 };
2595 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2596 uint64_t *d = vd, *n = vn;
2597
2598 for (i = 0; i < opr_sz; i++) {
2599 uint64_t nn = n[i];
2600 intptr_t idx = extract32(nn, 0, 6);
2601 uint64_t exp = extract32(nn, 6, 11);
2602 d[i] = coeff[idx] | (exp << 52);
2603 }
2604 }
2605
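/* FTSSEL: if bit 0 of the Zm element is set, the result is 1.0 rather
 * than the Zn element; if bit 1 is set, the result is negated.
 */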
2606 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2607 {
2608 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2609 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2610 uint16_t *d = vd, *n = vn, *m = vm;
2611 for (i = 0; i < opr_sz; i += 1) {
2612 uint16_t nn = n[i];
2613 uint16_t mm = m[i];
2614 if (mm & 1) {
2615 nn = float16_one;
2616 }
2617 if (mm & 2) {
2618 nn = float16_maybe_ah_chs(nn, fpcr_ah);
2619 }
2620 d[i] = nn;
2621 }
2622 }
2623
2624 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2625 {
2626 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2627 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2628 uint32_t *d = vd, *n = vn, *m = vm;
2629 for (i = 0; i < opr_sz; i += 1) {
2630 uint32_t nn = n[i];
2631 uint32_t mm = m[i];
2632 if (mm & 1) {
2633 nn = float32_one;
2634 }
2635 if (mm & 2) {
2636 nn = float32_maybe_ah_chs(nn, fpcr_ah);
2637 }
2638 d[i] = nn;
2639 }
2640 }
2641
2642 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2643 {
2644 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2645 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2646 uint64_t *d = vd, *n = vn, *m = vm;
2647 for (i = 0; i < opr_sz; i += 1) {
2648 uint64_t nn = n[i];
2649 uint64_t mm = m[i];
2650 if (mm & 1) {
2651 nn = float64_one;
2652 }
2653 if (mm & 2) {
2654 nn = float64_maybe_ah_chs(nn, fpcr_ah);
2655 }
2656 d[i] = nn;
2657 }
2658 }
2659
2660 /*
2661 * Signed saturating addition with scalar operand.
2662 */
2663
2664 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2665 {
2666 intptr_t i, oprsz = simd_oprsz(desc);
2667
2668 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2669 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2670 }
2671 }
2672
2673 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2674 {
2675 intptr_t i, oprsz = simd_oprsz(desc);
2676
2677 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2678 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2679 }
2680 }
2681
2682 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2683 {
2684 intptr_t i, oprsz = simd_oprsz(desc);
2685
2686 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2687 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2688 }
2689 }
2690
2691 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2692 {
2693 intptr_t i, oprsz = simd_oprsz(desc);
2694
2695 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2696 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2697 }
2698 }
2699
2700 /*
2701 * Unsigned saturating addition with scalar operand.
2702 */
2703
2704 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2705 {
2706 intptr_t i, oprsz = simd_oprsz(desc);
2707
2708 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2709 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2710 }
2711 }
2712
2713 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2714 {
2715 intptr_t i, oprsz = simd_oprsz(desc);
2716
2717 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2718 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2719 }
2720 }
2721
2722 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2723 {
2724 intptr_t i, oprsz = simd_oprsz(desc);
2725
2726 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2727 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2728 }
2729 }
2730
2731 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2732 {
2733 intptr_t i, oprsz = simd_oprsz(desc);
2734
2735 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2736 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2737 }
2738 }
2739
2740 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2741 {
2742 intptr_t i, oprsz = simd_oprsz(desc);
2743
2744 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2745 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2746 }
2747 }
2748
2749 /* Two operand predicated copy immediate with merge. All valid immediates
2750 * can fit within 17 signed bits in the simd_data field.
2751 */
2752 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2753 uint64_t mm, uint32_t desc)
2754 {
2755 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2756 uint64_t *d = vd, *n = vn;
2757 uint8_t *pg = vg;
2758
2759 mm = dup_const(MO_8, mm);
2760 for (i = 0; i < opr_sz; i += 1) {
2761 uint64_t nn = n[i];
2762 uint64_t pp = expand_pred_b(pg[H1(i)]);
2763 d[i] = (mm & pp) | (nn & ~pp);
2764 }
2765 }
2766
2767 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2768 uint64_t mm, uint32_t desc)
2769 {
2770 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2771 uint64_t *d = vd, *n = vn;
2772 uint8_t *pg = vg;
2773
2774 mm = dup_const(MO_16, mm);
2775 for (i = 0; i < opr_sz; i += 1) {
2776 uint64_t nn = n[i];
2777 uint64_t pp = expand_pred_h(pg[H1(i)]);
2778 d[i] = (mm & pp) | (nn & ~pp);
2779 }
2780 }
2781
2782 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2783 uint64_t mm, uint32_t desc)
2784 {
2785 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2786 uint64_t *d = vd, *n = vn;
2787 uint8_t *pg = vg;
2788
2789 mm = dup_const(MO_32, mm);
2790 for (i = 0; i < opr_sz; i += 1) {
2791 uint64_t nn = n[i];
2792 uint64_t pp = expand_pred_s(pg[H1(i)]);
2793 d[i] = (mm & pp) | (nn & ~pp);
2794 }
2795 }
2796
2797 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2798 uint64_t mm, uint32_t desc)
2799 {
2800 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2801 uint64_t *d = vd, *n = vn;
2802 uint8_t *pg = vg;
2803
2804 for (i = 0; i < opr_sz; i += 1) {
2805 uint64_t nn = n[i];
2806 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2807 }
2808 }
2809
2810 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2811 {
2812 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2813 uint64_t *d = vd;
2814 uint8_t *pg = vg;
2815
2816 val = dup_const(MO_8, val);
2817 for (i = 0; i < opr_sz; i += 1) {
2818 d[i] = val & expand_pred_b(pg[H1(i)]);
2819 }
2820 }
2821
2822 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2823 {
2824 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2825 uint64_t *d = vd;
2826 uint8_t *pg = vg;
2827
2828 val = dup_const(MO_16, val);
2829 for (i = 0; i < opr_sz; i += 1) {
2830 d[i] = val & expand_pred_h(pg[H1(i)]);
2831 }
2832 }
2833
2834 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2835 {
2836 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2837 uint64_t *d = vd;
2838 uint8_t *pg = vg;
2839
2840 val = dup_const(MO_32, val);
2841 for (i = 0; i < opr_sz; i += 1) {
2842 d[i] = val & expand_pred_s(pg[H1(i)]);
2843 }
2844 }
2845
2846 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2847 {
2848 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2849 uint64_t *d = vd;
2850 uint8_t *pg = vg;
2851
2852 for (i = 0; i < opr_sz; i += 1) {
2853 d[i] = (pg[H1(i)] & 1 ? val : 0);
2854 }
2855 }
2856
2857 /* Big-endian hosts need to frob the byte indices. If the copy
2858  * happens to be 8-byte aligned, then no frobbing is necessary.
2859 */
2860 static void swap_memmove(void *vd, void *vs, size_t n)
2861 {
2862 uintptr_t d = (uintptr_t)vd;
2863 uintptr_t s = (uintptr_t)vs;
2864 uintptr_t o = (d | s | n) & 7;
2865 size_t i;
2866
2867 #if !HOST_BIG_ENDIAN
2868 o = 0;
2869 #endif
2870 switch (o) {
2871 case 0:
2872 memmove(vd, vs, n);
2873 break;
2874
2875 case 4:
2876 if (d < s || d >= s + n) {
2877 for (i = 0; i < n; i += 4) {
2878 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2879 }
2880 } else {
2881 for (i = n; i > 0; ) {
2882 i -= 4;
2883 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2884 }
2885 }
2886 break;
2887
2888 case 2:
2889 case 6:
2890 if (d < s || d >= s + n) {
2891 for (i = 0; i < n; i += 2) {
2892 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2893 }
2894 } else {
2895 for (i = n; i > 0; ) {
2896 i -= 2;
2897 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2898 }
2899 }
2900 break;
2901
2902 default:
2903 if (d < s || d >= s + n) {
2904 for (i = 0; i < n; i++) {
2905 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2906 }
2907 } else {
2908 for (i = n; i > 0; ) {
2909 i -= 1;
2910 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2911 }
2912 }
2913 break;
2914 }
2915 }
2916
2917 /* Similarly for memset of 0. */
2918 static void swap_memzero(void *vd, size_t n)
2919 {
2920 uintptr_t d = (uintptr_t)vd;
2921 uintptr_t o = (d | n) & 7;
2922 size_t i;
2923
2924 /* Usually, the first bit of a predicate is set, so N is 0. */
2925 if (likely(n == 0)) {
2926 return;
2927 }
2928
2929 #if !HOST_BIG_ENDIAN
2930 o = 0;
2931 #endif
2932 switch (o) {
2933 case 0:
2934 memset(vd, 0, n);
2935 break;
2936
2937 case 4:
2938 for (i = 0; i < n; i += 4) {
2939 *(uint32_t *)H1_4(d + i) = 0;
2940 }
2941 break;
2942
2943 case 2:
2944 case 6:
2945 for (i = 0; i < n; i += 2) {
2946 *(uint16_t *)H1_2(d + i) = 0;
2947 }
2948 break;
2949
2950 default:
2951 for (i = 0; i < n; i++) {
2952 *(uint8_t *)H1(d + i) = 0;
2953 }
2954 break;
2955 }
2956 }
2957
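/* EXT: copy opr_sz bytes from the concatenation Zn:Zm, starting at
 * byte offset n_ofs into Zn.
 */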
2958 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2959 {
2960 intptr_t opr_sz = simd_oprsz(desc);
2961 size_t n_ofs = simd_data(desc);
2962 size_t n_siz = opr_sz - n_ofs;
2963
2964 if (vd != vm) {
2965 swap_memmove(vd, vn + n_ofs, n_siz);
2966 swap_memmove(vd + n_siz, vm, n_ofs);
2967 } else if (vd != vn) {
2968 swap_memmove(vd + n_siz, vd, n_ofs);
2969 swap_memmove(vd, vn + n_ofs, n_siz);
2970 } else {
2971 /* vd == vn == vm. Need temp space. */
2972 ARMVectorReg tmp;
2973 swap_memmove(&tmp, vm, n_ofs);
2974 swap_memmove(vd, vd + n_ofs, n_siz);
2975 memcpy(vd + n_siz, &tmp, n_ofs);
2976 }
2977 }
2978
2979 #define DO_INSR(NAME, TYPE, H) \
2980 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2981 { \
2982 intptr_t opr_sz = simd_oprsz(desc); \
2983 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2984 *(TYPE *)(vd + H(0)) = val; \
2985 }
2986
2987 DO_INSR(sve_insr_b, uint8_t, H1)
2988 DO_INSR(sve_insr_h, uint16_t, H1_2)
2989 DO_INSR(sve_insr_s, uint32_t, H1_4)
2990 DO_INSR(sve_insr_d, uint64_t, H1_8)
2991
2992 #undef DO_INSR
2993
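/* REV: reverse the order of elements. Swap the 64-bit words end for
 * end, then reorder within each word as required by the element size.
 */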
2994 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2995 {
2996 intptr_t i, j, opr_sz = simd_oprsz(desc);
2997 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2998 uint64_t f = *(uint64_t *)(vn + i);
2999 uint64_t b = *(uint64_t *)(vn + j);
3000 *(uint64_t *)(vd + i) = bswap64(b);
3001 *(uint64_t *)(vd + j) = bswap64(f);
3002 }
3003 }
3004
3005 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
3006 {
3007 intptr_t i, j, opr_sz = simd_oprsz(desc);
3008 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3009 uint64_t f = *(uint64_t *)(vn + i);
3010 uint64_t b = *(uint64_t *)(vn + j);
3011 *(uint64_t *)(vd + i) = hswap64(b);
3012 *(uint64_t *)(vd + j) = hswap64(f);
3013 }
3014 }
3015
3016 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
3017 {
3018 intptr_t i, j, opr_sz = simd_oprsz(desc);
3019 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3020 uint64_t f = *(uint64_t *)(vn + i);
3021 uint64_t b = *(uint64_t *)(vn + j);
3022 *(uint64_t *)(vd + i) = rol64(b, 32);
3023 *(uint64_t *)(vd + j) = rol64(f, 32);
3024 }
3025 }
3026
3027 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
3028 {
3029 intptr_t i, j, opr_sz = simd_oprsz(desc);
3030 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3031 uint64_t f = *(uint64_t *)(vn + i);
3032 uint64_t b = *(uint64_t *)(vn + j);
3033 *(uint64_t *)(vd + i) = b;
3034 *(uint64_t *)(vd + j) = f;
3035 }
3036 }
3037
3038 /*
3039 * TODO: This could use half_shuffle64 and similar bit tricks to
3040 * expand blocks of bits at once.
3041 */
3042 #define DO_PMOV_PV(NAME, ESIZE) \
3043 void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
3044 { \
3045 unsigned vl = simd_oprsz(desc); \
3046 unsigned idx = simd_data(desc); \
3047 unsigned elements = vl / ESIZE; \
3048 ARMPredicateReg *d = vd; \
3049 ARMVectorReg *s = vs; \
3050 memset(d, 0, sizeof(*d)); \
3051 for (unsigned e = 0; e < elements; ++e) { \
3052 depositn(d->p, e * ESIZE, 1, extractn(s->d, elements * idx + e, 1)); \
3053 } \
3054 }
3055
3056 DO_PMOV_PV(pmov_pv_h, 2)
3057 DO_PMOV_PV(pmov_pv_s, 4)
3058 DO_PMOV_PV(pmov_pv_d, 8)
3059
3060 #undef DO_PMOV_PV
3061
3062 /*
3063 * TODO: This could use half_unshuffle64 and similar bit tricks to
3064 * compress blocks of bits at once.
3065 */
3066 #define DO_PMOV_VP(NAME, ESIZE) \
3067 void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
3068 { \
3069 unsigned vl = simd_oprsz(desc); \
3070 unsigned idx = simd_data(desc); \
3071 unsigned elements = vl / ESIZE; \
3072 ARMVectorReg *d = vd; \
3073 ARMPredicateReg *s = vs; \
3074 if (idx == 0) { \
3075 memset(d, 0, vl); \
3076 } \
3077 for (unsigned e = 0; e < elements; ++e) { \
3078 depositn(d->d, elements * idx + e, 1, extractn(s->p, e * ESIZE, 1)); \
3079 } \
3080 }
3081
3082 DO_PMOV_VP(pmov_vp_h, 2)
3083 DO_PMOV_VP(pmov_vp_s, 4)
3084 DO_PMOV_VP(pmov_vp_d, 8)
3085
3086 #undef DO_PMOV_VP
3087
3088 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
3089
3090 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
3091 bool is_tbx, tb_impl_fn *fn)
3092 {
3093 ARMVectorReg scratch;
3094 uintptr_t oprsz = simd_oprsz(desc);
3095
3096 if (unlikely(vd == vn)) {
3097 vn = memcpy(&scratch, vn, oprsz);
3098 }
3099
3100 fn(vd, vn, NULL, vm, oprsz, is_tbx);
3101 }
3102
3103 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3104 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3105 {
3106 ARMVectorReg scratch;
3107 uintptr_t oprsz = simd_oprsz(desc);
3108
3109 if (unlikely(vd == vn0)) {
3110 vn0 = memcpy(&scratch, vn0, oprsz);
3111 if (vd == vn1) {
3112 vn1 = vn0;
3113 }
3114 } else if (unlikely(vd == vn1)) {
3115 vn1 = memcpy(&scratch, vn1, oprsz);
3116 }
3117
3118 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3119 }
3120
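/* TBL/TBX: each index in Zm selects an element from the table
 * vector(s). Out-of-range indices yield 0 for TBL and leave the
 * destination element unchanged for TBX.
 */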
3121 #define DO_TB(SUFF, TYPE, H) \
3122 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
3123 void *vm, uintptr_t oprsz, bool is_tbx) \
3124 { \
3125 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
3126 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
3127 for (i = 0; i < nelem; ++i) { \
3128 TYPE index = indexes[H1(i)], val = 0; \
3129 if (index < nelem) { \
3130 val = tbl0[H(index)]; \
3131 } else { \
3132 index -= nelem; \
3133 if (tbl1 && index < nelem) { \
3134 val = tbl1[H(index)]; \
3135 } else if (is_tbx) { \
3136 continue; \
3137 } \
3138 } \
3139 d[H(i)] = val; \
3140 } \
3141 } \
3142 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3143 { \
3144 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3145 } \
3146 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3147 void *vm, uint32_t desc) \
3148 { \
3149 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3150 } \
3151 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3152 { \
3153 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3154 }
3155
3156 DO_TB(b, uint8_t, H1)
3157 DO_TB(h, uint16_t, H2)
3158 DO_TB(s, uint32_t, H4)
3159 DO_TB(d, uint64_t, H8)
3160
3161 #undef DO_TB
3162
3163 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3164 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3165 { \
3166 intptr_t i, opr_sz = simd_oprsz(desc); \
3167 TYPED *d = vd; \
3168 TYPES *n = vn; \
3169 ARMVectorReg tmp; \
3170 if (unlikely(vn - vd < opr_sz)) { \
3171 n = memcpy(&tmp, n, opr_sz / 2); \
3172 } \
3173 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3174 d[HD(i)] = n[HS(i)]; \
3175 } \
3176 }
3177
3178 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3179 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3180 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3181
3182 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3183 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3184 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3185
3186 #undef DO_UNPK
3187
3188 /* Mask of bits included in the even numbered predicates of width esz.
3189 * We also use this for expand_bits/compress_bits, and so extend the
3190 * same pattern out to 16-bit units.
3191 */
3192 static const uint64_t even_bit_esz_masks[5] = {
3193 0x5555555555555555ull,
3194 0x3333333333333333ull,
3195 0x0f0f0f0f0f0f0f0full,
3196 0x00ff00ff00ff00ffull,
3197 0x0000ffff0000ffffull,
3198 };
3199
3200 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3201 * For N==0, this corresponds to the operation that in qemu/bitops.h
3202 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3203 * section 7-2 Shuffling Bits.
3204 */
3205 static uint64_t expand_bits(uint64_t x, int n)
3206 {
3207 int i;
3208
3209 x &= 0xffffffffu;
3210 for (i = 4; i >= n; i--) {
3211 int sh = 1 << i;
3212 x = ((x << sh) | x) & even_bit_esz_masks[i];
3213 }
3214 return x;
3215 }
3216
3217 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3218 * For N==0, this corresponds to the operation that in qemu/bitops.h
3219 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3220 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3221 */
3222 static uint64_t compress_bits(uint64_t x, int n)
3223 {
3224 int i;
3225
3226 for (i = n; i <= 4; i++) {
3227 int sh = 1 << i;
3228 x &= even_bit_esz_masks[i];
3229 x = (x >> sh) | x;
3230 }
3231 return x & 0xffffffffu;
3232 }
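/*
 * Illustrative sketch, not wired into any helper above: a minimal
 * self-check of the expand/compress pair for the N==0 case.  expand_bits
 * spreads each source bit to every other destination bit (interleaving
 * with zeros) and compress_bits undoes it, e.g. expand_bits(0xf, 0) is
 * 0x55.  It would need to be called from a test to avoid an
 * unused-function warning.
 */
static void check_expand_compress_bits(void)
{
    uint64_t x = 0xf;
    uint64_t e = expand_bits(x, 0);

    tcg_debug_assert(e == 0x55);
    tcg_debug_assert(compress_bits(e, 0) == x);
}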
3233
3234 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3235 {
3236 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3237 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3238 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3239 int esize = 1 << esz;
3240 uint64_t *d = vd;
3241 intptr_t i;
3242
3243 if (oprsz <= 8) {
3244 uint64_t nn = *(uint64_t *)vn;
3245 uint64_t mm = *(uint64_t *)vm;
3246 int half = 4 * oprsz;
3247
3248 nn = extract64(nn, high * half, half);
3249 mm = extract64(mm, high * half, half);
3250 nn = expand_bits(nn, esz);
3251 mm = expand_bits(mm, esz);
3252 d[0] = nn | (mm << esize);
3253 } else {
3254 ARMPredicateReg tmp;
3255
3256 /* We produce output faster than we consume input.
3257 Therefore we must be mindful of possible overlap. */
3258 if (vd == vn) {
3259 vn = memcpy(&tmp, vn, oprsz);
3260 if (vd == vm) {
3261 vm = vn;
3262 }
3263 } else if (vd == vm) {
3264 vm = memcpy(&tmp, vm, oprsz);
3265 }
3266 if (high) {
3267 high = oprsz >> 1;
3268 }
3269
3270 if ((oprsz & 7) == 0) {
3271 uint32_t *n = vn, *m = vm;
3272 high >>= 2;
3273
3274 for (i = 0; i < oprsz / 8; i++) {
3275 uint64_t nn = n[H4(high + i)];
3276 uint64_t mm = m[H4(high + i)];
3277
3278 nn = expand_bits(nn, esz);
3279 mm = expand_bits(mm, esz);
3280 d[i] = nn | (mm << esize);
3281 }
3282 } else {
3283 uint8_t *n = vn, *m = vm;
3284 uint16_t *d16 = vd;
3285
3286 for (i = 0; i < oprsz / 2; i++) {
3287 uint16_t nn = n[H1(high + i)];
3288 uint16_t mm = m[H1(high + i)];
3289
3290 nn = expand_bits(nn, esz);
3291 mm = expand_bits(mm, esz);
3292 d16[H2(i)] = nn | (mm << esize);
3293 }
3294 }
3295 }
3296 }
3297
3298 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3299 {
3300 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3301 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3302 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3303 uint64_t *d = vd, *n = vn, *m = vm;
3304 uint64_t l, h;
3305 intptr_t i;
3306
3307 if (oprsz <= 8) {
3308 l = compress_bits(n[0] >> odd, esz);
3309 h = compress_bits(m[0] >> odd, esz);
3310 d[0] = l | (h << (4 * oprsz));
3311 } else {
3312 ARMPredicateReg tmp_m;
3313 intptr_t oprsz_16 = oprsz / 16;
3314
3315 if ((vm - vd) < (uintptr_t)oprsz) {
3316 m = memcpy(&tmp_m, vm, oprsz);
3317 }
3318
3319 for (i = 0; i < oprsz_16; i++) {
3320 l = n[2 * i + 0];
3321 h = n[2 * i + 1];
3322 l = compress_bits(l >> odd, esz);
3323 h = compress_bits(h >> odd, esz);
3324 d[i] = l | (h << 32);
3325 }
3326
3327 /*
3328 * For VL which is not a multiple of 512, the results from M do not
3329 * align nicely with the uint64_t for D. Put the aligned results
3330 * from M into TMP_M and then copy it into place afterward.
3331 */
3332 if (oprsz & 15) {
3333 int final_shift = (oprsz & 15) * 2;
3334
3335 l = n[2 * i + 0];
3336 h = n[2 * i + 1];
3337 l = compress_bits(l >> odd, esz);
3338 h = compress_bits(h >> odd, esz);
3339 d[i] = l | (h << final_shift);
3340
3341 for (i = 0; i < oprsz_16; i++) {
3342 l = m[2 * i + 0];
3343 h = m[2 * i + 1];
3344 l = compress_bits(l >> odd, esz);
3345 h = compress_bits(h >> odd, esz);
3346 tmp_m.p[i] = l | (h << 32);
3347 }
3348 l = m[2 * i + 0];
3349 h = m[2 * i + 1];
3350 l = compress_bits(l >> odd, esz);
3351 h = compress_bits(h >> odd, esz);
3352 tmp_m.p[i] = l | (h << final_shift);
3353
3354 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3355 } else {
3356 for (i = 0; i < oprsz_16; i++) {
3357 l = m[2 * i + 0];
3358 h = m[2 * i + 1];
3359 l = compress_bits(l >> odd, esz);
3360 h = compress_bits(h >> odd, esz);
3361 d[oprsz_16 + i] = l | (h << 32);
3362 }
3363 }
3364 }
3365 }
3366
3367 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3368 {
3369 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3370 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3371 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3372 uint64_t *d = vd, *n = vn, *m = vm;
3373 uint64_t mask;
3374 int shr, shl;
3375 intptr_t i;
3376
3377 shl = 1 << esz;
3378 shr = 0;
3379 mask = even_bit_esz_masks[esz];
3380 if (odd) {
3381 mask <<= shl;
3382 shr = shl;
3383 shl = 0;
3384 }
3385
3386 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3387 uint64_t nn = (n[i] & mask) >> shr;
3388 uint64_t mm = (m[i] & mask) << shl;
3389 d[i] = nn + mm;
3390 }
3391 }
3392
3393 /* Reverse units of 2**N bits. */
3394 static uint64_t reverse_bits_64(uint64_t x, int n)
3395 {
3396 int i, sh;
3397
3398 x = bswap64(x);
3399 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3400 uint64_t mask = even_bit_esz_masks[i];
3401 x = ((x & mask) << sh) | ((x >> sh) & mask);
3402 }
3403 return x;
3404 }
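/*
 * Illustrative sketch, not used by the helpers here: reverse_bits_64
 * reverses the order of the 2**N-bit units of X while keeping the bits
 * within each unit in place, so N==0 is a full bit reversal and N==3
 * degenerates to a plain byte swap.
 */
static void check_reverse_bits_64(void)
{
    tcg_debug_assert(reverse_bits_64(1, 0) == 1ull << 63);
    tcg_debug_assert(reverse_bits_64(0x0123456789abcdefull, 3)
                     == 0xefcdab8967452301ull);
}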
3405
3406 static uint8_t reverse_bits_8(uint8_t x, int n)
3407 {
3408 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3409 int i, sh;
3410
3411 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3412 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3413 }
3414 return x;
3415 }
3416
3417 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3418 {
3419 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3420 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3421 intptr_t i, oprsz_2 = oprsz / 2;
3422
3423 if (oprsz <= 8) {
3424 uint64_t l = *(uint64_t *)vn;
3425 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3426 *(uint64_t *)vd = l;
3427 } else if ((oprsz & 15) == 0) {
3428 for (i = 0; i < oprsz_2; i += 8) {
3429 intptr_t ih = oprsz - 8 - i;
3430 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3431 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3432 *(uint64_t *)(vd + i) = h;
3433 *(uint64_t *)(vd + ih) = l;
3434 }
3435 } else {
3436 for (i = 0; i < oprsz_2; i += 1) {
3437 intptr_t il = H1(i);
3438 intptr_t ih = H1(oprsz - 1 - i);
3439 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3440 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3441 *(uint8_t *)(vd + il) = h;
3442 *(uint8_t *)(vd + ih) = l;
3443 }
3444 }
3445 }
3446
3447 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3448 {
3449 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3450 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3451 uint64_t *d = vd;
3452 intptr_t i;
3453
3454 if (oprsz <= 8) {
3455 uint64_t nn = *(uint64_t *)vn;
3456 int half = 4 * oprsz;
3457
3458 nn = extract64(nn, high * half, half);
3459 nn = expand_bits(nn, 0);
3460 d[0] = nn;
3461 } else {
3462 ARMPredicateReg tmp_n;
3463
3464 /* We produce output faster than we consume input.
3465 Therefore we must be mindful of possible overlap. */
3466 if ((vn - vd) < (uintptr_t)oprsz) {
3467 vn = memcpy(&tmp_n, vn, oprsz);
3468 }
3469 if (high) {
3470 high = oprsz >> 1;
3471 }
3472
3473 if ((oprsz & 7) == 0) {
3474 uint32_t *n = vn;
3475 high >>= 2;
3476
3477 for (i = 0; i < oprsz / 8; i++) {
3478 uint64_t nn = n[H4(high + i)];
3479 d[i] = expand_bits(nn, 0);
3480 }
3481 } else {
3482 uint16_t *d16 = vd;
3483 uint8_t *n = vn;
3484
3485 for (i = 0; i < oprsz / 2; i++) {
3486 uint16_t nn = n[H1(high + i)];
3487 d16[H2(i)] = expand_bits(nn, 0);
3488 }
3489 }
3490 }
3491 }
3492
3493 #define DO_ZIP(NAME, TYPE, H) \
3494 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3495 { \
3496 intptr_t oprsz = simd_oprsz(desc); \
3497 intptr_t odd_ofs = simd_data(desc); \
3498 intptr_t i, oprsz_2 = oprsz / 2; \
3499 ARMVectorReg tmp_n, tmp_m; \
3500 /* We produce output faster than we consume input. \
3501 Therefore we must be mindful of possible overlap. */ \
3502 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3503 vn = memcpy(&tmp_n, vn, oprsz); \
3504 } \
3505 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3506 vm = memcpy(&tmp_m, vm, oprsz); \
3507 } \
3508 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3509 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3510 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
3511 *(TYPE *)(vm + odd_ofs + H(i)); \
3512 } \
3513 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3514 memset(vd + oprsz - 16, 0, 16); \
3515 } \
3516 }
3517
3518 DO_ZIP(sve_zip_b, uint8_t, H1)
3519 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3520 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3521 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3522 DO_ZIP(sve2_zip_q, Int128, )
3523
3524 #define DO_UZP(NAME, TYPE, H) \
3525 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3526 { \
3527 intptr_t oprsz = simd_oprsz(desc); \
3528 intptr_t odd_ofs = simd_data(desc); \
3529 intptr_t i, p; \
3530 ARMVectorReg tmp_m; \
3531 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3532 vm = memcpy(&tmp_m, vm, oprsz); \
3533 } \
3534 i = 0, p = odd_ofs; \
3535 do { \
3536 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
3537 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3538 } while (p < oprsz); \
3539 p -= oprsz; \
3540 do { \
3541 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
3542 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3543 } while (p < oprsz); \
3544 tcg_debug_assert(i == oprsz); \
3545 }
3546
3547 DO_UZP(sve_uzp_b, uint8_t, H1)
3548 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3549 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3550 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3551 DO_UZP(sve2_uzp_q, Int128, )
3552
3553 typedef void perseg_zzz_fn(void *vd, void *vn, void *vm, uint32_t desc);
3554
3555 static void do_perseg_zzz(void *vd, void *vn, void *vm,
3556 uint32_t desc, perseg_zzz_fn *fn)
3557 {
3558 intptr_t oprsz = simd_oprsz(desc);
3559
3560 desc = simd_desc(16, 16, simd_data(desc));
3561 for (intptr_t i = 0; i < oprsz; i += 16) {
3562 fn(vd + i, vn + i, vm + i, desc);
3563 }
3564 }
3565
3566 #define DO_PERSEG_ZZZ(NAME, FUNC) \
3567 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3568 { do_perseg_zzz(vd, vn, vm, desc, FUNC); }
3569
3570 DO_PERSEG_ZZZ(sve2p1_uzpq_b, helper_sve_uzp_b)
3571 DO_PERSEG_ZZZ(sve2p1_uzpq_h, helper_sve_uzp_h)
3572 DO_PERSEG_ZZZ(sve2p1_uzpq_s, helper_sve_uzp_s)
3573 DO_PERSEG_ZZZ(sve2p1_uzpq_d, helper_sve_uzp_d)
3574
3575 DO_PERSEG_ZZZ(sve2p1_zipq_b, helper_sve_zip_b)
3576 DO_PERSEG_ZZZ(sve2p1_zipq_h, helper_sve_zip_h)
3577 DO_PERSEG_ZZZ(sve2p1_zipq_s, helper_sve_zip_s)
3578 DO_PERSEG_ZZZ(sve2p1_zipq_d, helper_sve_zip_d)
3579
3580 DO_PERSEG_ZZZ(sve2p1_tblq_b, helper_sve_tbl_b)
3581 DO_PERSEG_ZZZ(sve2p1_tblq_h, helper_sve_tbl_h)
3582 DO_PERSEG_ZZZ(sve2p1_tblq_s, helper_sve_tbl_s)
3583 DO_PERSEG_ZZZ(sve2p1_tblq_d, helper_sve_tbl_d)
3584
3585 DO_PERSEG_ZZZ(sve2p1_tbxq_b, helper_sve2_tbx_b)
3586 DO_PERSEG_ZZZ(sve2p1_tbxq_h, helper_sve2_tbx_h)
3587 DO_PERSEG_ZZZ(sve2p1_tbxq_s, helper_sve2_tbx_s)
3588 DO_PERSEG_ZZZ(sve2p1_tbxq_d, helper_sve2_tbx_d)
3589
3590 #undef DO_PERSEG_ZZZ
3591
3592 #define DO_TRN(NAME, TYPE, H) \
3593 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3594 { \
3595 intptr_t oprsz = simd_oprsz(desc); \
3596 intptr_t odd_ofs = simd_data(desc); \
3597 intptr_t i; \
3598 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3599 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3600 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3601 *(TYPE *)(vd + H(i + 0)) = ae; \
3602 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3603 } \
3604 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3605 memset(vd + oprsz - 16, 0, 16); \
3606 } \
3607 }
3608
3609 DO_TRN(sve_trn_b, uint8_t, H1)
3610 DO_TRN(sve_trn_h, uint16_t, H1_2)
3611 DO_TRN(sve_trn_s, uint32_t, H1_4)
3612 DO_TRN(sve_trn_d, uint64_t, H1_8)
3613 DO_TRN(sve2_trn_q, Int128, )
3614
3615 #undef DO_ZIP
3616 #undef DO_UZP
3617 #undef DO_TRN
3618
3619 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3620 {
3621 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3622 uint32_t *d = vd, *n = vn;
3623 uint8_t *pg = vg;
3624
3625 for (i = j = 0; i < opr_sz; i++) {
3626 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3627 d[H4(j)] = n[H4(i)];
3628 j++;
3629 }
3630 }
3631 for (; j < opr_sz; j++) {
3632 d[H4(j)] = 0;
3633 }
3634 }
3635
3636 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3637 {
3638 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3639 uint64_t *d = vd, *n = vn;
3640 uint8_t *pg = vg;
3641
3642 for (i = j = 0; i < opr_sz; i++) {
3643 if (pg[H1(i)] & 1) {
3644 d[j] = n[i];
3645 j++;
3646 }
3647 }
3648 for (; j < opr_sz; j++) {
3649 d[j] = 0;
3650 }
3651 }
3652
3653 /* Similar to the ARM LastActiveElement pseudocode function, except the
3654 * result is multiplied by the element size. This includes the not found
3655 * indication; e.g. not found for esz=3 is -8.
3656 */
3657 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3658 {
3659 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3660 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3661
3662 return last_active_element(vg, words, esz);
3663 }
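/*
 * Illustrative sketch of the semantics described above, restricted to a
 * single 64-bit predicate word that has already been masked to the
 * vector length; the real last_active_element, defined elsewhere, walks
 * all words.  Unused here, for illustration only.
 */
static int32_t last_active_element_1word(uint64_t g, int esz)
{
    uint64_t bits = g & pred_esz_masks[esz];

    /* The bit index of an element's predicate bit equals its byte offset. */
    return bits ? 63 - clz64(bits) : -(1 << esz);
}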
3664
3665 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3666 {
3667 intptr_t opr_sz = simd_oprsz(desc) / 8;
3668 int esz = simd_data(desc);
3669 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3670 intptr_t i, first_i, last_i;
3671 ARMVectorReg tmp;
3672
3673 first_i = last_i = 0;
3674 first_g = last_g = 0;
3675
3676 /* Find the extent of the active elements within VG. */
3677 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3678 pg = *(uint64_t *)(vg + i) & mask;
3679 if (pg) {
3680 if (last_g == 0) {
3681 last_g = pg;
3682 last_i = i;
3683 }
3684 first_g = pg;
3685 first_i = i;
3686 }
3687 }
3688
3689 len = 0;
3690 if (first_g != 0) {
3691 first_i = first_i * 8 + ctz64(first_g);
3692 last_i = last_i * 8 + 63 - clz64(last_g);
3693 len = last_i - first_i + (1 << esz);
3694 if (vd == vm) {
3695 vm = memcpy(&tmp, vm, opr_sz * 8);
3696 }
3697 swap_memmove(vd, vn + first_i, len);
3698 }
3699 swap_memmove(vd + len, vm, opr_sz * 8 - len);
3700 }
3701
3702 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3703 void *vg, uint32_t desc)
3704 {
3705 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3706 uint64_t *d = vd, *n = vn, *m = vm;
3707 uint8_t *pg = vg;
3708
3709 for (i = 0; i < opr_sz; i += 1) {
3710 uint64_t nn = n[i], mm = m[i];
3711 uint64_t pp = expand_pred_b(pg[H1(i)]);
3712 d[i] = (nn & pp) | (mm & ~pp);
3713 }
3714 }
3715
3716 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3717 void *vg, uint32_t desc)
3718 {
3719 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3720 uint64_t *d = vd, *n = vn, *m = vm;
3721 uint8_t *pg = vg;
3722
3723 for (i = 0; i < opr_sz; i += 1) {
3724 uint64_t nn = n[i], mm = m[i];
3725 uint64_t pp = expand_pred_h(pg[H1(i)]);
3726 d[i] = (nn & pp) | (mm & ~pp);
3727 }
3728 }
3729
3730 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3731 void *vg, uint32_t desc)
3732 {
3733 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3734 uint64_t *d = vd, *n = vn, *m = vm;
3735 uint8_t *pg = vg;
3736
3737 for (i = 0; i < opr_sz; i += 1) {
3738 uint64_t nn = n[i], mm = m[i];
3739 uint64_t pp = expand_pred_s(pg[H1(i)]);
3740 d[i] = (nn & pp) | (mm & ~pp);
3741 }
3742 }
3743
3744 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3745 void *vg, uint32_t desc)
3746 {
3747 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3748 uint64_t *d = vd, *n = vn, *m = vm;
3749 uint8_t *pg = vg;
3750
3751 for (i = 0; i < opr_sz; i += 1) {
3752 uint64_t nn = n[i], mm = m[i];
3753 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3754 }
3755 }
3756
3757 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3758 void *vg, uint32_t desc)
3759 {
3760 intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3761 Int128 *d = vd, *n = vn, *m = vm;
3762 uint16_t *pg = vg;
3763
3764 for (i = 0; i < opr_sz; i += 1) {
3765 d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3766 }
3767 }
3768
3769 /* Two operand comparison controlled by a predicate.
3770 * ??? It is very tempting to want to be able to expand this inline
3771 * with x86 instructions, e.g.
3772 *
3773 * vcmpeqw zm, zn, %ymm0
3774 * vpmovmskb %ymm0, %eax
3775 * and $0x5555, %eax
3776 * and pg, %eax
3777 *
3778 * or even aarch64, e.g.
3779 *
3780 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3781 * cmeq v0.8h, zn, zm
3782 * and v0.8h, v0.8h, mask
3783 * addv h0, v0.8h
3784 * and v0.8b, pg
3785 *
3786 * However, coming up with an abstraction that allows vector inputs and
3787 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3788 * scalar outputs, is tricky.
3789 */
3790 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3791 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3792 { \
3793 intptr_t opr_sz = simd_oprsz(desc); \
3794 uint32_t flags = PREDTEST_INIT; \
3795 intptr_t i = opr_sz; \
3796 do { \
3797 uint64_t out = 0, pg; \
3798 do { \
3799 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3800 TYPE nn = *(TYPE *)(vn + H(i)); \
3801 TYPE mm = *(TYPE *)(vm + H(i)); \
3802 out |= nn OP mm; \
3803 } while (i & 63); \
3804 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3805 out &= pg; \
3806 *(uint64_t *)(vd + (i >> 3)) = out; \
3807 flags = iter_predtest_bwd(out, pg, flags); \
3808 } while (i > 0); \
3809 return flags; \
3810 }
3811
3812 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3813 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3814 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3815 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3816 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3817 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3818 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3819 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3820
3821 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3822 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3823 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3824 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3825
3826 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3827 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3828 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3829 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3830
3831 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3832 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3833 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3834 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3835
3836 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3837 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3838 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3839 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3840
3841 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3842 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3843 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3844 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3845
3846 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3847 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3848 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3849 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3850
3851 #undef DO_CMP_PPZZ_B
3852 #undef DO_CMP_PPZZ_H
3853 #undef DO_CMP_PPZZ_S
3854 #undef DO_CMP_PPZZ_D
3855 #undef DO_CMP_PPZZ
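/*
 * Illustrative sketch, not used by the helpers here: one predicate bit
 * corresponds to one vector byte, so an element of ESIZE bytes owns the
 * bit at (element index) * ESIZE and the other ESIZE - 1 bits stay
 * clear.  Rebuilding that pattern reproduces the MASK constants passed
 * to the DO_CMP_PPZZ_[BHSD] wrappers above, e.g.
 * cmp_ppzz_result_mask(2) == 0x5555555555555555ull.
 */
static uint64_t cmp_ppzz_result_mask(unsigned esize)
{
    uint64_t mask = 0;

    for (unsigned bit = 0; bit < 64; bit += esize) {
        mask |= 1ull << bit;
    }
    return mask;
}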
3856
3857 /* Similar, but the second source is "wide". */
3858 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3859 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3860 { \
3861 intptr_t opr_sz = simd_oprsz(desc); \
3862 uint32_t flags = PREDTEST_INIT; \
3863 intptr_t i = opr_sz; \
3864 do { \
3865 uint64_t out = 0, pg; \
3866 do { \
3867 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3868 do { \
3869 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3870 TYPE nn = *(TYPE *)(vn + H(i)); \
3871 out |= nn OP mm; \
3872 } while (i & 7); \
3873 } while (i & 63); \
3874 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3875 out &= pg; \
3876 *(uint64_t *)(vd + (i >> 3)) = out; \
3877 flags = iter_predtest_bwd(out, pg, flags); \
3878 } while (i > 0); \
3879 return flags; \
3880 }
3881
3882 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3883 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3884 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3885 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3886 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3887 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3888
3889 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3890 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3891 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3892
3893 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3894 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3895 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3896
3897 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3898 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3899 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3900
3901 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3902 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3903 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3904
3905 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3906 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3907 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3908
3909 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3910 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3911 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3912
3913 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3914 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3915 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3916
3917 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3918 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3919 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3920
3921 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3922 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3923 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3924
3925 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3926 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3927 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3928
3929 #undef DO_CMP_PPZW_B
3930 #undef DO_CMP_PPZW_H
3931 #undef DO_CMP_PPZW_S
3932 #undef DO_CMP_PPZW
3933
3934 /* Similar, but the second source is immediate. */
3935 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3936 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3937 { \
3938 intptr_t opr_sz = simd_oprsz(desc); \
3939 uint32_t flags = PREDTEST_INIT; \
3940 TYPE mm = simd_data(desc); \
3941 intptr_t i = opr_sz; \
3942 do { \
3943 uint64_t out = 0, pg; \
3944 do { \
3945 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3946 TYPE nn = *(TYPE *)(vn + H(i)); \
3947 out |= nn OP mm; \
3948 } while (i & 63); \
3949 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3950 out &= pg; \
3951 *(uint64_t *)(vd + (i >> 3)) = out; \
3952 flags = iter_predtest_bwd(out, pg, flags); \
3953 } while (i > 0); \
3954 return flags; \
3955 }
3956
3957 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3958 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3959 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3960 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3961 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3962 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3963 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3964 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3965
3966 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3967 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3968 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3969 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3970
3971 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3972 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3973 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3974 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3975
3976 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3977 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3978 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3979 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3980
3981 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3982 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3983 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3984 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3985
3986 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3987 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3988 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3989 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3990
3991 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3992 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3993 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3994 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3995
3996 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3997 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3998 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3999 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
4000
4001 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
4002 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
4003 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
4004 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
4005
4006 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
4007 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
4008 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
4009 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
4010
4011 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
4012 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
4013 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
4014 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
4015
4016 #undef DO_CMP_PPZI_B
4017 #undef DO_CMP_PPZI_H
4018 #undef DO_CMP_PPZI_S
4019 #undef DO_CMP_PPZI_D
4020 #undef DO_CMP_PPZI
4021
4022 /* Similar to the ARM LastActive pseudocode function. */
4023 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
4024 {
4025 intptr_t i;
4026
4027 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
4028 uint64_t pg = *(uint64_t *)(vg + i);
4029 if (pg) {
4030 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
4031 }
4032 }
4033 return 0;
4034 }
4035
4036 /* Compute a mask into RETB that is true for all G, up to and including
4037 * (if after) or excluding (if !after) the first G & N.
4038 * Return true if BRK found.
4039 */
4040 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
4041 bool brk, bool after)
4042 {
4043 uint64_t b;
4044
4045 if (brk) {
4046 b = 0;
4047 } else if ((g & n) == 0) {
4048 /* For all G, no N are set; break not found. */
4049 b = g;
4050 } else {
4051 /* Break somewhere in N. Locate it. */
4052 b = g & n; /* guard true, pred true */
4053 b = b & -b; /* first such */
4054 if (after) {
4055 b = b | (b - 1); /* break after same */
4056 } else {
4057 b = b - 1; /* break before same */
4058 }
4059 brk = true;
4060 }
4061
4062 *retb = b;
4063 return brk;
4064 }
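/*
 * Illustrative check of compute_brk, not wired into any helper: with
 * guard g = 0xff and the first true predicate bit of n at bit 4,
 * "break after" keeps bits 0..4 and "break before" keeps bits 0..3;
 * once a break has been found, every later word yields 0.
 */
static void check_compute_brk(void)
{
    uint64_t b;

    tcg_debug_assert(compute_brk(&b, 0x10, 0xff, false, true) && b == 0x1f);
    tcg_debug_assert(compute_brk(&b, 0x10, 0xff, false, false) && b == 0x0f);
    tcg_debug_assert(compute_brk(&b, 0, 0xff, true, true) && b == 0);
}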
4065
4066 /* Compute a zeroing BRK. */
4067 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
4068 intptr_t oprsz, bool after)
4069 {
4070 bool brk = false;
4071 intptr_t i;
4072
4073 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4074 uint64_t this_b, this_g = g[i];
4075
4076 brk = compute_brk(&this_b, n[i], this_g, brk, after);
4077 d[i] = this_b & this_g;
4078 }
4079 }
4080
4081 /* Likewise, but also compute flags. */
4082 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
4083 intptr_t oprsz, bool after)
4084 {
4085 uint32_t flags = PREDTEST_INIT;
4086 bool brk = false;
4087 intptr_t i;
4088
4089 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4090 uint64_t this_b, this_d, this_g = g[i];
4091
4092 brk = compute_brk(&this_b, n[i], this_g, brk, after);
4093 d[i] = this_d = this_b & this_g;
4094 flags = iter_predtest_fwd(this_d, this_g, flags);
4095 }
4096 return flags;
4097 }
4098
4099 /* Compute a merging BRK. */
4100 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
4101 intptr_t oprsz, bool after)
4102 {
4103 bool brk = false;
4104 intptr_t i;
4105
4106 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4107 uint64_t this_b, this_g = g[i];
4108
4109 brk = compute_brk(&this_b, n[i], this_g, brk, after);
4110 d[i] = (this_b & this_g) | (d[i] & ~this_g);
4111 }
4112 }
4113
4114 /* Likewise, but also compute flags. */
4115 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
4116 intptr_t oprsz, bool after)
4117 {
4118 uint32_t flags = PREDTEST_INIT;
4119 bool brk = false;
4120 intptr_t i;
4121
4122 for (i = 0; i < oprsz / 8; ++i) {
4123 uint64_t this_b, this_d = d[i], this_g = g[i];
4124
4125 brk = compute_brk(&this_b, n[i], this_g, brk, after);
4126 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
4127 flags = iter_predtest_fwd(this_d, this_g, flags);
4128 }
4129 return flags;
4130 }
4131
4132 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4133 uint32_t pred_desc)
4134 {
4135 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4136 if (last_active_pred(vn, vg, oprsz)) {
4137 compute_brk_z(vd, vm, vg, oprsz, true);
4138 } else {
4139 memset(vd, 0, sizeof(ARMPredicateReg));
4140 }
4141 }
4142
4143 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4144 uint32_t pred_desc)
4145 {
4146 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4147 if (last_active_pred(vn, vg, oprsz)) {
4148 return compute_brks_z(vd, vm, vg, oprsz, true);
4149 } else {
4150 memset(vd, 0, sizeof(ARMPredicateReg));
4151 return PREDTEST_INIT;
4152 }
4153 }
4154
4155 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4156 uint32_t pred_desc)
4157 {
4158 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4159 if (last_active_pred(vn, vg, oprsz)) {
4160 compute_brk_z(vd, vm, vg, oprsz, false);
4161 } else {
4162 memset(vd, 0, sizeof(ARMPredicateReg));
4163 }
4164 }
4165
4166 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4167 uint32_t pred_desc)
4168 {
4169 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4170 if (last_active_pred(vn, vg, oprsz)) {
4171 return compute_brks_z(vd, vm, vg, oprsz, false);
4172 } else {
4173 memset(vd, 0, sizeof(ARMPredicateReg));
4174 return PREDTEST_INIT;
4175 }
4176 }
4177
4178 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4179 {
4180 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4181 compute_brk_z(vd, vn, vg, oprsz, true);
4182 }
4183
4184 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4185 {
4186 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4187 return compute_brks_z(vd, vn, vg, oprsz, true);
4188 }
4189
4190 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4191 {
4192 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4193 compute_brk_z(vd, vn, vg, oprsz, false);
4194 }
4195
4196 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4197 {
4198 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4199 return compute_brks_z(vd, vn, vg, oprsz, false);
4200 }
4201
4202 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4203 {
4204 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4205 compute_brk_m(vd, vn, vg, oprsz, true);
4206 }
4207
4208 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4209 {
4210 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4211 return compute_brks_m(vd, vn, vg, oprsz, true);
4212 }
4213
4214 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4215 {
4216 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4217 compute_brk_m(vd, vn, vg, oprsz, false);
4218 }
4219
4220 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4221 {
4222 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4223 return compute_brks_m(vd, vn, vg, oprsz, false);
4224 }
4225
4226 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4227 {
4228 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4229 if (!last_active_pred(vn, vg, oprsz)) {
4230 memset(vd, 0, sizeof(ARMPredicateReg));
4231 }
4232 }
4233
4234 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4235 {
4236 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4237 if (last_active_pred(vn, vg, oprsz)) {
4238 ARMPredicateReg *d = vd;
4239 uint32_t flags = PREDTEST_INIT;
4240 intptr_t i;
4241
4242 /* As if PredTest(Ones(PL), D, MO_8). */
4243 for (i = 0; i < oprsz / 8; i++) {
4244 flags = iter_predtest_fwd(d->p[i], -1, flags);
4245 }
4246 if (oprsz & 7) {
4247 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4248 flags = iter_predtest_fwd(d->p[i], mask, flags);
4249 }
4250 return flags;
4251 }
4252 memset(vd, 0, sizeof(ARMPredicateReg));
4253 return PREDTEST_INIT;
4254 }
4255
4256 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4257 {
4258 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4259 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4260 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4261 intptr_t i;
4262
4263 for (i = 0; i < words; ++i) {
4264 uint64_t t = n[i] & g[i] & mask;
4265 sum += ctpop64(t);
4266 }
4267 return sum;
4268 }
4269
4270 uint64_t HELPER(sve2p1_cntp_c)(uint32_t png, uint32_t desc)
4271 {
4272 int pl = FIELD_EX32(desc, PREDDESC, OPRSZ);
4273 int vl = pl * 8;
4274 unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ);
4275 int lg2_width = FIELD_EX32(desc, PREDDESC, DATA) + 1;
4276 DecodeCounter p = decode_counter(png, vl, v_esz);
4277 unsigned maxelem = (vl << lg2_width) >> v_esz;
4278 unsigned count = p.count;
4279
4280 if (p.invert) {
4281 if (count >= maxelem) {
4282 return 0;
4283 }
4284 count = maxelem - count;
4285 } else {
4286 count = MIN(count, maxelem);
4287 }
4288 return count >> p.lg2_stride;
4289 }
4290
4291 /* C.f. Arm pseudocode EncodePredCount */
4292 static uint64_t encode_pred_count(uint32_t elements, uint32_t count,
4293 uint32_t esz, bool invert)
4294 {
4295 uint32_t pred;
4296
4297 if (count == 0) {
4298 return 0;
4299 }
4300 if (invert) {
4301 count = elements - count;
4302 } else if (count == elements) {
4303 count = 0;
4304 invert = true;
4305 }
4306
4307 pred = (count << 1) | 1;
4308 pred <<= esz;
4309 pred |= invert << 15;
4310
4311 return pred;
4312 }
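/*
 * Illustrative check of the predicate-as-counter encoding above, not
 * used elsewhere: for halfword elements (esz = 1), 5 active elements
 * out of 32 encode as ((5 << 1) | 1) << 1 = 0x16, while the all-active
 * case folds to the inverted zero-count form with bit 15 set.
 */
static void check_encode_pred_count(void)
{
    tcg_debug_assert(encode_pred_count(32, 0, 1, false) == 0);
    tcg_debug_assert(encode_pred_count(32, 5, 1, false) == 0x16);
    tcg_debug_assert(encode_pred_count(32, 32, 1, false) == 0x8002);
}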
4313
4314 /* C.f. Arm pseudocode PredCountTest */
4315 static uint32_t pred_count_test(uint32_t elements, uint32_t count, bool invert)
4316 {
4317 uint32_t flags;
4318
4319 if (count == 0) {
4320 flags = 1; /* !N, Z, C */
4321 } else if (!invert) {
4322 flags = (1u << 31) | 2; /* N, !Z */
4323 flags |= count != elements; /* C */
4324 } else {
4325 flags = 2; /* !Z, !C */
4326 flags |= (count == elements) << 31; /* N */
4327 }
4328 return flags;
4329 }
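/*
 * Illustrative check, not used elsewhere: pred_count_test follows the
 * PredTest NZCV convention used throughout this file, so an empty
 * result reports !N, Z, C and a partial result reports N, !Z, C.
 */
static void check_pred_count_test(void)
{
    tcg_debug_assert(pred_count_test(32, 0, false) == 1);
    tcg_debug_assert(pred_count_test(32, 5, false) == ((1u << 31) | 2 | 1));
    tcg_debug_assert(pred_count_test(32, 32, false) == ((1u << 31) | 2));
}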
4330
4331 /* D must be cleared on entry. */
4332 static void do_whilel(ARMPredicateReg *d, uint64_t esz_mask,
4333 uint32_t count, uint32_t oprbits)
4334 {
4335 tcg_debug_assert(count <= oprbits);
4336 if (count) {
4337 uint32_t i;
4338
4339 /* Set all of the requested bits. */
4340 for (i = 0; i < count / 64; ++i) {
4341 d->p[i] = esz_mask;
4342 }
4343 if (count & 63) {
4344 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4345 }
4346 }
4347 }
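/*
 * Illustrative check, not used elsewhere: the count passed to do_whilel
 * is already scaled to bytes by the callers, so three active halfword
 * elements set the low 6 predicate bits and the element mask keeps one
 * bit per element, giving 0b010101.
 */
static void check_do_whilel(void)
{
    ARMPredicateReg p = { };    /* D must be cleared on entry. */

    do_whilel(&p, pred_esz_masks[1], 3 * 2, 64);
    tcg_debug_assert(p.p[0] == 0x15);
}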
4348
4349 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4350 {
4351 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4352 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4353 uint32_t oprbits = oprsz * 8;
4354 uint64_t esz_mask = pred_esz_masks[esz];
4355 ARMPredicateReg *d = vd;
4356
4357 count <<= esz;
4358 memset(d, 0, sizeof(*d));
4359 do_whilel(d, esz_mask, count, oprbits);
4360 return pred_count_test(oprbits, count, false);
4361 }
4362
4363 uint32_t HELPER(sve_while2l)(void *vd, uint32_t count, uint32_t pred_desc)
4364 {
4365 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4366 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4367 uint32_t oprbits = oprsz * 8;
4368 uint64_t esz_mask = pred_esz_masks[esz];
4369 ARMPredicateReg *d = vd;
4370
4371 count <<= esz;
4372 memset(d, 0, 2 * sizeof(*d));
4373 if (count <= oprbits) {
4374 do_whilel(&d[0], esz_mask, count, oprbits);
4375 } else {
4376 do_whilel(&d[0], esz_mask, oprbits, oprbits);
4377 do_whilel(&d[1], esz_mask, count - oprbits, oprbits);
4378 }
4379
4380 return pred_count_test(2 * oprbits, count, false);
4381 }
4382
4383 uint32_t HELPER(sve_whilecl)(void *vd, uint32_t count, uint32_t pred_desc)
4384 {
4385 uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4386 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4387 uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA);
4388 uint32_t vl = pl * 8;
4389 uint32_t elements = (vl >> esz) << scale;
4390 ARMPredicateReg *d = vd;
4391
4392 *d = (ARMPredicateReg) {
4393 .p[0] = encode_pred_count(elements, count, esz, false)
4394 };
4395 return pred_count_test(elements, count, false);
4396 }
4397
4398 /* D must be cleared on entry. */
4399 static void do_whileg(ARMPredicateReg *d, uint64_t esz_mask,
4400 uint32_t count, uint32_t oprbits)
4401 {
4402 tcg_debug_assert(count <= oprbits);
4403 if (count) {
4404 uint32_t i, invcount = oprbits - count;
4405 uint64_t bits = esz_mask & MAKE_64BIT_MASK(invcount & 63, 64);
4406
4407 for (i = invcount / 64; i < oprbits / 64; ++i) {
4408 d->p[i] = bits;
4409 bits = esz_mask;
4410 }
4411 if (oprbits & 63) {
4412 d->p[i] = bits & MAKE_64BIT_MASK(0, oprbits & 63);
4413 }
4414 }
4415 }
4416
4417 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4418 {
4419 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4420 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4421 uint32_t oprbits = oprsz * 8;
4422 uint64_t esz_mask = pred_esz_masks[esz];
4423 ARMPredicateReg *d = vd;
4424
4425 count <<= esz;
4426 memset(d, 0, sizeof(*d));
4427 do_whileg(d, esz_mask, count, oprbits);
4428 return pred_count_test(oprbits, count, true);
4429 }
4430
4431 uint32_t HELPER(sve_while2g)(void *vd, uint32_t count, uint32_t pred_desc)
4432 {
4433 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4434 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4435 uint32_t oprbits = oprsz * 8;
4436 uint64_t esz_mask = pred_esz_masks[esz];
4437 ARMPredicateReg *d = vd;
4438
4439 count <<= esz;
4440 memset(d, 0, 2 * sizeof(*d));
4441 if (count <= oprbits) {
4442 do_whileg(&d[1], esz_mask, count, oprbits);
4443 } else {
4444 do_whilel(&d[1], esz_mask, oprbits, oprbits);
4445 do_whileg(&d[0], esz_mask, count - oprbits, oprbits);
4446 }
4447
4448 return pred_count_test(2 * oprbits, count, true);
4449 }
4450
4451 uint32_t HELPER(sve_whilecg)(void *vd, uint32_t count, uint32_t pred_desc)
4452 {
4453 uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4454 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4455 uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA);
4456 uint32_t vl = pl * 8;
4457 uint32_t elements = (vl >> esz) << scale;
4458 ARMPredicateReg *d = vd;
4459
4460 *d = (ARMPredicateReg) {
4461 .p[0] = encode_pred_count(elements, count, esz, true)
4462 };
4463 return pred_count_test(elements, count, true);
4464 }
4465
4466 /* Recursive reduction using a binary function;
4467 * C.f. the ARM ARM function ReducePredicated.
4468 *
4469 * While it would be possible to write this without the DATA temporary,
4470 * it is much simpler to process the predicate register this way.
4471 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4472 * little to gain with a more complex non-recursive form.
4473 */
4474 #define DO_REDUCE(NAME, SUF, TYPE, H, FUNC, IDENT) \
4475 static TYPE FUNC##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4476 { \
4477 if (n == 1) { \
4478 return *data; \
4479 } else { \
4480 uintptr_t half = n / 2; \
4481 TYPE lo = FUNC##_reduce(data, status, half); \
4482 TYPE hi = FUNC##_reduce(data + half, status, half); \
4483 return FUNC(lo, hi, status); \
4484 } \
4485 } \
4486 uint64_t helper_sve_##NAME##v_##SUF(void *vn, void *vg, \
4487 float_status *status, uint32_t desc) \
4488 { \
4489 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
4490 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
4491 TYPE ident = IDENT; \
4492 for (i = 0; i < oprsz; ) { \
4493 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4494 do { \
4495 TYPE nn = *(TYPE *)(vn + H(i)); \
4496 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : ident); \
4497 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4498 } while (i & 15); \
4499 } \
4500 for (; i < maxsz; i += sizeof(TYPE)) { \
4501 *(TYPE *)((void *)data + i) = ident; \
4502 } \
4503 return FUNC##_reduce(data, status, maxsz / sizeof(TYPE)); \
4504 } \
4505 void helper_sve2p1_##NAME##qv_##SUF(void *vd, void *vn, void *vg, \
4506 float_status *status, uint32_t desc) \
4507 { \
4508 unsigned oprsz = simd_oprsz(desc), segments = oprsz / 16; \
4509 TYPE ident = IDENT; \
4510 for (unsigned e = 0; e < 16; e += sizeof(TYPE)) { \
4511 TYPE data[ARM_MAX_VQ]; \
4512 for (unsigned s = 0; s < segments; s++) { \
4513 uint16_t pg = *(uint16_t *)(vg + H1_2(s * 2)); \
4514 TYPE nn = *(TYPE *)(vn + (s * 16 + H(e))); \
4515 data[s] = (pg >> e) & 1 ? nn : ident; \
4516 } \
4517 *(TYPE *)(vd + H(e)) = FUNC##_reduce(data, status, segments); \
4518 } \
4519 clear_tail(vd, 16, simd_maxsz(desc)); \
4520 }
4521
4522 DO_REDUCE(fadd,h, float16, H1_2, float16_add, float16_zero)
4523 DO_REDUCE(fadd,s, float32, H1_4, float32_add, float32_zero)
4524 DO_REDUCE(fadd,d, float64, H1_8, float64_add, float64_zero)
4525
4526 /*
4527 * We can't avoid the function call for the default NaN value, because
4528 * it changes when FPCR.AH is set.
4529 */
4530 DO_REDUCE(fminnm,h, float16, H1_2, float16_minnum, float16_default_nan(status))
4531 DO_REDUCE(fminnm,s, float32, H1_4, float32_minnum, float32_default_nan(status))
4532 DO_REDUCE(fminnm,d, float64, H1_8, float64_minnum, float64_default_nan(status))
4533
4534 DO_REDUCE(fmaxnm,h, float16, H1_2, float16_maxnum, float16_default_nan(status))
4535 DO_REDUCE(fmaxnm,s, float32, H1_4, float32_maxnum, float32_default_nan(status))
4536 DO_REDUCE(fmaxnm,d, float64, H1_8, float64_maxnum, float64_default_nan(status))
4537
4538 DO_REDUCE(fmin,h, float16, H1_2, float16_min, float16_infinity)
4539 DO_REDUCE(fmin,s, float32, H1_4, float32_min, float32_infinity)
4540 DO_REDUCE(fmin,d, float64, H1_8, float64_min, float64_infinity)
4541
4542 DO_REDUCE(fmax,h, float16, H1_2, float16_max, float16_chs(float16_infinity))
4543 DO_REDUCE(fmax,s, float32, H1_4, float32_max, float32_chs(float32_infinity))
4544 DO_REDUCE(fmax,d, float64, H1_8, float64_max, float64_chs(float64_infinity))
4545
4546 DO_REDUCE(ah_fmin,h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
4547 DO_REDUCE(ah_fmin,s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
4548 DO_REDUCE(ah_fmin,d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
4549
4550 DO_REDUCE(ah_fmax,h, float16, H1_2, helper_vfp_ah_maxh,
4551 float16_chs(float16_infinity))
4552 DO_REDUCE(ah_fmax,s, float32, H1_4, helper_vfp_ah_maxs,
4553 float32_chs(float32_infinity))
4554 DO_REDUCE(ah_fmax,d, float64, H1_8, helper_vfp_ah_maxd,
4555 float64_chs(float64_infinity))
4556
4557 #undef DO_REDUCE
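/*
 * Illustrative sketch of the recursive scheme used by FUNC##_reduce
 * above, not used by the helpers: reduce a power-of-two sized array
 * pairwise, with plain integer addition standing in for the predicated
 * softfloat operation.
 */
static uint64_t pairwise_reduce_example(const uint64_t *data, uintptr_t n)
{
    if (n == 1) {
        return *data;
    } else {
        uintptr_t half = n / 2;
        return pairwise_reduce_example(data, half)
             + pairwise_reduce_example(data + half, half);
    }
}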
4558
4559 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4560 float_status *status, uint32_t desc)
4561 {
4562 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4563 float16 result = nn;
4564
4565 do {
4566 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4567 do {
4568 if (pg & 1) {
4569 float16 mm = *(float16 *)(vm + H1_2(i));
4570 result = float16_add(result, mm, status);
4571 }
4572 i += sizeof(float16), pg >>= sizeof(float16);
4573 } while (i & 15);
4574 } while (i < opr_sz);
4575
4576 return result;
4577 }
4578
4579 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4580 float_status *status, uint32_t desc)
4581 {
4582 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4583 float32 result = nn;
4584
4585 do {
4586 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4587 do {
4588 if (pg & 1) {
4589 float32 mm = *(float32 *)(vm + H1_2(i));
4590 result = float32_add(result, mm, status);
4591 }
4592 i += sizeof(float32), pg >>= sizeof(float32);
4593 } while (i & 15);
4594 } while (i < opr_sz);
4595
4596 return result;
4597 }
4598
4599 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4600 float_status *status, uint32_t desc)
4601 {
4602 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4603 uint64_t *m = vm;
4604 uint8_t *pg = vg;
4605
4606 for (i = 0; i < opr_sz; i++) {
4607 if (pg[H1(i)] & 1) {
4608 nn = float64_add(nn, m[i], status);
4609 }
4610 }
4611
4612 return nn;
4613 }
4614
4615 /* Fully general three-operand expander, controlled by a predicate,
4616  * with the extra float_status parameter.
4617 */
4618 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4619 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4620 float_status *status, uint32_t desc) \
4621 { \
4622 intptr_t i = simd_oprsz(desc); \
4623 uint64_t *g = vg; \
4624 do { \
4625 uint64_t pg = g[(i - 1) >> 6]; \
4626 do { \
4627 i -= sizeof(TYPE); \
4628 if (likely((pg >> (i & 63)) & 1)) { \
4629 TYPE nn = *(TYPE *)(vn + H(i)); \
4630 TYPE mm = *(TYPE *)(vm + H(i)); \
4631 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4632 } \
4633 } while (i & 63); \
4634 } while (i != 0); \
4635 }
4636
4637 DO_ZPZZ_FP(sve_fadd_b16, uint16_t, H1_2, bfloat16_add)
4638 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4639 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4640 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4641
4642 DO_ZPZZ_FP(sve_fsub_b16, uint16_t, H1_2, bfloat16_sub)
4643 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4644 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4645 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4646
4647 DO_ZPZZ_FP(sve_fmul_b16, uint16_t, H1_2, bfloat16_mul)
4648 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4649 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4650 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4651
4652 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4653 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4654 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4655
4656 DO_ZPZZ_FP(sve_fmin_b16, uint16_t, H1_2, bfloat16_min)
4657 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4658 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4659 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4660
4661 DO_ZPZZ_FP(sve_fmax_b16, uint16_t, H1_2, bfloat16_max)
4662 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4663 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4664 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4665
4666 DO_ZPZZ_FP(sve_ah_fmin_b16, uint16_t, H1_2, helper_sme2_ah_fmin_b16)
4667 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh)
4668 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins)
4669 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind)
4670
4671 DO_ZPZZ_FP(sve_ah_fmax_b16, uint16_t, H1_2, helper_sme2_ah_fmax_b16)
4672 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh)
4673 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs)
4674 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd)
4675
4676 DO_ZPZZ_FP(sve_fminnum_b16, uint16_t, H1_2, bfloat16_minnum)
4677 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4678 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4679 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4680
4681 DO_ZPZZ_FP(sve_fmaxnum_b16, uint16_t, H1_2, bfloat16_maxnum)
4682 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4683 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4684 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4685
4686 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4687 {
4688 return float16_abs(float16_sub(a, b, s));
4689 }
4690
4691 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4692 {
4693 return float32_abs(float32_sub(a, b, s));
4694 }
4695
4696 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4697 {
4698 return float64_abs(float64_sub(a, b, s));
4699 }
4700
4701 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
4702 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat)
4703 {
4704 float16 r = float16_sub(op1, op2, stat);
4705 return float16_is_any_nan(r) ? r : float16_abs(r);
4706 }
4707
4708 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat)
4709 {
4710 float32 r = float32_sub(op1, op2, stat);
4711 return float32_is_any_nan(r) ? r : float32_abs(r);
4712 }
4713
4714 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat)
4715 {
4716 float64 r = float64_sub(op1, op2, stat);
4717 return float64_is_any_nan(r) ? r : float64_abs(r);
4718 }
4719
4720 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4721 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4722 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4723 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h)
4724 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s)
4725 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d)
4726
4727 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4728 {
4729 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4730 return float64_scalbn(a, b_int, s);
4731 }
4732
4733 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4734 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4735 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4736
4737 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4738 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4739 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4740
4741 #undef DO_ZPZZ_FP
4742
4743 /* Three-operand expander, with one scalar operand, controlled by
4744 * a predicate, with the extra float_status parameter.
4745 */
4746 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4747 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4748 float_status *status, uint32_t desc) \
4749 { \
4750 intptr_t i = simd_oprsz(desc); \
4751 uint64_t *g = vg; \
4752 TYPE mm = scalar; \
4753 do { \
4754 uint64_t pg = g[(i - 1) >> 6]; \
4755 do { \
4756 i -= sizeof(TYPE); \
4757 if (likely((pg >> (i & 63)) & 1)) { \
4758 TYPE nn = *(TYPE *)(vn + H(i)); \
4759 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4760 } \
4761 } while (i & 63); \
4762 } while (i != 0); \
4763 }
4764
4765 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4766 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4767 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4768
4769 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4770 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4771 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4772
4773 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4774 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4775 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4776
4777 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4778 {
4779 return float16_sub(b, a, s);
4780 }
4781
4782 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4783 {
4784 return float32_sub(b, a, s);
4785 }
4786
4787 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4788 {
4789 return float64_sub(b, a, s);
4790 }
4791
4792 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4793 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4794 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4795
4796 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4797 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4798 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4799
4800 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4801 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4802 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4803
4804 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4805 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4806 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4807
4808 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4809 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4810 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4811
4812 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh)
4813 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs)
4814 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd)
4815
4816 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh)
4817 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins)
4818 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind)
4819
4820 /* Fully general two-operand expander, controlled by a predicate,
4821  * with the extra float_status parameter.
4822 */
4823 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4824 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4825 float_status *status, uint32_t desc) \
4826 { \
4827 intptr_t i = simd_oprsz(desc); \
4828 uint64_t *g = vg; \
4829 do { \
4830 uint64_t pg = g[(i - 1) >> 6]; \
4831 do { \
4832 i -= sizeof(TYPE); \
4833 if (likely((pg >> (i & 63)) & 1)) { \
4834 TYPE nn = *(TYPE *)(vn + H(i)); \
4835 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4836 } \
4837 } while (i & 63); \
4838 } while (i != 0); \
4839 }
4840
4841 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4842 * FZ16. When converting from fp16, this affects flushing input denormals;
4843 * when converting to fp16, this affects flushing output denormals.
4844 */
4845 float32 sve_f16_to_f32(float16 f, float_status *fpst)
4846 {
4847 bool save = get_flush_inputs_to_zero(fpst);
4848 float32 ret;
4849
4850 set_flush_inputs_to_zero(false, fpst);
4851 ret = float16_to_float32(f, true, fpst);
4852 set_flush_inputs_to_zero(save, fpst);
4853 return ret;
4854 }
4855
4856 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4857 {
4858 bool save = get_flush_inputs_to_zero(fpst);
4859 float64 ret;
4860
4861 set_flush_inputs_to_zero(false, fpst);
4862 ret = float16_to_float64(f, true, fpst);
4863 set_flush_inputs_to_zero(save, fpst);
4864 return ret;
4865 }
4866
4867 float16 sve_f32_to_f16(float32 f, float_status *fpst)
4868 {
4869 bool save = get_flush_to_zero(fpst);
4870 float16 ret;
4871
4872 set_flush_to_zero(false, fpst);
4873 ret = float32_to_float16(f, true, fpst);
4874 set_flush_to_zero(save, fpst);
4875 return ret;
4876 }
4877
4878 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4879 {
4880 bool save = get_flush_to_zero(fpst);
4881 float16 ret;
4882
4883 set_flush_to_zero(false, fpst);
4884 ret = float64_to_float16(f, true, fpst);
4885 set_flush_to_zero(save, fpst);
4886 return ret;
4887 }
4888
4889 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4890 {
4891 if (float16_is_any_nan(f)) {
4892 float_raise(float_flag_invalid, s);
4893 return 0;
4894 }
4895 return float16_to_int16_round_to_zero(f, s);
4896 }
4897
4898 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4899 {
4900 if (float16_is_any_nan(f)) {
4901 float_raise(float_flag_invalid, s);
4902 return 0;
4903 }
4904 return float16_to_int64_round_to_zero(f, s);
4905 }
4906
4907 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4908 {
4909 if (float32_is_any_nan(f)) {
4910 float_raise(float_flag_invalid, s);
4911 return 0;
4912 }
4913 return float32_to_int64_round_to_zero(f, s);
4914 }
4915
4916 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4917 {
4918 if (float64_is_any_nan(f)) {
4919 float_raise(float_flag_invalid, s);
4920 return 0;
4921 }
4922 return float64_to_int64_round_to_zero(f, s);
4923 }
4924
4925 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4926 {
4927 if (float16_is_any_nan(f)) {
4928 float_raise(float_flag_invalid, s);
4929 return 0;
4930 }
4931 return float16_to_uint16_round_to_zero(f, s);
4932 }
4933
4934 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4935 {
4936 if (float16_is_any_nan(f)) {
4937 float_raise(float_flag_invalid, s);
4938 return 0;
4939 }
4940 return float16_to_uint64_round_to_zero(f, s);
4941 }
4942
4943 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4944 {
4945 if (float32_is_any_nan(f)) {
4946 float_raise(float_flag_invalid, s);
4947 return 0;
4948 }
4949 return float32_to_uint64_round_to_zero(f, s);
4950 }
4951
4952 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4953 {
4954 if (float64_is_any_nan(f)) {
4955 float_raise(float_flag_invalid, s);
4956 return 0;
4957 }
4958 return float64_to_uint64_round_to_zero(f, s);
4959 }
4960
4961 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4962 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4963 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
4964 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4965 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4966 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4967 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4968
4969 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4970 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4971 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4972 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4973 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4974 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4975 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4976
4977 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4978 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4979 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4980 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4981 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4982 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4983 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4984
4985 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4986 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4987 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4988
4989 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4990 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4991 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4992
4993 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4994 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4995 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4996
4997 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4998 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4999 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
5000
5001 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
5002 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
5003 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
5004 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
5005 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
5006 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
5007 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
5008
5009 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
5010 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
5011 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
5012 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
5013 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
5014 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
5015 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
5016
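/*
 * FLOGB: return the base-2 exponent (floor(log2(|x|))) as a signed integer
 * of the same width, computed directly from the exponent and fraction
 * fields rather than via the softfloat arithmetic routines.  Illustrative
 * float16 values, assuming denormal inputs are not flushed:
 *   0x3c00 (1.0)    -> exp field 15, result 15 - 15 = 0
 *   0x0001 (2^-24)  -> subnormal: -15 - clz32 of the shifted fraction
 *                      = -15 - 9 = -24
 *   0x7c00 (+Inf)   -> INT16_MAX; zero and NaN raise Invalid, return INT16_MIN
 */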
5017 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
5018 {
5019 /* Extract frac to the top of the uint32_t. */
5020 uint32_t frac = (uint32_t)a << (16 + 6);
5021 int16_t exp = extract32(a, 10, 5);
5022
5023 if (unlikely(exp == 0)) {
5024 if (frac != 0) {
5025 if (!get_flush_inputs_to_zero(s)) {
5026 /* denormal: bias - fractional_zeros */
5027 return -15 - clz32(frac);
5028 }
5029 /* flush to zero */
5030 float_raise(float_flag_input_denormal_flushed, s);
5031 }
5032 } else if (unlikely(exp == 0x1f)) {
5033 if (frac == 0) {
5034 return INT16_MAX; /* infinity */
5035 }
5036 } else {
5037 /* normal: exp - bias */
5038 return exp - 15;
5039 }
5040 /* nan or zero */
5041 float_raise(float_flag_invalid, s);
5042 return INT16_MIN;
5043 }
5044
5045 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
5046 {
5047 /* Extract frac to the top of the uint32_t. */
5048 uint32_t frac = a << 9;
5049 int32_t exp = extract32(a, 23, 8);
5050
5051 if (unlikely(exp == 0)) {
5052 if (frac != 0) {
5053 if (!get_flush_inputs_to_zero(s)) {
5054 /* denormal: bias - fractional_zeros */
5055 return -127 - clz32(frac);
5056 }
5057 /* flush to zero */
5058 float_raise(float_flag_input_denormal_flushed, s);
5059 }
5060 } else if (unlikely(exp == 0xff)) {
5061 if (frac == 0) {
5062 return INT32_MAX; /* infinity */
5063 }
5064 } else {
5065 /* normal: exp - bias */
5066 return exp - 127;
5067 }
5068 /* nan or zero */
5069 float_raise(float_flag_invalid, s);
5070 return INT32_MIN;
5071 }
5072
5073 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
5074 {
5075 /* Extract frac to the top of the uint64_t. */
5076 uint64_t frac = a << 12;
5077 int64_t exp = extract64(a, 52, 11);
5078
5079 if (unlikely(exp == 0)) {
5080 if (frac != 0) {
5081 if (!get_flush_inputs_to_zero(s)) {
5082 /* denormal: bias - fractional_zeros */
5083 return -1023 - clz64(frac);
5084 }
5085 /* flush to zero */
5086 float_raise(float_flag_input_denormal_flushed, s);
5087 }
5088 } else if (unlikely(exp == 0x7ff)) {
5089 if (frac == 0) {
5090 return INT64_MAX; /* infinity */
5091 }
5092 } else {
5093 /* normal: exp - bias */
5094 return exp - 1023;
5095 }
5096 /* nan or zero */
5097 float_raise(float_flag_invalid, s);
5098 return INT64_MIN;
5099 }
5100
5101 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
5102 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
5103 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
5104
5105 #undef DO_ZPZ_FP
5106
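/*
 * Predicated fused multiply-add expanders, here for bfloat16 elements and
 * below for fp16/fp32/fp64.  The neg1/neg3 arguments are sign-bit XOR
 * masks applied to the first and third operands (used by the fmls/fnmla/
 * fnmls forms when FPCR.AH == 0); the "_ah_" variants instead pass
 * float_muladd_negate_* flags, so the negation happens inside the fused
 * operation and NaN inputs are not sign-flipped.
 */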
5107 static void do_fmla_zpzzz_b16(void *vd, void *vn, void *vm, void *va, void *vg,
5108 float_status *status, uint32_t desc,
5109 uint16_t neg1, uint16_t neg3, int flags)
5110 {
5111 intptr_t i = simd_oprsz(desc);
5112 uint64_t *g = vg;
5113
5114 do {
5115 uint64_t pg = g[(i - 1) >> 6];
5116 do {
5117 i -= 2;
5118 if (likely((pg >> (i & 63)) & 1)) {
5119 float16 e1, e2, e3, r;
5120
5121 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
5122 e2 = *(uint16_t *)(vm + H1_2(i));
5123 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
5124 r = bfloat16_muladd(e1, e2, e3, flags, status);
5125 *(uint16_t *)(vd + H1_2(i)) = r;
5126 }
5127 } while (i & 63);
5128 } while (i != 0);
5129 }
5130
5131 void HELPER(sve_fmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5132 void *vg, float_status *status, uint32_t desc)
5133 {
5134 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
5135 }
5136
5137 void HELPER(sve_fmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5138 void *vg, float_status *status, uint32_t desc)
5139 {
5140 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
5141 }
5142
5143 void HELPER(sve_fnmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5144 void *vg, float_status *status, uint32_t desc)
5145 {
5146 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
5147 }
5148
5149 void HELPER(sve_fnmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5150 void *vg, float_status *status, uint32_t desc)
5151 {
5152 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
5153 }
5154
5155 void HELPER(sve_ah_fmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5156 void *vg, float_status *status, uint32_t desc)
5157 {
5158 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0,
5159 float_muladd_negate_product);
5160 }
5161
5162 void HELPER(sve_ah_fnmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5163 void *vg, float_status *status, uint32_t desc)
5164 {
5165 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0,
5166 float_muladd_negate_product | float_muladd_negate_c);
5167 }
5168
5169 void HELPER(sve_ah_fnmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5170 void *vg, float_status *status, uint32_t desc)
5171 {
5172 do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0,
5173 float_muladd_negate_c);
5174 }
5175
5176 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
5177 float_status *status, uint32_t desc,
5178 uint16_t neg1, uint16_t neg3, int flags)
5179 {
5180 intptr_t i = simd_oprsz(desc);
5181 uint64_t *g = vg;
5182
5183 do {
5184 uint64_t pg = g[(i - 1) >> 6];
5185 do {
5186 i -= 2;
5187 if (likely((pg >> (i & 63)) & 1)) {
5188 float16 e1, e2, e3, r;
5189
5190 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
5191 e2 = *(uint16_t *)(vm + H1_2(i));
5192 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
5193 r = float16_muladd(e1, e2, e3, flags, status);
5194 *(uint16_t *)(vd + H1_2(i)) = r;
5195 }
5196 } while (i & 63);
5197 } while (i != 0);
5198 }
5199
5200 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5201 void *vg, float_status *status, uint32_t desc)
5202 {
5203 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
5204 }
5205
5206 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5207 void *vg, float_status *status, uint32_t desc)
5208 {
5209 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
5210 }
5211
5212 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5213 void *vg, float_status *status, uint32_t desc)
5214 {
5215 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
5216 }
5217
5218 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5219 void *vg, float_status *status, uint32_t desc)
5220 {
5221 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
5222 }
5223
5224 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5225 void *vg, float_status *status, uint32_t desc)
5226 {
5227 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
5228 float_muladd_negate_product);
5229 }
5230
5231 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5232 void *vg, float_status *status, uint32_t desc)
5233 {
5234 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
5235 float_muladd_negate_product | float_muladd_negate_c);
5236 }
5237
5238 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5239 void *vg, float_status *status, uint32_t desc)
5240 {
5241 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
5242 float_muladd_negate_c);
5243 }
5244
5245 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
5246 float_status *status, uint32_t desc,
5247 uint32_t neg1, uint32_t neg3, int flags)
5248 {
5249 intptr_t i = simd_oprsz(desc);
5250 uint64_t *g = vg;
5251
5252 do {
5253 uint64_t pg = g[(i - 1) >> 6];
5254 do {
5255 i -= 4;
5256 if (likely((pg >> (i & 63)) & 1)) {
5257 float32 e1, e2, e3, r;
5258
5259 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
5260 e2 = *(uint32_t *)(vm + H1_4(i));
5261 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
5262 r = float32_muladd(e1, e2, e3, flags, status);
5263 *(uint32_t *)(vd + H1_4(i)) = r;
5264 }
5265 } while (i & 63);
5266 } while (i != 0);
5267 }
5268
5269 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5270 void *vg, float_status *status, uint32_t desc)
5271 {
5272 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
5273 }
5274
5275 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5276 void *vg, float_status *status, uint32_t desc)
5277 {
5278 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0);
5279 }
5280
5281 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5282 void *vg, float_status *status, uint32_t desc)
5283 {
5284 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0);
5285 }
5286
5287 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5288 void *vg, float_status *status, uint32_t desc)
5289 {
5290 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0);
5291 }
5292
5293 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5294 void *vg, float_status *status, uint32_t desc)
5295 {
5296 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
5297 float_muladd_negate_product);
5298 }
5299
5300 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5301 void *vg, float_status *status, uint32_t desc)
5302 {
5303 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
5304 float_muladd_negate_product | float_muladd_negate_c);
5305 }
5306
5307 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5308 void *vg, float_status *status, uint32_t desc)
5309 {
5310 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
5311 float_muladd_negate_c);
5312 }
5313
5314 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
5315 float_status *status, uint32_t desc,
5316 uint64_t neg1, uint64_t neg3, int flags)
5317 {
5318 intptr_t i = simd_oprsz(desc);
5319 uint64_t *g = vg;
5320
5321 do {
5322 uint64_t pg = g[(i - 1) >> 6];
5323 do {
5324 i -= 8;
5325 if (likely((pg >> (i & 63)) & 1)) {
5326 float64 e1, e2, e3, r;
5327
5328 e1 = *(uint64_t *)(vn + i) ^ neg1;
5329 e2 = *(uint64_t *)(vm + i);
5330 e3 = *(uint64_t *)(va + i) ^ neg3;
5331 r = float64_muladd(e1, e2, e3, flags, status);
5332 *(uint64_t *)(vd + i) = r;
5333 }
5334 } while (i & 63);
5335 } while (i != 0);
5336 }
5337
5338 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5339 void *vg, float_status *status, uint32_t desc)
5340 {
5341 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
5342 }
5343
5344 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5345 void *vg, float_status *status, uint32_t desc)
5346 {
5347 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0);
5348 }
5349
5350 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5351 void *vg, float_status *status, uint32_t desc)
5352 {
5353 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0);
5354 }
5355
5356 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5357 void *vg, float_status *status, uint32_t desc)
5358 {
5359 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0);
5360 }
5361
5362 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5363 void *vg, float_status *status, uint32_t desc)
5364 {
5365 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5366 float_muladd_negate_product);
5367 }
5368
5369 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5370 void *vg, float_status *status, uint32_t desc)
5371 {
5372 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5373 float_muladd_negate_product | float_muladd_negate_c);
5374 }
5375
5376 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5377 void *vg, float_status *status, uint32_t desc)
5378 {
5379 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5380 float_muladd_negate_c);
5381 }
5382
5383 /* Two operand floating-point comparison controlled by a predicate.
5384 * Unlike the integer version, we are not allowed to optimistically
5385 * compare operands, since the comparison may have side effects wrt
5386 * the FPSR.
5387 */
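/*
 * Each element yields a single result bit at bit position (i & 63) of the
 * output predicate word: OUT is shifted left by sizeof(TYPE) per element
 * and filled from the highest element downwards, which matches the SVE
 * predicate layout of one bit per byte of the vector.
 */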
5388 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
5389 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
5390 float_status *status, uint32_t desc) \
5391 { \
5392 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
5393 uint64_t *d = vd, *g = vg; \
5394 do { \
5395 uint64_t out = 0, pg = g[j]; \
5396 do { \
5397 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
5398 if (likely((pg >> (i & 63)) & 1)) { \
5399 TYPE nn = *(TYPE *)(vn + H(i)); \
5400 TYPE mm = *(TYPE *)(vm + H(i)); \
5401 out |= OP(TYPE, nn, mm, status); \
5402 } \
5403 } while (i & 63); \
5404 d[j--] = out; \
5405 } while (i > 0); \
5406 }
5407
5408 #define DO_FPCMP_PPZZ_H(NAME, OP) \
5409 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
5410 #define DO_FPCMP_PPZZ_S(NAME, OP) \
5411 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
5412 #define DO_FPCMP_PPZZ_D(NAME, OP) \
5413 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
5414
5415 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
5416 DO_FPCMP_PPZZ_H(NAME, OP) \
5417 DO_FPCMP_PPZZ_S(NAME, OP) \
5418 DO_FPCMP_PPZZ_D(NAME, OP)
5419
5420 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
5421 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
5422 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
5423 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
5424 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
5425 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
5426 #define DO_FCMUO(TYPE, X, Y, ST) \
5427 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
5428 #define DO_FACGE(TYPE, X, Y, ST) \
5429 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
5430 #define DO_FACGT(TYPE, X, Y, ST) \
5431 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
5432
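/*
 * The ordered comparisons are written as compare(Y, X) <= 0 / < 0 so that
 * an unordered result (float_relation_unordered) yields false, while the
 * EQ/NE/UO forms use the quiet compare so that quiet NaN operands do not
 * raise Invalid Operation.
 */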
5433 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
5434 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
5435 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
5436 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
5437 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
5438 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
5439 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
5440
5441 #undef DO_FPCMP_PPZZ_ALL
5442 #undef DO_FPCMP_PPZZ_D
5443 #undef DO_FPCMP_PPZZ_S
5444 #undef DO_FPCMP_PPZZ_H
5445 #undef DO_FPCMP_PPZZ
5446
5447 /* One operand floating-point comparison against zero, controlled
5448 * by a predicate.
5449 */
5450 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
5451 void HELPER(NAME)(void *vd, void *vn, void *vg, \
5452 float_status *status, uint32_t desc) \
5453 { \
5454 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
5455 uint64_t *d = vd, *g = vg; \
5456 do { \
5457 uint64_t out = 0, pg = g[j]; \
5458 do { \
5459 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
5460 if ((pg >> (i & 63)) & 1) { \
5461 TYPE nn = *(TYPE *)(vn + H(i)); \
5462 out |= OP(TYPE, nn, 0, status); \
5463 } \
5464 } while (i & 63); \
5465 d[j--] = out; \
5466 } while (i > 0); \
5467 }
5468
5469 #define DO_FPCMP_PPZ0_H(NAME, OP) \
5470 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
5471 #define DO_FPCMP_PPZ0_S(NAME, OP) \
5472 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
5473 #define DO_FPCMP_PPZ0_D(NAME, OP) \
5474 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
5475
5476 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
5477 DO_FPCMP_PPZ0_H(NAME, OP) \
5478 DO_FPCMP_PPZ0_S(NAME, OP) \
5479 DO_FPCMP_PPZ0_D(NAME, OP)
5480
5481 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5482 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5483 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5484 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5485 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5486 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5487
5488 /* FP Trig Multiply-Add. */
5489
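/*
 * The coefficient tables below hold, in IEEE encoding, the series
 * coefficients FTMAD iterates over to build sin(x) and cos(x): entries
 * 0..7 approximate 1, -1/6, 1/120, ... (sine) and entries 8..15
 * approximate 1, -1/2, 1/24, ... (cosine); a negative multiplicand
 * selects the upper half of the table (xx += 8 below).
 */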
5490 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm,
5491 float_status *s, uint32_t desc)
5492 {
5493 static const float16 coeff[16] = {
5494 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5495 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5496 };
5497 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5498 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5499 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5500 float16 *d = vd, *n = vn, *m = vm;
5501
5502 for (i = 0; i < opr_sz; i++) {
5503 float16 mm = m[i];
5504 intptr_t xx = x;
5505 int flags = 0;
5506
5507 if (float16_is_neg(mm)) {
5508 if (fpcr_ah) {
5509 flags = float_muladd_negate_product;
5510 } else {
5511 mm = float16_abs(mm);
5512 }
5513 xx += 8;
5514 }
5515 d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s);
5516 }
5517 }
5518
5519 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm,
5520 float_status *s, uint32_t desc)
5521 {
5522 static const float32 coeff[16] = {
5523 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5524 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5525 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5526 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5527 };
5528 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5529 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5530 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5531 float32 *d = vd, *n = vn, *m = vm;
5532
5533 for (i = 0; i < opr_sz; i++) {
5534 float32 mm = m[i];
5535 intptr_t xx = x;
5536 int flags = 0;
5537
5538 if (float32_is_neg(mm)) {
5539 if (fpcr_ah) {
5540 flags = float_muladd_negate_product;
5541 } else {
5542 mm = float32_abs(mm);
5543 }
5544 xx += 8;
5545 }
5546 d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s);
5547 }
5548 }
5549
5550 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm,
5551 float_status *s, uint32_t desc)
5552 {
5553 static const float64 coeff[16] = {
5554 0x3ff0000000000000ull, 0xbfc5555555555543ull,
5555 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5556 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5557 0x3de5d8408868552full, 0x0000000000000000ull,
5558 0x3ff0000000000000ull, 0xbfe0000000000000ull,
5559 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5560 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5561 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5562 };
5563 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5564 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5565 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5566 float64 *d = vd, *n = vn, *m = vm;
5567
5568 for (i = 0; i < opr_sz; i++) {
5569 float64 mm = m[i];
5570 intptr_t xx = x;
5571 int flags = 0;
5572
5573 if (float64_is_neg(mm)) {
5574 if (fpcr_ah) {
5575 flags = float_muladd_negate_product;
5576 } else {
5577 mm = float64_abs(mm);
5578 }
5579 xx += 8;
5580 }
5581 d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s);
5582 }
5583 }
5584
5585 /*
5586 * FP Complex Add
5587 */
5588
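/*
 * For each (real, imag) element pair, FCADD adds the second operand
 * rotated in the complex plane: rot == 0 gives (nr - mi, ni + mr), i.e.
 * a 90-degree rotation of the second operand, and rot == 1 gives
 * (nr + mi, ni - mr), i.e. a 270-degree rotation.  The negation goes
 * through float*_maybe_ah_chs so that the FPCR.AH flavour of negation
 * is used when required.
 */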
5589 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5590 float_status *s, uint32_t desc)
5591 {
5592 intptr_t j, i = simd_oprsz(desc);
5593 uint64_t *g = vg;
5594 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5595 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5596
5597 do {
5598 uint64_t pg = g[(i - 1) >> 6];
5599 do {
5600 float16 e0, e1, e2, e3;
5601
5602 /* I holds the real index; J holds the imag index. */
5603 j = i - sizeof(float16);
5604 i -= 2 * sizeof(float16);
5605
5606 e0 = *(float16 *)(vn + H1_2(i));
5607 e1 = *(float16 *)(vm + H1_2(j));
5608 e2 = *(float16 *)(vn + H1_2(j));
5609 e3 = *(float16 *)(vm + H1_2(i));
5610
5611 if (rot) {
5612 e3 = float16_maybe_ah_chs(e3, fpcr_ah);
5613 } else {
5614 e1 = float16_maybe_ah_chs(e1, fpcr_ah);
5615 }
5616
5617 if (likely((pg >> (i & 63)) & 1)) {
5618 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s);
5619 }
5620 if (likely((pg >> (j & 63)) & 1)) {
5621 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s);
5622 }
5623 } while (i & 63);
5624 } while (i != 0);
5625 }
5626
5627 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5628 float_status *s, uint32_t desc)
5629 {
5630 intptr_t j, i = simd_oprsz(desc);
5631 uint64_t *g = vg;
5632 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5633 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5634
5635 do {
5636 uint64_t pg = g[(i - 1) >> 6];
5637 do {
5638 float32 e0, e1, e2, e3;
5639
5640 /* I holds the real index; J holds the imag index. */
5641 j = i - sizeof(float32);
5642 i -= 2 * sizeof(float32);
5643
5644 e0 = *(float32 *)(vn + H1_2(i));
5645 e1 = *(float32 *)(vm + H1_2(j));
5646 e2 = *(float32 *)(vn + H1_2(j));
5647 e3 = *(float32 *)(vm + H1_2(i));
5648
5649 if (rot) {
5650 e3 = float32_maybe_ah_chs(e3, fpcr_ah);
5651 } else {
5652 e1 = float32_maybe_ah_chs(e1, fpcr_ah);
5653 }
5654
5655 if (likely((pg >> (i & 63)) & 1)) {
5656 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s);
5657 }
5658 if (likely((pg >> (j & 63)) & 1)) {
5659 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s);
5660 }
5661 } while (i & 63);
5662 } while (i != 0);
5663 }
5664
5665 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5666 float_status *s, uint32_t desc)
5667 {
5668 intptr_t j, i = simd_oprsz(desc);
5669 uint64_t *g = vg;
5670 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5671 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5672
5673 do {
5674 uint64_t pg = g[(i - 1) >> 6];
5675 do {
5676 float64 e0, e1, e2, e3;
5677
5678 /* I holds the real index; J holds the imag index. */
5679 j = i - sizeof(float64);
5680 i -= 2 * sizeof(float64);
5681
5682 e0 = *(float64 *)(vn + H1_2(i));
5683 e1 = *(float64 *)(vm + H1_2(j));
5684 e2 = *(float64 *)(vn + H1_2(j));
5685 e3 = *(float64 *)(vm + H1_2(i));
5686
5687 if (rot) {
5688 e3 = float64_maybe_ah_chs(e3, fpcr_ah);
5689 } else {
5690 e1 = float64_maybe_ah_chs(e1, fpcr_ah);
5691 }
5692
5693 if (likely((pg >> (i & 63)) & 1)) {
5694 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s);
5695 }
5696 if (likely((pg >> (j & 63)) & 1)) {
5697 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s);
5698 }
5699 } while (i & 63);
5700 } while (i != 0);
5701 }
5702
5703 /*
5704 * FP Complex Multiply
5705 */
5706
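/*
 * For FCMLA, desc bit 0 (flip) selects whether the even (real) or odd
 * (imaginary) element of the vn operand feeds both partial products, and
 * negf_real/negf_imag select which partial products are negated; the four
 * combinations give the four rotations.  As with FMLA above, the negation
 * is applied as a sign-bit XOR (negx_*) when FPCR.AH == 0 and as
 * float_muladd_negate_* flags (negf_*) when FPCR.AH == 1.
 */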
5707 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5708 void *vg, float_status *status, uint32_t desc)
5709 {
5710 intptr_t j, i = simd_oprsz(desc);
5711 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5712 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5713 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5714 uint32_t negf_real = flip ^ negf_imag;
5715 float16 negx_imag, negx_real;
5716 uint64_t *g = vg;
5717
5718 /* With AH=0, use negx; with AH=1 use negf. */
5719 negx_real = (negf_real & ~fpcr_ah) << 15;
5720 negx_imag = (negf_imag & ~fpcr_ah) << 15;
5721 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5722 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5723
5724 do {
5725 uint64_t pg = g[(i - 1) >> 6];
5726 do {
5727 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5728
5729 /* I holds the real index; J holds the imag index. */
5730 j = i - sizeof(float16);
5731 i -= 2 * sizeof(float16);
5732
5733 nr = *(float16 *)(vn + H1_2(i));
5734 ni = *(float16 *)(vn + H1_2(j));
5735 mr = *(float16 *)(vm + H1_2(i));
5736 mi = *(float16 *)(vm + H1_2(j));
5737
5738 e2 = (flip ? ni : nr);
5739 e1 = (flip ? mi : mr) ^ negx_real;
5740 e4 = e2;
5741 e3 = (flip ? mr : mi) ^ negx_imag;
5742
5743 if (likely((pg >> (i & 63)) & 1)) {
5744 d = *(float16 *)(va + H1_2(i));
5745 d = float16_muladd(e2, e1, d, negf_real, status);
5746 *(float16 *)(vd + H1_2(i)) = d;
5747 }
5748 if (likely((pg >> (j & 63)) & 1)) {
5749 d = *(float16 *)(va + H1_2(j));
5750 d = float16_muladd(e4, e3, d, negf_imag, status);
5751 *(float16 *)(vd + H1_2(j)) = d;
5752 }
5753 } while (i & 63);
5754 } while (i != 0);
5755 }
5756
5757 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5758 void *vg, float_status *status, uint32_t desc)
5759 {
5760 intptr_t j, i = simd_oprsz(desc);
5761 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5762 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5763 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5764 uint32_t negf_real = flip ^ negf_imag;
5765 float32 negx_imag, negx_real;
5766 uint64_t *g = vg;
5767
5768 /* With AH=0, use negx; with AH=1 use negf. */
5769 negx_real = (negf_real & ~fpcr_ah) << 31;
5770 negx_imag = (negf_imag & ~fpcr_ah) << 31;
5771 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5772 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5773
5774 do {
5775 uint64_t pg = g[(i - 1) >> 6];
5776 do {
5777 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5778
5779 /* I holds the real index; J holds the imag index. */
5780 j = i - sizeof(float32);
5781 i -= 2 * sizeof(float32);
5782
5783 nr = *(float32 *)(vn + H1_2(i));
5784 ni = *(float32 *)(vn + H1_2(j));
5785 mr = *(float32 *)(vm + H1_2(i));
5786 mi = *(float32 *)(vm + H1_2(j));
5787
5788 e2 = (flip ? ni : nr);
5789 e1 = (flip ? mi : mr) ^ negx_real;
5790 e4 = e2;
5791 e3 = (flip ? mr : mi) ^ negx_imag;
5792
5793 if (likely((pg >> (i & 63)) & 1)) {
5794 d = *(float32 *)(va + H1_2(i));
5795 d = float32_muladd(e2, e1, d, negf_real, status);
5796 *(float32 *)(vd + H1_2(i)) = d;
5797 }
5798 if (likely((pg >> (j & 63)) & 1)) {
5799 d = *(float32 *)(va + H1_2(j));
5800 d = float32_muladd(e4, e3, d, negf_imag, status);
5801 *(float32 *)(vd + H1_2(j)) = d;
5802 }
5803 } while (i & 63);
5804 } while (i != 0);
5805 }
5806
5807 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5808 void *vg, float_status *status, uint32_t desc)
5809 {
5810 intptr_t j, i = simd_oprsz(desc);
5811 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5812 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5813 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5814 uint32_t negf_real = flip ^ negf_imag;
5815 float64 negx_imag, negx_real;
5816 uint64_t *g = vg;
5817
5818 /* With AH=0, use negx; with AH=1 use negf. */
5819 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
5820 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
5821 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5822 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5823
5824 do {
5825 uint64_t pg = g[(i - 1) >> 6];
5826 do {
5827 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5828
5829 /* I holds the real index; J holds the imag index. */
5830 j = i - sizeof(float64);
5831 i -= 2 * sizeof(float64);
5832
5833 nr = *(float64 *)(vn + H1_2(i));
5834 ni = *(float64 *)(vn + H1_2(j));
5835 mr = *(float64 *)(vm + H1_2(i));
5836 mi = *(float64 *)(vm + H1_2(j));
5837
5838 e2 = (flip ? ni : nr);
5839 e1 = (flip ? mi : mr) ^ negx_real;
5840 e4 = e2;
5841 e3 = (flip ? mr : mi) ^ negx_imag;
5842
5843 if (likely((pg >> (i & 63)) & 1)) {
5844 d = *(float64 *)(va + H1_2(i));
5845 d = float64_muladd(e2, e1, d, negf_real, status);
5846 *(float64 *)(vd + H1_2(i)) = d;
5847 }
5848 if (likely((pg >> (j & 63)) & 1)) {
5849 d = *(float64 *)(va + H1_2(j));
5850 d = float64_muladd(e4, e3, d, negf_imag, status);
5851 *(float64 *)(vd + H1_2(j)) = d;
5852 }
5853 } while (i & 63);
5854 } while (i != 0);
5855 }
5856
5857 /*
5858 * Load contiguous data, protected by a governing predicate.
5859 */
5860
5861 /*
5862 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5863 * beginning at @reg_off, bounded by @reg_max. Return the offset of the first
5864 * active element >= @reg_off, or @reg_max if there were no active elements at all.
5865 */
5866 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5867 intptr_t reg_max, int esz)
5868 {
5869 uint64_t pg_mask = pred_esz_masks[esz];
5870 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5871
5872 /* In normal usage, the first element is active. */
5873 if (likely(pg & 1)) {
5874 return reg_off;
5875 }
5876
5877 if (pg == 0) {
5878 reg_off &= -64;
5879 do {
5880 reg_off += 64;
5881 if (unlikely(reg_off >= reg_max)) {
5882 /* The entire predicate was false. */
5883 return reg_max;
5884 }
5885 pg = vg[reg_off >> 6] & pg_mask;
5886 } while (pg == 0);
5887 }
5888 reg_off += ctz64(pg);
5889
5890 /* We should never see an out of range predicate bit set. */
5891 tcg_debug_assert(reg_off < reg_max);
5892 return reg_off;
5893 }
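
/*
 * Note that for wider elements only every (1 << esz)'th predicate bit is
 * significant; pred_esz_masks[esz] clears the others before scanning, so
 * e.g. with esz == 2 (word elements) only bits 0, 4, 8, ... can survive
 * the masking and ctz64() always lands on an element boundary.
 */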
5894
5895 /*
5896 * Resolve the guest virtual address to info->host and info->flags.
5897 * If @nofault, return false if the page is invalid, otherwise
5898 * exit via page fault exception.
5899 */
5900
5901 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5902 target_ulong addr, int mem_off, MMUAccessType access_type,
5903 int mmu_idx, uintptr_t retaddr)
5904 {
5905 int flags;
5906
5907 addr += mem_off;
5908
5909 /*
5910 * User-only currently always issues with TBI. See the comment
5911 * above useronly_clean_ptr. Usually we clean this top byte away
5912 * during translation, but we can't do that for e.g. vector + imm
5913 * addressing modes.
5914 *
5915 * We currently always enable TBI for user-only, and do not provide
5916 * a way to turn it off. So clean the pointer unconditionally here,
5917 * rather than look it up here, or pass it down from above.
5918 */
5919 addr = useronly_clean_ptr(addr);
5920
5921 #ifdef CONFIG_USER_ONLY
5922 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5923 &info->host, retaddr);
5924 #else
5925 CPUTLBEntryFull *full;
5926 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5927 &info->host, &full, retaddr);
5928 #endif
5929 info->flags = flags;
5930
5931 if (flags & TLB_INVALID_MASK) {
5932 g_assert(nofault);
5933 return false;
5934 }
5935
5936 #ifdef CONFIG_USER_ONLY
5937 memset(&info->attrs, 0, sizeof(info->attrs));
5938 /* Require both ANON and MTE; see allocation_tag_mem(). */
5939 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5940 #else
5941 info->attrs = full->attrs;
5942 info->tagged = full->extra.arm.pte_attrs == 0xf0;
5943 #endif
5944
5945 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5946 info->host -= mem_off;
5947 return true;
5948 }
5949
5950 /*
5951 * Find first active element on each page, and a loose bound for the
5952 * final element on each page. Identify any single element that spans
5953 * the page boundary. Return true if there are any active elements.
5954 */
5955 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5956 intptr_t reg_max, int esz, int msize)
5957 {
5958 const int esize = 1 << esz;
5959 const uint64_t pg_mask = pred_esz_masks[esz];
5960 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5961 intptr_t mem_off_last, mem_off_split;
5962 intptr_t page_split, elt_split;
5963 intptr_t i;
5964
5965 /* Set all of the element indices to -1, and the TLB data to 0. */
5966 memset(info, -1, offsetof(SVEContLdSt, page));
5967 memset(info->page, 0, sizeof(info->page));
5968
5969 /* Gross scan over the entire predicate to find bounds. */
5970 i = 0;
5971 do {
5972 uint64_t pg = vg[i] & pg_mask;
5973 if (pg) {
5974 reg_off_last = i * 64 + 63 - clz64(pg);
5975 if (reg_off_first < 0) {
5976 reg_off_first = i * 64 + ctz64(pg);
5977 }
5978 }
5979 } while (++i * 64 < reg_max);
5980
5981 if (unlikely(reg_off_first < 0)) {
5982 /* No active elements, no pages touched. */
5983 return false;
5984 }
5985 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5986
5987 info->reg_off_first[0] = reg_off_first;
5988 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5989 mem_off_last = (reg_off_last >> esz) * msize;
5990
5991 page_split = -(addr | TARGET_PAGE_MASK);
5992 if (likely(mem_off_last + msize <= page_split)) {
5993 /* The entire operation fits within a single page. */
5994 info->reg_off_last[0] = reg_off_last;
5995 return true;
5996 }
5997
5998 info->page_split = page_split;
5999 elt_split = page_split / msize;
6000 reg_off_split = elt_split << esz;
6001 mem_off_split = elt_split * msize;
6002
6003 /*
6004 * This is the last full element on the first page, but it is not
6005 * necessarily active. If there is no full element, i.e. the first
6006 * active element is the one that's split, this value remains -1.
6007 * It is useful as an iteration bound.
6008 */
6009 if (elt_split != 0) {
6010 info->reg_off_last[0] = reg_off_split - esize;
6011 }
6012
6013 /* Determine if an unaligned element spans the pages. */
6014 if (page_split % msize != 0) {
6015 /* It is helpful to know if the split element is active. */
6016 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
6017 info->reg_off_split = reg_off_split;
6018 info->mem_off_split = mem_off_split;
6019
6020 if (reg_off_split == reg_off_last) {
6021 /* The page crossing element is last. */
6022 return true;
6023 }
6024 }
6025 reg_off_split += esize;
6026 mem_off_split += msize;
6027 }
6028
6029 /*
6030 * We do want the first active element on the second page, because
6031 * this may affect the address reported in an exception.
6032 */
6033 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
6034 tcg_debug_assert(reg_off_split <= reg_off_last);
6035 info->reg_off_first[1] = reg_off_split;
6036 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
6037 info->reg_off_last[1] = reg_off_last;
6038 return true;
6039 }
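
/*
 * Worked example (illustrative): an 8-byte-element load with all elements
 * active, reg_max == 64, from an address 20 bytes below a page boundary.
 * Then page_split == 20 and elt_split == 2: elements 0-1 lie entirely on
 * the first page (reg_off_last[0] == 8), element 2 is the split element
 * (reg_off_split == 16, covering bytes 16-23 of which the last four are
 * on the second page), and element 3 (reg_off 24) becomes
 * reg_off_first[1] / mem_off_first[1].
 */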
6040
6041 /*
6042 * Resolve the guest virtual addresses to info->page[].
6043 * Control the generation of page faults with @fault. Return false if
6044 * there is no work to do, which can only happen with @fault == FAULT_NO.
6045 */
6046 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
6047 CPUARMState *env, target_ulong addr,
6048 MMUAccessType access_type, uintptr_t retaddr)
6049 {
6050 int mmu_idx = arm_env_mmu_index(env);
6051 int mem_off = info->mem_off_first[0];
6052 bool nofault = fault == FAULT_NO;
6053 bool have_work = true;
6054
6055 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
6056 access_type, mmu_idx, retaddr)) {
6057 /* No work to be done. */
6058 return false;
6059 }
6060
6061 if (likely(info->page_split < 0)) {
6062 /* The entire operation was on the one page. */
6063 return true;
6064 }
6065
6066 /*
6067 * If the second page is invalid, then we want the fault address to be
6068 * the first byte on that page which is accessed.
6069 */
6070 if (info->mem_off_split >= 0) {
6071 /*
6072 * There is an element split across the pages. The fault address
6073 * should be the first byte of the second page.
6074 */
6075 mem_off = info->page_split;
6076 /*
6077 * If the split element is also the first active element
6078 * of the vector, then: For first-fault we should continue
6079 * to generate faults for the second page. For no-fault,
6080 * we have work only if the second page is valid.
6081 */
6082 if (info->mem_off_first[0] < info->mem_off_split) {
6083 nofault = FAULT_FIRST;
6084 have_work = false;
6085 }
6086 } else {
6087 /*
6088 * There is no element split across the pages. The fault address
6089 * should be the first active element on the second page.
6090 */
6091 mem_off = info->mem_off_first[1];
6092 /*
6093 * There must have been one active element on the first page,
6094 * so we're out of first-fault territory.
6095 */
6096 nofault = fault != FAULT_ALL;
6097 }
6098
6099 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
6100 access_type, mmu_idx, retaddr);
6101 return have_work;
6102 }
6103
6104 #ifndef CONFIG_USER_ONLY
6105 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
6106 uint64_t *vg, target_ulong addr,
6107 int esize, int msize, int wp_access,
6108 uintptr_t retaddr)
6109 {
6110 intptr_t mem_off, reg_off, reg_last;
6111 int flags0 = info->page[0].flags;
6112 int flags1 = info->page[1].flags;
6113
6114 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
6115 return;
6116 }
6117
6118 /* Indicate that watchpoints are handled. */
6119 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
6120 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
6121
6122 if (flags0 & TLB_WATCHPOINT) {
6123 mem_off = info->mem_off_first[0];
6124 reg_off = info->reg_off_first[0];
6125 reg_last = info->reg_off_last[0];
6126
6127 while (reg_off <= reg_last) {
6128 uint64_t pg = vg[reg_off >> 6];
6129 do {
6130 if ((pg >> (reg_off & 63)) & 1) {
6131 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
6132 msize, info->page[0].attrs,
6133 wp_access, retaddr);
6134 }
6135 reg_off += esize;
6136 mem_off += msize;
6137 } while (reg_off <= reg_last && (reg_off & 63));
6138 }
6139 }
6140
6141 mem_off = info->mem_off_split;
6142 if (mem_off >= 0) {
6143 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
6144 info->page[0].attrs, wp_access, retaddr);
6145 }
6146
6147 mem_off = info->mem_off_first[1];
6148 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
6149 reg_off = info->reg_off_first[1];
6150 reg_last = info->reg_off_last[1];
6151
6152 do {
6153 uint64_t pg = vg[reg_off >> 6];
6154 do {
6155 if ((pg >> (reg_off & 63)) & 1) {
6156 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
6157 msize, info->page[1].attrs,
6158 wp_access, retaddr);
6159 }
6160 reg_off += esize;
6161 mem_off += msize;
6162 } while (reg_off & 63);
6163 } while (reg_off <= reg_last);
6164 }
6165 }
6166 #endif
6167
6168 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
6169 uint64_t *vg, target_ulong addr, int esize,
6170 int msize, uint32_t mtedesc, uintptr_t ra)
6171 {
6172 intptr_t mem_off, reg_off, reg_last;
6173
6174 /* Process the page only if MemAttr == Tagged. */
6175 if (info->page[0].tagged) {
6176 mem_off = info->mem_off_first[0];
6177 reg_off = info->reg_off_first[0];
6178 reg_last = info->reg_off_split;
6179 if (reg_last < 0) {
6180 reg_last = info->reg_off_last[0];
6181 }
6182
6183 do {
6184 uint64_t pg = vg[reg_off >> 6];
6185 do {
6186 if ((pg >> (reg_off & 63)) & 1) {
6187 mte_check(env, mtedesc, addr, ra);
6188 }
6189 reg_off += esize;
6190 mem_off += msize;
6191 } while (reg_off <= reg_last && (reg_off & 63));
6192 } while (reg_off <= reg_last);
6193 }
6194
6195 mem_off = info->mem_off_first[1];
6196 if (mem_off >= 0 && info->page[1].tagged) {
6197 reg_off = info->reg_off_first[1];
6198 reg_last = info->reg_off_last[1];
6199
6200 do {
6201 uint64_t pg = vg[reg_off >> 6];
6202 do {
6203 if ((pg >> (reg_off & 63)) & 1) {
6204 mte_check(env, mtedesc, addr, ra);
6205 }
6206 reg_off += esize;
6207 mem_off += msize;
6208 } while (reg_off & 63);
6209 } while (reg_off <= reg_last);
6210 }
6211 }
6212
6213 /*
6214 * Common helper for all contiguous 1,2,3,4-register predicated loads.
6215 */
6216 static inline QEMU_ALWAYS_INLINE
6217 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
6218 uint32_t desc, const uintptr_t retaddr,
6219 const int esz, const int msz, const int N, uint32_t mtedesc,
6220 sve_ldst1_host_fn *host_fn,
6221 sve_ldst1_tlb_fn *tlb_fn)
6222 {
6223 const unsigned rd = simd_data(desc);
6224 const intptr_t reg_max = simd_oprsz(desc);
6225 intptr_t reg_off, reg_last, mem_off;
6226 SVEContLdSt info;
6227 void *host;
6228 int flags, i;
6229
6230 /* Find the active elements. */
6231 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6232 /* The entire predicate was false; no load occurs. */
6233 for (i = 0; i < N; ++i) {
6234 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
6235 }
6236 return;
6237 }
6238
6239 /* Probe the page(s). Exit with exception for any invalid page. */
6240 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
6241
6242 /* Handle watchpoints for all active elements. */
6243 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6244 BP_MEM_READ, retaddr);
6245
6246 /*
6247 * Handle mte checks for all active elements.
6248 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6249 */
6250 if (mtedesc) {
6251 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6252 mtedesc, retaddr);
6253 }
6254
6255 flags = info.page[0].flags | info.page[1].flags;
6256 if (unlikely(flags != 0)) {
6257 /*
6258 * At least one page includes MMIO.
6259 * Any bus operation can fail with cpu_transaction_failed,
6260 * which for ARM will raise SyncExternal. Perform the load
6261 * into scratch memory to preserve register state until the end.
6262 */
6263 ARMVectorReg scratch[4] = { };
6264
6265 mem_off = info.mem_off_first[0];
6266 reg_off = info.reg_off_first[0];
6267 reg_last = info.reg_off_last[1];
6268 if (reg_last < 0) {
6269 reg_last = info.reg_off_split;
6270 if (reg_last < 0) {
6271 reg_last = info.reg_off_last[0];
6272 }
6273 }
6274
6275 do {
6276 uint64_t pg = vg[reg_off >> 6];
6277 do {
6278 if ((pg >> (reg_off & 63)) & 1) {
6279 for (i = 0; i < N; ++i) {
6280 tlb_fn(env, &scratch[i], reg_off,
6281 addr + mem_off + (i << msz), retaddr);
6282 }
6283 }
6284 reg_off += 1 << esz;
6285 mem_off += N << msz;
6286 } while (reg_off & 63);
6287 } while (reg_off <= reg_last);
6288
6289 for (i = 0; i < N; ++i) {
6290 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
6291 }
6292 return;
6293 }
6294
6295 /* The entire operation is in RAM, on valid pages. */
6296
6297 for (i = 0; i < N; ++i) {
6298 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
6299 }
6300
6301 mem_off = info.mem_off_first[0];
6302 reg_off = info.reg_off_first[0];
6303 reg_last = info.reg_off_last[0];
6304 host = info.page[0].host;
6305
6306 set_helper_retaddr(retaddr);
6307
6308 while (reg_off <= reg_last) {
6309 uint64_t pg = vg[reg_off >> 6];
6310 do {
6311 if ((pg >> (reg_off & 63)) & 1) {
6312 for (i = 0; i < N; ++i) {
6313 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6314 host + mem_off + (i << msz));
6315 }
6316 }
6317 reg_off += 1 << esz;
6318 mem_off += N << msz;
6319 } while (reg_off <= reg_last && (reg_off & 63));
6320 }
6321
6322 clear_helper_retaddr();
6323
6324 /*
6325 * Use the slow path to manage the cross-page misalignment.
6326 * But we know this is RAM and cannot trap.
6327 */
6328 mem_off = info.mem_off_split;
6329 if (unlikely(mem_off >= 0)) {
6330 reg_off = info.reg_off_split;
6331 for (i = 0; i < N; ++i) {
6332 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6333 addr + mem_off + (i << msz), retaddr);
6334 }
6335 }
6336
6337 mem_off = info.mem_off_first[1];
6338 if (unlikely(mem_off >= 0)) {
6339 reg_off = info.reg_off_first[1];
6340 reg_last = info.reg_off_last[1];
6341 host = info.page[1].host;
6342
6343 set_helper_retaddr(retaddr);
6344
6345 do {
6346 uint64_t pg = vg[reg_off >> 6];
6347 do {
6348 if ((pg >> (reg_off & 63)) & 1) {
6349 for (i = 0; i < N; ++i) {
6350 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6351 host + mem_off + (i << msz));
6352 }
6353 }
6354 reg_off += 1 << esz;
6355 mem_off += N << msz;
6356 } while (reg_off & 63);
6357 } while (reg_off <= reg_last);
6358
6359 clear_helper_retaddr();
6360 }
6361 }
6362
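/*
 * The MTE variant carries the MTE descriptor in the high 32 bits of desc.
 * If TBI is not enabled for this address, or the logical tag is one that
 * TCMA exempts from checking, mtedesc is zeroed so that the per-element
 * mte_check() calls are skipped entirely.
 */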
6363 static inline QEMU_ALWAYS_INLINE
6364 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6365 uint64_t desc, const uintptr_t ra,
6366 const int esz, const int msz, const int N,
6367 sve_ldst1_host_fn *host_fn,
6368 sve_ldst1_tlb_fn *tlb_fn)
6369 {
6370 uint32_t mtedesc = desc >> 32;
6371 int bit55 = extract64(addr, 55, 1);
6372
6373 /* Perform gross MTE suppression early. */
6374 if (!tbi_check(mtedesc, bit55) ||
6375 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6376 mtedesc = 0;
6377 }
6378
6379 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6380 }
6381
6382 #define DO_LD1_1(NAME, ESZ) \
6383 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
6384 target_ulong addr, uint64_t desc) \
6385 { \
6386 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
6387 sve_##NAME##_host, sve_##NAME##_tlb); \
6388 } \
6389 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
6390 target_ulong addr, uint64_t desc) \
6391 { \
6392 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
6393 sve_##NAME##_host, sve_##NAME##_tlb); \
6394 }
6395
6396 #define DO_LD1_2(NAME, ESZ, MSZ) \
6397 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
6398 target_ulong addr, uint64_t desc) \
6399 { \
6400 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6401 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6402 } \
6403 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
6404 target_ulong addr, uint64_t desc) \
6405 { \
6406 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6407 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6408 } \
6409 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6410 target_ulong addr, uint64_t desc) \
6411 { \
6412 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6413 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6414 } \
6415 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6416 target_ulong addr, uint64_t desc) \
6417 { \
6418 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6419 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6420 }
6421
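/*
 * By convention the helper names encode memory size then element size and
 * signedness: ld1bb is bytes into byte elements, ld1bhu/ld1bhs are bytes
 * zero-/sign-extended into halfword elements, ld1hsu (ESZ = MO_32,
 * MSZ = MO_16) is halfwords zero-extended into word elements, and so on.
 */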
6422 DO_LD1_1(ld1bb, MO_8)
6423 DO_LD1_1(ld1bhu, MO_16)
6424 DO_LD1_1(ld1bhs, MO_16)
6425 DO_LD1_1(ld1bsu, MO_32)
6426 DO_LD1_1(ld1bss, MO_32)
6427 DO_LD1_1(ld1bdu, MO_64)
6428 DO_LD1_1(ld1bds, MO_64)
6429
6430 DO_LD1_2(ld1hh, MO_16, MO_16)
6431 DO_LD1_2(ld1hsu, MO_32, MO_16)
6432 DO_LD1_2(ld1hss, MO_32, MO_16)
6433 DO_LD1_2(ld1hdu, MO_64, MO_16)
6434 DO_LD1_2(ld1hds, MO_64, MO_16)
6435
6436 DO_LD1_2(ld1ss, MO_32, MO_32)
6437 DO_LD1_2(ld1sdu, MO_64, MO_32)
6438 DO_LD1_2(ld1sds, MO_64, MO_32)
6439
6440 DO_LD1_2(ld1dd, MO_64, MO_64)
6441
6442 DO_LD1_2(ld1squ, MO_128, MO_32)
6443 DO_LD1_2(ld1dqu, MO_128, MO_64)
6444
6445 #undef DO_LD1_1
6446 #undef DO_LD1_2
6447
6448 #define DO_LDN_1(N) \
6449 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
6450 target_ulong addr, uint64_t desc) \
6451 { \
6452 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
6453 sve_ld1bb_host, sve_ld1bb_tlb); \
6454 } \
6455 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
6456 target_ulong addr, uint64_t desc) \
6457 { \
6458 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
6459 sve_ld1bb_host, sve_ld1bb_tlb); \
6460 }
6461
6462 #define DO_LDN_2(N, SUFF, ESZ) \
6463 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
6464 target_ulong addr, uint64_t desc) \
6465 { \
6466 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6467 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6468 } \
6469 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
6470 target_ulong addr, uint64_t desc) \
6471 { \
6472 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6473 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6474 } \
6475 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
6476 target_ulong addr, uint64_t desc) \
6477 { \
6478 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6479 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6480 } \
6481 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
6482 target_ulong addr, uint64_t desc) \
6483 { \
6484 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6485 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6486 }
6487
6488 DO_LDN_1(2)
6489 DO_LDN_1(3)
6490 DO_LDN_1(4)
6491
6492 DO_LDN_2(2, hh, MO_16)
6493 DO_LDN_2(3, hh, MO_16)
6494 DO_LDN_2(4, hh, MO_16)
6495
6496 DO_LDN_2(2, ss, MO_32)
6497 DO_LDN_2(3, ss, MO_32)
6498 DO_LDN_2(4, ss, MO_32)
6499
6500 DO_LDN_2(2, dd, MO_64)
6501 DO_LDN_2(3, dd, MO_64)
6502 DO_LDN_2(4, dd, MO_64)
6503
6504 DO_LDN_2(2, qq, MO_128)
6505 DO_LDN_2(3, qq, MO_128)
6506 DO_LDN_2(4, qq, MO_128)
6507
6508 #undef DO_LDN_1
6509 #undef DO_LDN_2
6510
6511 /*
6512 * Load contiguous data, first-fault and no-fault.
6513 *
6514 * For user-only, we control the race between page_check_range and
6515 * another thread's munmap by using set/clear_helper_retaddr. Any
6516 * SEGV that occurs between those markers is assumed to be because
6517 * the guest page vanished. Keep that block as small as possible
6518 * so that unrelated QEMU bugs are not blamed on the guest.
6519 */
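
/*
 * First-fault (LDFF1*) loads may take an exception only for the first
 * active element; a later element that would fault instead terminates
 * the load and clears the corresponding FFR bits.  No-fault (LDNF1*)
 * loads never take an exception: even a failure on the first element
 * merely clears FFR.
 */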
6520
6521 /* Fault on byte I. All bits in FFR from I are cleared. The vector
6522 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6523 * option, which leaves subsequent data unchanged.
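 *
 * FFR, like any SVE predicate, has one bit per byte of the vector, so
 * I here is a byte offset into the vector.  For example, with a
 * 1024-bit vector (oprsz == 128) and a fault at I == 20, bits [19:0]
 * of ffr[0] are preserved, bits [63:20] are cleared, and ffr[1] is
 * zeroed.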
6524 */
6525 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6526 {
6527 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6528
6529 if (i & 63) {
6530 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6531 i = ROUND_UP(i, 64);
6532 }
6533 for (; i < oprsz; i += 64) {
6534 ffr[i / 64] = 0;
6535 }
6536 }
6537
6538 /*
6539 * Common helper for all contiguous no-fault and first-fault loads.
6540 */
6541 static inline QEMU_ALWAYS_INLINE
6542 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6543 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6544 const int esz, const int msz, const SVEContFault fault,
6545 sve_ldst1_host_fn *host_fn,
6546 sve_ldst1_tlb_fn *tlb_fn)
6547 {
6548 const unsigned rd = simd_data(desc);
6549 void *vd = &env->vfp.zregs[rd];
6550 const intptr_t reg_max = simd_oprsz(desc);
6551 intptr_t reg_off, mem_off, reg_last;
6552 SVEContLdSt info;
6553 int flags;
6554 void *host;
6555
6556 /* Find the active elements. */
6557 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6558 /* The entire predicate was false; no load occurs. */
6559 memset(vd, 0, reg_max);
6560 return;
6561 }
6562 reg_off = info.reg_off_first[0];
6563
6564 /* Probe the page(s). */
6565 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6566 /* Fault on first element. */
6567 tcg_debug_assert(fault == FAULT_NO);
6568 memset(vd, 0, reg_max);
6569 goto do_fault;
6570 }
6571
6572 mem_off = info.mem_off_first[0];
6573 flags = info.page[0].flags;
6574
6575 /*
6576 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6577 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6578 */
6579 if (!info.page[0].tagged) {
6580 mtedesc = 0;
6581 }
6582
6583 if (fault == FAULT_FIRST) {
6584 /* Trapping mte check for the first-fault element. */
6585 if (mtedesc) {
6586 mte_check(env, mtedesc, addr + mem_off, retaddr);
6587 }
6588
6589 /*
6590 * Special handling of the first active element,
6591 * if it crosses a page boundary or is MMIO.
6592 */
6593 bool is_split = mem_off == info.mem_off_split;
6594 if (unlikely(flags != 0) || unlikely(is_split)) {
6595 /*
6596 * Use the slow path for cross-page handling.
6597 * Might trap for MMIO or watchpoints.
6598 */
6599 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6600
6601 /* After any fault, zero the other elements. */
6602 swap_memzero(vd, reg_off);
6603 reg_off += 1 << esz;
6604 mem_off += 1 << msz;
6605 swap_memzero(vd + reg_off, reg_max - reg_off);
6606
6607 if (is_split) {
6608 goto second_page;
6609 }
6610 } else {
6611 memset(vd, 0, reg_max);
6612 }
6613 } else {
6614 memset(vd, 0, reg_max);
6615 if (unlikely(mem_off == info.mem_off_split)) {
6616 /* The first active element crosses a page boundary. */
6617 flags |= info.page[1].flags;
6618 if (unlikely(flags & TLB_MMIO)) {
6619 /* Some page is MMIO, see below. */
6620 goto do_fault;
6621 }
6622 if (unlikely(flags & TLB_WATCHPOINT) &&
6623 (cpu_watchpoint_address_matches
6624 (env_cpu(env), addr + mem_off, 1 << msz)
6625 & BP_MEM_READ)) {
6626 /* Watchpoint hit, see below. */
6627 goto do_fault;
6628 }
6629 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6630 goto do_fault;
6631 }
6632 /*
6633 * Use the slow path for cross-page handling.
6634 * This is RAM, without a watchpoint, and will not trap.
6635 */
6636 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6637 goto second_page;
6638 }
6639 }
6640
6641 /*
6642 * From this point on, all memory operations are MemSingleNF.
6643 *
6644 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6645 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6646 *
6647 * Unfortunately we do not have access to the memory attributes from the
6648 * PTE to tell Device memory from Normal memory. So we make a mostly
6649 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6650 * This gives the right answer for the common cases of "Normal memory,
6651 * backed by host RAM" and "Device memory, backed by MMIO".
6652 * The architecture allows us to suppress an NF load and return
6653 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6654 * case of "Normal memory, backed by MMIO" is permitted. The case we
6655 * get wrong is "Device memory, backed by host RAM", for which we
6656 * should return (UNKNOWN, FAULT) but do not.
6657 *
6658 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6659 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6660 * architectural breakpoints the same.
6661 */
6662 if (unlikely(flags & TLB_MMIO)) {
6663 goto do_fault;
6664 }
6665
6666 reg_last = info.reg_off_last[0];
6667 host = info.page[0].host;
6668
6669 set_helper_retaddr(retaddr);
6670
6671 do {
6672 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6673 do {
6674 if ((pg >> (reg_off & 63)) & 1) {
6675 if (unlikely(flags & TLB_WATCHPOINT) &&
6676 (cpu_watchpoint_address_matches
6677 (env_cpu(env), addr + mem_off, 1 << msz)
6678 & BP_MEM_READ)) {
6679 clear_helper_retaddr();
6680 goto do_fault;
6681 }
6682 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6683 clear_helper_retaddr();
6684 goto do_fault;
6685 }
6686 host_fn(vd, reg_off, host + mem_off);
6687 }
6688 reg_off += 1 << esz;
6689 mem_off += 1 << msz;
6690 } while (reg_off <= reg_last && (reg_off & 63));
6691 } while (reg_off <= reg_last);
6692
6693 clear_helper_retaddr();
6694
6695 /*
6696 * MemSingleNF is allowed to fail for any reason. We have special
6697 * code above to handle the first element crossing a page boundary.
6698 * As an implementation choice, decline to handle a cross-page element
6699 * in any other position.
6700 */
6701 reg_off = info.reg_off_split;
6702 if (reg_off >= 0) {
6703 goto do_fault;
6704 }
6705
6706 second_page:
6707 reg_off = info.reg_off_first[1];
6708 if (likely(reg_off < 0)) {
6709 /* No active elements on the second page. All done. */
6710 return;
6711 }
6712
6713 /*
6714 * MemSingleNF is allowed to fail for any reason. As an implementation
6715 * choice, decline to handle elements on the second page. This should
6716 * be low frequency as the guest walks through memory -- the next
6717 * iteration of the guest's loop should be aligned on the page boundary,
6718 * and then all following iterations will stay aligned.
6719 */
6720
6721 do_fault:
6722 record_fault(env, reg_off, reg_max);
6723 }
6724
6725 static inline QEMU_ALWAYS_INLINE
6726 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6727 uint64_t desc, const uintptr_t retaddr,
6728 const int esz, const int msz, const SVEContFault fault,
6729 sve_ldst1_host_fn *host_fn,
6730 sve_ldst1_tlb_fn *tlb_fn)
6731 {
6732 uint32_t mtedesc = desc >> 32;
6733 int bit55 = extract64(addr, 55, 1);
6734
6735 /* Perform gross MTE suppression early. */
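/*
 * If TBI is disabled for this half of the address space, or TCMA makes
 * this pointer's tag unchecked, no tag check can fire for any element
 * of the access; clearing mtedesc lets the rest of the helper skip the
 * per-element MTE checks entirely.
 */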
6736 if (!tbi_check(mtedesc, bit55) ||
6737 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6738 mtedesc = 0;
6739 }
6740
6741 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6742 esz, msz, fault, host_fn, tlb_fn);
6743 }
6744
6745 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6746 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6747 target_ulong addr, uint64_t desc) \
6748 { \
6749 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6750 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6751 } \
6752 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6753 target_ulong addr, uint64_t desc) \
6754 { \
6755 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6756 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6757 } \
6758 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6759 target_ulong addr, uint64_t desc) \
6760 { \
6761 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6762 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6763 } \
6764 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6765 target_ulong addr, uint64_t desc) \
6766 { \
6767 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6768 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6769 }
6770
6771 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6772 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6773 target_ulong addr, uint64_t desc) \
6774 { \
6775 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6776 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6777 } \
6778 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6779 target_ulong addr, uint64_t desc) \
6780 { \
6781 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6782 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6783 } \
6784 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6785 target_ulong addr, uint64_t desc) \
6786 { \
6787 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6788 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6789 } \
6790 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6791 target_ulong addr, uint64_t desc) \
6792 { \
6793 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6794 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6795 } \
6796 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6797 target_ulong addr, uint64_t desc) \
6798 { \
6799 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6800 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6801 } \
6802 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6803 target_ulong addr, uint64_t desc) \
6804 { \
6805 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6806 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6807 } \
6808 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6809 target_ulong addr, uint64_t desc) \
6810 { \
6811 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6812 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6813 } \
6814 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6815 target_ulong addr, uint64_t desc) \
6816 { \
6817 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6818 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6819 }
6820
6821 DO_LDFF1_LDNF1_1(bb, MO_8)
6822 DO_LDFF1_LDNF1_1(bhu, MO_16)
6823 DO_LDFF1_LDNF1_1(bhs, MO_16)
6824 DO_LDFF1_LDNF1_1(bsu, MO_32)
6825 DO_LDFF1_LDNF1_1(bss, MO_32)
6826 DO_LDFF1_LDNF1_1(bdu, MO_64)
6827 DO_LDFF1_LDNF1_1(bds, MO_64)
6828
6829 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6830 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6831 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6832 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6833 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6834
6835 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6836 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6837 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6838
6839 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6840
6841 #undef DO_LDFF1_LDNF1_1
6842 #undef DO_LDFF1_LDNF1_2
6843
6844 /*
6845 * Common helper for all contiguous 1,2,3,4-register predicated stores.
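 *
 * The structure mirrors the contiguous loads above: find the range of
 * active elements, probe both pages (raising any faults now), check
 * watchpoints and MTE, then either store everything via the slow TLB
 * path if any page is MMIO, or store directly through the host
 * pointers, with a page-crossing element handled separately.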
6846 */
6847
6848 static inline QEMU_ALWAYS_INLINE
6849 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6850 uint32_t desc, const uintptr_t retaddr,
6851 const int esz, const int msz, const int N, uint32_t mtedesc,
6852 sve_ldst1_host_fn *host_fn,
6853 sve_ldst1_tlb_fn *tlb_fn)
6854 {
6855 const unsigned rd = simd_data(desc);
6856 const intptr_t reg_max = simd_oprsz(desc);
6857 intptr_t reg_off, reg_last, mem_off;
6858 SVEContLdSt info;
6859 void *host;
6860 int i, flags;
6861
6862 /* Find the active elements. */
6863 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6864 /* The entire predicate was false; no store occurs. */
6865 return;
6866 }
6867
6868 /* Probe the page(s). Exit with exception for any invalid page. */
6869 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6870
6871 /* Handle watchpoints for all active elements. */
6872 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6873 BP_MEM_WRITE, retaddr);
6874
6875 /*
6876 * Handle mte checks for all active elements.
6877 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6878 */
6879 if (mtedesc) {
6880 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6881 mtedesc, retaddr);
6882 }
6883
6884 flags = info.page[0].flags | info.page[1].flags;
6885 if (unlikely(flags != 0)) {
6886 /*
6887 * At least one page includes MMIO.
6888 * Any bus operation can fail with cpu_transaction_failed,
6889 * which for ARM will raise SyncExternal. We cannot avoid
6890 * this fault and will leave with the store incomplete.
6891 */
6892 mem_off = info.mem_off_first[0];
6893 reg_off = info.reg_off_first[0];
6894 reg_last = info.reg_off_last[1];
6895 if (reg_last < 0) {
6896 reg_last = info.reg_off_split;
6897 if (reg_last < 0) {
6898 reg_last = info.reg_off_last[0];
6899 }
6900 }
6901
6902 do {
6903 uint64_t pg = vg[reg_off >> 6];
6904 do {
6905 if ((pg >> (reg_off & 63)) & 1) {
6906 for (i = 0; i < N; ++i) {
6907 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6908 addr + mem_off + (i << msz), retaddr);
6909 }
6910 }
6911 reg_off += 1 << esz;
6912 mem_off += N << msz;
6913 } while (reg_off & 63);
6914 } while (reg_off <= reg_last);
6915 return;
6916 }
6917
6918 mem_off = info.mem_off_first[0];
6919 reg_off = info.reg_off_first[0];
6920 reg_last = info.reg_off_last[0];
6921 host = info.page[0].host;
6922
6923 set_helper_retaddr(retaddr);
6924
6925 while (reg_off <= reg_last) {
6926 uint64_t pg = vg[reg_off >> 6];
6927 do {
6928 if ((pg >> (reg_off & 63)) & 1) {
6929 for (i = 0; i < N; ++i) {
6930 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6931 host + mem_off + (i << msz));
6932 }
6933 }
6934 reg_off += 1 << esz;
6935 mem_off += N << msz;
6936 } while (reg_off <= reg_last && (reg_off & 63));
6937 }
6938
6939 clear_helper_retaddr();
6940
6941 /*
6942 * Use the slow path to manage the cross-page misalignment.
6943 * But we know this is RAM and cannot trap.
6944 */
6945 mem_off = info.mem_off_split;
6946 if (unlikely(mem_off >= 0)) {
6947 reg_off = info.reg_off_split;
6948 for (i = 0; i < N; ++i) {
6949 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6950 addr + mem_off + (i << msz), retaddr);
6951 }
6952 }
6953
6954 mem_off = info.mem_off_first[1];
6955 if (unlikely(mem_off >= 0)) {
6956 reg_off = info.reg_off_first[1];
6957 reg_last = info.reg_off_last[1];
6958 host = info.page[1].host;
6959
6960 set_helper_retaddr(retaddr);
6961
6962 do {
6963 uint64_t pg = vg[reg_off >> 6];
6964 do {
6965 if ((pg >> (reg_off & 63)) & 1) {
6966 for (i = 0; i < N; ++i) {
6967 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6968 host + mem_off + (i << msz));
6969 }
6970 }
6971 reg_off += 1 << esz;
6972 mem_off += N << msz;
6973 } while (reg_off & 63);
6974 } while (reg_off <= reg_last);
6975
6976 clear_helper_retaddr();
6977 }
6978 }
6979
6980 static inline QEMU_ALWAYS_INLINE
6981 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6982 uint64_t desc, const uintptr_t ra,
6983 const int esz, const int msz, const int N,
6984 sve_ldst1_host_fn *host_fn,
6985 sve_ldst1_tlb_fn *tlb_fn)
6986 {
6987 uint32_t mtedesc = desc >> 32;
6988 int bit55 = extract64(addr, 55, 1);
6989
6990 /* Perform gross MTE suppression early. */
6991 if (!tbi_check(mtedesc, bit55) ||
6992 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6993 mtedesc = 0;
6994 }
6995
6996 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6997 }
6998
6999 #define DO_STN_1(N, NAME, ESZ) \
7000 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
7001 target_ulong addr, uint64_t desc) \
7002 { \
7003 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
7004 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
7005 } \
7006 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
7007 target_ulong addr, uint64_t desc) \
7008 { \
7009 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
7010 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
7011 }
7012
7013 #define DO_STN_2(N, NAME, ESZ, MSZ) \
7014 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
7015 target_ulong addr, uint64_t desc) \
7016 { \
7017 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
7018 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
7019 } \
7020 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
7021 target_ulong addr, uint64_t desc) \
7022 { \
7023 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
7024 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
7025 } \
7026 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
7027 target_ulong addr, uint64_t desc) \
7028 { \
7029 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
7030 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
7031 } \
7032 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
7033 target_ulong addr, uint64_t desc) \
7034 { \
7035 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
7036 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
7037 }
7038
7039 DO_STN_1(1, bb, MO_8)
7040 DO_STN_1(1, bh, MO_16)
7041 DO_STN_1(1, bs, MO_32)
7042 DO_STN_1(1, bd, MO_64)
7043 DO_STN_1(2, bb, MO_8)
7044 DO_STN_1(3, bb, MO_8)
7045 DO_STN_1(4, bb, MO_8)
7046
7047 DO_STN_2(1, hh, MO_16, MO_16)
7048 DO_STN_2(1, hs, MO_32, MO_16)
7049 DO_STN_2(1, hd, MO_64, MO_16)
7050 DO_STN_2(2, hh, MO_16, MO_16)
7051 DO_STN_2(3, hh, MO_16, MO_16)
7052 DO_STN_2(4, hh, MO_16, MO_16)
7053
7054 DO_STN_2(1, ss, MO_32, MO_32)
7055 DO_STN_2(1, sd, MO_64, MO_32)
7056 DO_STN_2(2, ss, MO_32, MO_32)
7057 DO_STN_2(3, ss, MO_32, MO_32)
7058 DO_STN_2(4, ss, MO_32, MO_32)
7059
7060 DO_STN_2(1, dd, MO_64, MO_64)
7061 DO_STN_2(2, dd, MO_64, MO_64)
7062 DO_STN_2(3, dd, MO_64, MO_64)
7063 DO_STN_2(4, dd, MO_64, MO_64)
7064
7065 DO_STN_2(1, sq, MO_128, MO_32)
7066 DO_STN_2(1, dq, MO_128, MO_64)
7067
7068 DO_STN_2(2, qq, MO_128, MO_128)
7069 DO_STN_2(3, qq, MO_128, MO_128)
7070 DO_STN_2(4, qq, MO_128, MO_128)
7071
7072 #undef DO_STN_1
7073 #undef DO_STN_2
7074
7075 /*
7076 * Loads with a vector index.
7077 */
7078
7079 /*
7080 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
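 *
 * The _s forms read 32-bit offset elements and the _d forms read
 * 64-bit elements; zsu/zss select zero- vs sign-extension of a 32-bit
 * offset, while zd uses the full 64-bit value.  The gather/scatter
 * helpers then form each address as base + (offset << scale).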
7081 */
7082 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
7083
7084 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
7085 {
7086 return *(uint32_t *)(reg + H1_4(reg_ofs));
7087 }
7088
7089 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
7090 {
7091 return *(int32_t *)(reg + H1_4(reg_ofs));
7092 }
7093
7094 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
7095 {
7096 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
7097 }
7098
7099 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
7100 {
7101 return (int32_t)*(uint64_t *)(reg + reg_ofs);
7102 }
7103
7104 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
7105 {
7106 return *(uint64_t *)(reg + reg_ofs);
7107 }
7108
7109 static inline QEMU_ALWAYS_INLINE
7110 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7111 target_ulong base, uint32_t desc, uintptr_t retaddr,
7112 uint32_t mtedesc, int esize, int msize,
7113 zreg_off_fn *off_fn,
7114 sve_ldst1_host_fn *host_fn,
7115 sve_ldst1_tlb_fn *tlb_fn)
7116 {
7117 const int mmu_idx = arm_env_mmu_index(env);
7118 const intptr_t reg_max = simd_oprsz(desc);
7119 const int scale = simd_data(desc);
7120 ARMVectorReg scratch;
7121 intptr_t reg_off;
7122 SVEHostPage info, info2;
7123
7124 memset(&scratch, 0, reg_max);
7125 reg_off = 0;
7126 do {
7127 uint64_t pg = vg[reg_off >> 6];
7128 do {
7129 if (likely(pg & 1)) {
7130 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7131 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7132
7133 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
7134 mmu_idx, retaddr);
7135
7136 if (likely(in_page >= msize)) {
7137 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7138 cpu_check_watchpoint(env_cpu(env), addr, msize,
7139 info.attrs, BP_MEM_READ, retaddr);
7140 }
7141 if (mtedesc && info.tagged) {
7142 mte_check(env, mtedesc, addr, retaddr);
7143 }
7144 if (unlikely(info.flags & TLB_MMIO)) {
7145 tlb_fn(env, &scratch, reg_off, addr, retaddr);
7146 } else {
7147 set_helper_retaddr(retaddr);
7148 host_fn(&scratch, reg_off, info.host);
7149 clear_helper_retaddr();
7150 }
7151 } else {
7152 /* Element crosses the page boundary. */
7153 sve_probe_page(&info2, false, env, addr + in_page, 0,
7154 MMU_DATA_LOAD, mmu_idx, retaddr);
7155 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
7156 cpu_check_watchpoint(env_cpu(env), addr,
7157 msize, info.attrs,
7158 BP_MEM_READ, retaddr);
7159 }
7160 if (mtedesc && info.tagged) {
7161 mte_check(env, mtedesc, addr, retaddr);
7162 }
7163 tlb_fn(env, &scratch, reg_off, addr, retaddr);
7164 }
7165 }
7166 reg_off += esize;
7167 pg >>= esize;
7168 } while (reg_off & 63);
7169 } while (reg_off < reg_max);
7170
7171 /* Wait until all exceptions have been raised to write back. */
7172 memcpy(vd, &scratch, reg_max);
7173 }
7174
7175 static inline QEMU_ALWAYS_INLINE
7176 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7177 target_ulong base, uint64_t desc, uintptr_t retaddr,
7178 int esize, int msize, zreg_off_fn *off_fn,
7179 sve_ldst1_host_fn *host_fn,
7180 sve_ldst1_tlb_fn *tlb_fn)
7181 {
7182 uint32_t mtedesc = desc >> 32;
7183
7184 /*
7185 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7186 * offset base entirely over the address space hole to change the
7187 * pointer tag, or change the bit55 selector. So we could here
7188 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7189 */
7190 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7191 esize, msize, off_fn, host_fn, tlb_fn);
7192 }
7193
7194 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
7195 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7196 void *vm, target_ulong base, uint64_t desc) \
7197 { \
7198 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7199 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7200 } \
7201 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7202 void *vm, target_ulong base, uint64_t desc) \
7203 { \
7204 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7205 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7206 }
7207
7208 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
7209 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7210 void *vm, target_ulong base, uint64_t desc) \
7211 { \
7212 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7213 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7214 } \
7215 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7216 void *vm, target_ulong base, uint64_t desc) \
7217 { \
7218 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7219 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7220 }
7221
7222 #define DO_LD1_ZPZ_Q(MEM, OFS, MSZ) \
7223 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7224 void *vm, target_ulong base, uint64_t desc) \
7225 { \
7226 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 16, 1 << MSZ, \
7227 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7228 } \
7229 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7230 void *vm, target_ulong base, uint64_t desc) \
7231 { \
7232 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 16, 1 << MSZ, \
7233 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7234 }
7235
7236 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
7237 DO_LD1_ZPZ_S(bsu, zss, MO_8)
7238 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
7239 DO_LD1_ZPZ_D(bdu, zss, MO_8)
7240 DO_LD1_ZPZ_D(bdu, zd, MO_8)
7241
7242 DO_LD1_ZPZ_S(bss, zsu, MO_8)
7243 DO_LD1_ZPZ_S(bss, zss, MO_8)
7244 DO_LD1_ZPZ_D(bds, zsu, MO_8)
7245 DO_LD1_ZPZ_D(bds, zss, MO_8)
7246 DO_LD1_ZPZ_D(bds, zd, MO_8)
7247
7248 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
7249 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
7250 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
7251 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
7252 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
7253
7254 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
7255 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
7256 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
7257 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
7258 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
7259
7260 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
7261 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
7262 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
7263 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
7264 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
7265
7266 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
7267 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
7268 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
7269 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
7270 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
7271
7272 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
7273 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
7274 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
7275 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
7276 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
7277
7278 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
7279 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
7280 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
7281 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
7282 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
7283
7284 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
7285 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
7286 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
7287
7288 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
7289 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
7290 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
7291
7292 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
7293 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
7294 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
7295
7296 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
7297 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
7298 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
7299
7300 DO_LD1_ZPZ_Q(qq_le, zd, MO_128)
7301 DO_LD1_ZPZ_Q(qq_be, zd, MO_128)
7302
7303 #undef DO_LD1_ZPZ_S
7304 #undef DO_LD1_ZPZ_D
7305
7306 /* First fault loads with a vector index. */
7307
7308 /*
7309 * Common helpers for all gather first-faulting loads.
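 *
 * The first active element is loaded with faults permitted; the
 * remaining elements are probed without faulting, and the first one
 * that cannot be loaded (invalid page, MMIO, watchpoint hit, or MTE
 * failure) records its position in FFR via record_fault() and stops
 * the load.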
7310 */
7311
7312 static inline QEMU_ALWAYS_INLINE
7313 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7314 target_ulong base, uint32_t desc, uintptr_t retaddr,
7315 uint32_t mtedesc, const int esz, const int msz,
7316 zreg_off_fn *off_fn,
7317 sve_ldst1_host_fn *host_fn,
7318 sve_ldst1_tlb_fn *tlb_fn)
7319 {
7320 const int mmu_idx = arm_env_mmu_index(env);
7321 const intptr_t reg_max = simd_oprsz(desc);
7322 const int scale = simd_data(desc);
7323 const int esize = 1 << esz;
7324 const int msize = 1 << msz;
7325 intptr_t reg_off;
7326 SVEHostPage info;
7327 target_ulong addr, in_page;
7328 ARMVectorReg scratch;
7329
7330 /* Skip to the first true predicate. */
7331 reg_off = find_next_active(vg, 0, reg_max, esz);
7332 if (unlikely(reg_off >= reg_max)) {
7333 /* The entire predicate was false; no load occurs. */
7334 memset(vd, 0, reg_max);
7335 return;
7336 }
7337
7338 /* Protect against overlap between vd and vm. */
7339 if (unlikely(vd == vm)) {
7340 vm = memcpy(&scratch, vm, reg_max);
7341 }
7342
7343 /*
7344 * Probe the first element, allowing faults.
7345 */
7346 addr = base + (off_fn(vm, reg_off) << scale);
7347 if (mtedesc) {
7348 mte_check(env, mtedesc, addr, retaddr);
7349 }
7350 tlb_fn(env, vd, reg_off, addr, retaddr);
7351
7352 /* After any fault, zero the other elements. */
7353 swap_memzero(vd, reg_off);
7354 reg_off += esize;
7355 swap_memzero(vd + reg_off, reg_max - reg_off);
7356
7357 /*
7358 * Probe the remaining elements, not allowing faults.
7359 */
7360 while (reg_off < reg_max) {
7361 uint64_t pg = vg[reg_off >> 6];
7362 do {
7363 if (likely((pg >> (reg_off & 63)) & 1)) {
7364 addr = base + (off_fn(vm, reg_off) << scale);
7365 in_page = -(addr | TARGET_PAGE_MASK);
7366
7367 if (unlikely(in_page < msize)) {
7368 /* Stop if the element crosses a page boundary. */
7369 goto fault;
7370 }
7371
7372 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
7373 mmu_idx, retaddr);
7374 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
7375 goto fault;
7376 }
7377 if (unlikely(info.flags & TLB_WATCHPOINT) &&
7378 (cpu_watchpoint_address_matches
7379 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
7380 goto fault;
7381 }
7382 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
7383 goto fault;
7384 }
7385
7386 set_helper_retaddr(retaddr);
7387 host_fn(vd, reg_off, info.host);
7388 clear_helper_retaddr();
7389 }
7390 reg_off += esize;
7391 } while (reg_off & 63);
7392 }
7393 return;
7394
7395 fault:
7396 record_fault(env, reg_off, reg_max);
7397 }
7398
7399 static inline QEMU_ALWAYS_INLINE
7400 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7401 target_ulong base, uint64_t desc, uintptr_t retaddr,
7402 const int esz, const int msz,
7403 zreg_off_fn *off_fn,
7404 sve_ldst1_host_fn *host_fn,
7405 sve_ldst1_tlb_fn *tlb_fn)
7406 {
7407 uint32_t mtedesc = desc >> 32;
7408
7409 /*
7410 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7411 * offset base entirely over the address space hole to change the
7412 * pointer tag, or change the bit55 selector. So we could here
7413 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7414 */
7415 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7416 esz, msz, off_fn, host_fn, tlb_fn);
7417 }
7418
7419 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
7420 void HELPER(sve_ldff##MEM##_##OFS) \
7421 (CPUARMState *env, void *vd, void *vg, \
7422 void *vm, target_ulong base, uint64_t desc) \
7423 { \
7424 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
7425 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7426 } \
7427 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7428 (CPUARMState *env, void *vd, void *vg, \
7429 void *vm, target_ulong base, uint64_t desc) \
7430 { \
7431 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
7432 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7433 }
7434
7435 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
7436 void HELPER(sve_ldff##MEM##_##OFS) \
7437 (CPUARMState *env, void *vd, void *vg, \
7438 void *vm, target_ulong base, uint64_t desc) \
7439 { \
7440 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
7441 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7442 } \
7443 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7444 (CPUARMState *env, void *vd, void *vg, \
7445 void *vm, target_ulong base, uint64_t desc) \
7446 { \
7447 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
7448 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7449 }
7450
7451 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7452 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7453 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7454 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7455 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7456
7457 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7458 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7459 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7460 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7461 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7462
7463 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7464 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7465 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7466 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7467 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7468
7469 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7470 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7471 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7472 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7473 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7474
7475 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7476 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7477 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7478 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7479 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7480
7481 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7482 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7483 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7484 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7485 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7486
7487 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
7488 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
7489 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7490 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7491 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7492
7493 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
7494 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
7495 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7496 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7497 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7498
7499 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7500 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7501 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7502
7503 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7504 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7505 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7506
7507 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7508 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7509 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7510
7511 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7512 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7513 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7514
7515 /* Stores with a vector index. */
7516
7517 static inline QEMU_ALWAYS_INLINE
7518 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7519 target_ulong base, uint32_t desc, uintptr_t retaddr,
7520 uint32_t mtedesc, int esize, int msize,
7521 zreg_off_fn *off_fn,
7522 sve_ldst1_host_fn *host_fn,
7523 sve_ldst1_tlb_fn *tlb_fn)
7524 {
7525 const int mmu_idx = arm_env_mmu_index(env);
7526 const intptr_t reg_max = simd_oprsz(desc);
7527 const int scale = simd_data(desc);
7528 void *host[ARM_MAX_VQ * 4];
7529 intptr_t reg_off, i;
7530 SVEHostPage info, info2;
7531
7532 /*
7533 * Probe all of the elements for host addresses and flags.
7534 */
7535 i = reg_off = 0;
7536 do {
7537 uint64_t pg = vg[reg_off >> 6];
7538 do {
7539 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7540 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7541
7542 host[i] = NULL;
7543 if (likely((pg >> (reg_off & 63)) & 1)) {
7544 if (likely(in_page >= msize)) {
7545 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7546 mmu_idx, retaddr);
7547 if (!(info.flags & TLB_MMIO)) {
7548 host[i] = info.host;
7549 }
7550 } else {
7551 /*
7552 * Element crosses the page boundary.
7553 * Probe both pages, but do not record the host address,
7554 * so that we use the slow path.
7555 */
7556 sve_probe_page(&info, false, env, addr, 0,
7557 MMU_DATA_STORE, mmu_idx, retaddr);
7558 sve_probe_page(&info2, false, env, addr + in_page, 0,
7559 MMU_DATA_STORE, mmu_idx, retaddr);
7560 info.flags |= info2.flags;
7561 }
7562
7563 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7564 cpu_check_watchpoint(env_cpu(env), addr, msize,
7565 info.attrs, BP_MEM_WRITE, retaddr);
7566 }
7567
7568 if (mtedesc && info.tagged) {
7569 mte_check(env, mtedesc, addr, retaddr);
7570 }
7571 }
7572 i += 1;
7573 reg_off += esize;
7574 } while (reg_off & 63);
7575 } while (reg_off < reg_max);
7576
7577 /*
7578 * Now that we have recognized all exceptions except SyncExternal
7579 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7580 *
7581 * Note for the common case of an element in RAM, not crossing a page
7582 * boundary, we have stored the host address in host[]. This doubles
7583 * as a first-level check against the predicate, since only enabled
7584 * elements have non-null host addresses.
7585 */
7586 i = reg_off = 0;
7587 do {
7588 void *h = host[i];
7589 if (likely(h != NULL)) {
7590 set_helper_retaddr(retaddr);
7591 host_fn(vd, reg_off, h);
7592 clear_helper_retaddr();
7593 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7594 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7595 tlb_fn(env, vd, reg_off, addr, retaddr);
7596 }
7597 i += 1;
7598 reg_off += esize;
7599 } while (reg_off < reg_max);
7600 }
7601
7602 static inline QEMU_ALWAYS_INLINE
7603 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7604 target_ulong base, uint64_t desc, uintptr_t retaddr,
7605 int esize, int msize, zreg_off_fn *off_fn,
7606 sve_ldst1_host_fn *host_fn,
7607 sve_ldst1_tlb_fn *tlb_fn)
7608 {
7609 uint32_t mtedesc = desc >> 32;
7610
7611 /*
7612 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7613 * offset base entirely over the address space hole to change the
7614 * pointer tag, or change the bit55 selector. So we could here
7615 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7616 */
7617 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7618 esize, msize, off_fn, host_fn, tlb_fn);
7619 }
7620
7621 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
7622 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7623 void *vm, target_ulong base, uint64_t desc) \
7624 { \
7625 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7626 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7627 } \
7628 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7629 void *vm, target_ulong base, uint64_t desc) \
7630 { \
7631 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7632 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7633 }
7634
7635 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
7636 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7637 void *vm, target_ulong base, uint64_t desc) \
7638 { \
7639 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7640 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7641 } \
7642 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7643 void *vm, target_ulong base, uint64_t desc) \
7644 { \
7645 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7646 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7647 }
7648
7649 #define DO_ST1_ZPZ_Q(MEM, OFS, MSZ) \
7650 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7651 void *vm, target_ulong base, uint64_t desc) \
7652 { \
7653 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 16, 1 << MSZ, \
7654 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7655 } \
7656 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7657 void *vm, target_ulong base, uint64_t desc) \
7658 { \
7659 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 16, 1 << MSZ, \
7660 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7661 }
7662
7663 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7664 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7665 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7666 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7667 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7668
7669 DO_ST1_ZPZ_S(bs, zss, MO_8)
7670 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7671 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7672 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7673 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7674
7675 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7676 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7677 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7678 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7679 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7680 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7681 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7682
7683 DO_ST1_ZPZ_D(bd, zss, MO_8)
7684 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7685 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7686 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7687 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7688 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7689 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7690
7691 DO_ST1_ZPZ_D(bd, zd, MO_8)
7692 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7693 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7694 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7695 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7696 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7697 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7698
7699 DO_ST1_ZPZ_Q(qq_le, zd, MO_128)
7700 DO_ST1_ZPZ_Q(qq_be, zd, MO_128)
7701
7702 #undef DO_ST1_ZPZ_S
7703 #undef DO_ST1_ZPZ_D
7704
7705 /*
7706 * SVE2.1 consecutive register load/store
7707 */
7708
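/*
 * A sketch of the predicate-as-counter encoding as used below: png is
 * decoded by decode_counter() into an element count, an invert flag
 * and a register stride.  In byte terms, either the first b_count
 * bytes of the N-register group are active (invert clear) or
 * everything from b_count onward is active (invert set), and b_stride
 * is the distance between consecutive active elements.
 */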
7709 static unsigned sve2p1_cont_ldst_elements(SVEContLdSt *info, vaddr addr,
7710 uint32_t png, intptr_t reg_max,
7711 int N, int v_esz)
7712 {
7713 const int esize = 1 << v_esz;
7714 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
7715 DecodeCounter p = decode_counter(png, reg_max, v_esz);
7716 unsigned b_count = p.count << v_esz;
7717 unsigned b_stride = 1 << (v_esz + p.lg2_stride);
7718 intptr_t page_split;
7719
7720 /* Set all of the element indices to -1, and the TLB data to 0. */
7721 memset(info, -1, offsetof(SVEContLdSt, page));
7722 memset(info->page, 0, sizeof(info->page));
7723
7724 if (p.invert) {
7725 if (b_count >= reg_max * N) {
7726 return 0;
7727 }
7728 reg_off_first = b_count;
7729 reg_off_last = reg_max * N - b_stride;
7730 } else {
7731 if (b_count == 0) {
7732 return 0;
7733 }
7734 reg_off_first = 0;
7735 reg_off_last = MIN(b_count - esize, reg_max * N - b_stride);
7736 }
7737
7738 info->reg_off_first[0] = reg_off_first;
7739 info->mem_off_first[0] = reg_off_first;
7740
7741 page_split = -(addr | TARGET_PAGE_MASK);
7742 if (reg_off_last + esize <= page_split || reg_off_first >= page_split) {
7743 /* The entire operation fits within a single page. */
7744 info->reg_off_last[0] = reg_off_last;
7745 return b_stride;
7746 }
7747
7748 info->page_split = page_split;
7749 reg_off_split = ROUND_DOWN(page_split, esize);
7750
7751 /*
7752 * This is the last full element on the first page, but it is not
7753 * necessarily active. If there is no full element, i.e. the first
7754 * active element is the one that's split, this value remains -1.
7755 * It is useful as iteration bounds.
7756 */
7757 if (reg_off_split != 0) {
7758 info->reg_off_last[0] = ROUND_DOWN(reg_off_split - esize, b_stride);
7759 }
7760
7761 /* Determine if an unaligned element spans the pages. */
7762 if (page_split & (esize - 1)) {
7763 /* It is helpful to know if the split element is active. */
7764 if ((reg_off_split & (b_stride - 1)) == 0) {
7765 info->reg_off_split = reg_off_split;
7766 info->mem_off_split = reg_off_split;
7767 }
7768 reg_off_split += esize;
7769 }
7770
7771 /*
7772 * We do want the first active element on the second page, because
7773 * this may affect the address reported in an exception.
7774 */
7775 reg_off_split = ROUND_UP(reg_off_split, b_stride);
7776 if (reg_off_split <= reg_off_last) {
7777 info->reg_off_first[1] = reg_off_split;
7778 info->mem_off_first[1] = reg_off_split;
7779 info->reg_off_last[1] = reg_off_last;
7780 }
7781 return b_stride;
7782 }
7783
7784 static void sve2p1_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
7785 target_ulong addr, unsigned estride,
7786 int esize, int wp_access, uintptr_t ra)
7787 {
7788 #ifndef CONFIG_USER_ONLY
7789 intptr_t count_off, count_last;
7790 int flags0 = info->page[0].flags;
7791 int flags1 = info->page[1].flags;
7792
7793 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
7794 return;
7795 }
7796
7797 /* Indicate that watchpoints are handled. */
7798 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
7799 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
7800
7801 if (flags0 & TLB_WATCHPOINT) {
7802 count_off = info->reg_off_first[0];
7803 count_last = info->reg_off_split;
7804 if (count_last < 0) {
7805 count_last = info->reg_off_last[0];
7806 }
7807 do {
7808 cpu_check_watchpoint(env_cpu(env), addr + count_off,
7809 esize, info->page[0].attrs, wp_access, ra);
7810 count_off += estride;
7811 } while (count_off <= count_last);
7812 }
7813
7814 count_off = info->reg_off_first[1];
7815 if ((flags1 & TLB_WATCHPOINT) && count_off >= 0) {
7816 count_last = info->reg_off_last[1];
7817 do {
7818 cpu_check_watchpoint(env_cpu(env), addr + count_off,
7819 esize, info->page[1].attrs,
7820 wp_access, ra);
7821 count_off += estride;
7822 } while (count_off <= count_last);
7823 }
7824 #endif
7825 }
7826
7827 static void sve2p1_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
7828 target_ulong addr, unsigned estride,
7829 int esize, uint32_t mtedesc,
7830 uintptr_t ra)
7831 {
7832 intptr_t count_off, count_last;
7833
7834 /*
7835 * TODO: estride is always a small power of two, <= 8.
7836 * Manipulate the stride within the loops such that
7837 * - first iteration hits addr + off, as required,
7838 * - second iteration hits ALIGN_UP(addr, 16),
7839 * - other iterations advance addr by 16.
7840 * This will minimize the probing to once per MTE granule.
7841 */
7842
7843 /* Process the page only if MemAttr == Tagged. */
7844 if (info->page[0].tagged) {
7845 count_off = info->reg_off_first[0];
7846 count_last = info->reg_off_split;
7847 if (count_last < 0) {
7848 count_last = info->reg_off_last[0];
7849 }
7850
7851 do {
7852 mte_check(env, mtedesc, addr + count_off, ra);
7853 count_off += estride;
7854 } while (count_off <= count_last);
7855 }
7856
7857 count_off = info->reg_off_first[1];
7858 if (count_off >= 0 && info->page[1].tagged) {
7859 count_last = info->reg_off_last[1];
7860 do {
7861 mte_check(env, mtedesc, addr + count_off, ra);
7862 count_off += estride;
7863 } while (count_off <= count_last);
7864 }
7865 }
7866
7867 static inline QEMU_ALWAYS_INLINE
7868 void sve2p1_ld1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr,
7869 uint32_t png, uint64_t desc64,
7870 const uintptr_t ra, const MemOp esz,
7871 sve_ldst1_host_fn *host_fn,
7872 sve_ldst1_tlb_fn *tlb_fn)
7873 {
7874 uint32_t mtedesc = desc64 >> 32;
7875 uint32_t desc = desc64;
7876 const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2;
7877 const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4);
7878 const intptr_t reg_max = simd_oprsz(desc);
7879 const unsigned esize = 1 << esz;
7880 intptr_t count_off, count_last;
7881 intptr_t reg_off, reg_last, reg_n;
7882 SVEContLdSt info;
7883 unsigned estride, flags;
7884 void *host;
7885
7886 estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz);
7887 if (estride == 0) {
7888 /* The entire predicate was false; no load occurs. */
7889 for (unsigned n = 0; n < N; n++) {
7890 memset(zd + n * rstride, 0, reg_max);
7891 }
7892 return;
7893 }
7894
7895 /* Probe the page(s). Exit with exception for any invalid page. */
7896 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
7897
7898 /* Handle watchpoints for all active elements. */
7899 sve2p1_cont_ldst_watchpoints(&info, env, addr, estride,
7900 esize, BP_MEM_READ, ra);
7901
7902 /*
7903 * Handle mte checks for all active elements.
7904 * Since TBI must be set for MTE, !mtedesc => !mte_active.
7905 */
7906 if (mtedesc) {
7907 sve2p1_cont_ldst_mte_check(&info, env, addr, estride,
7908 esize, mtedesc, ra);
7909 }
7910
7911 flags = info.page[0].flags | info.page[1].flags;
7912 if (unlikely(flags != 0)) {
7913 /*
7914 * At least one page includes MMIO.
7915 * Any bus operation can fail with cpu_transaction_failed,
7916 * which for ARM will raise SyncExternal. Perform the load
7917 * into scratch memory to preserve register state until the end.
7918 */
7919 ARMVectorReg scratch[4] = { };
7920
7921 count_off = info.reg_off_first[0];
7922 count_last = info.reg_off_last[1];
7923 if (count_last < 0) {
7924 count_last = info.reg_off_split;
7925 if (count_last < 0) {
7926 count_last = info.reg_off_last[0];
7927 }
7928 }
7929 reg_off = count_off % reg_max;
7930 reg_n = count_off / reg_max;
7931
7932 do {
7933 reg_last = MIN(count_last - count_off, reg_max - esize);
7934 do {
7935 tlb_fn(env, &scratch[reg_n], reg_off, addr + count_off, ra);
7936 reg_off += estride;
7937 count_off += estride;
7938 } while (reg_off <= reg_last);
7939 reg_off = 0;
7940 reg_n++;
7941 } while (count_off <= count_last);
7942
7943 for (unsigned n = 0; n < N; ++n) {
7944 memcpy(&zd[n * rstride], &scratch[n], reg_max);
7945 }
7946 return;
7947 }
7948
7949 /* The entire operation is in RAM, on valid pages. */
7950
7951 for (unsigned n = 0; n < N; ++n) {
7952 memset(&zd[n * rstride], 0, reg_max);
7953 }
7954
7955 count_off = info.reg_off_first[0];
7956 count_last = info.reg_off_last[0];
7957 reg_off = count_off % reg_max;
7958 reg_n = count_off / reg_max;
7959 host = info.page[0].host;
7960
7961 set_helper_retaddr(ra);
7962
7963 do {
7964 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
7965 do {
7966 host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
7967 reg_off += estride;
7968 count_off += estride;
7969 } while (reg_off <= reg_last);
7970 reg_off = 0;
7971 reg_n++;
7972 } while (count_off <= count_last);
7973
7974 clear_helper_retaddr();
7975
7976 /*
7977 * Use the slow path to manage the cross-page misalignment.
7978 * But we know this is RAM and cannot trap.
7979 */
7980 count_off = info.reg_off_split;
7981 if (unlikely(count_off >= 0)) {
7982 reg_off = count_off % reg_max;
7983 reg_n = count_off / reg_max;
7984 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
7985 }
7986
7987 count_off = info.reg_off_first[1];
7988 if (unlikely(count_off >= 0)) {
7989 count_last = info.reg_off_last[1];
7990 reg_off = count_off % reg_max;
7991 reg_n = count_off / reg_max;
7992 host = info.page[1].host;
7993
7994 set_helper_retaddr(ra);
7995
7996 do {
7997 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
7998 do {
7999 host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
8000 reg_off += estride;
8001 count_off += estride;
8002 } while (reg_off <= reg_last);
8003 reg_off = 0;
8004 reg_n++;
8005 } while (count_off <= count_last);
8006
8007 clear_helper_retaddr();
8008 }
8009 }
8010
8011 void HELPER(sve2p1_ld1bb_c)(CPUARMState *env, void *vd, target_ulong addr,
8012 uint32_t png, uint64_t desc)
8013 {
8014 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), MO_8,
8015 sve_ld1bb_host, sve_ld1bb_tlb);
8016 }
8017
8018 #define DO_LD1_2(NAME, ESZ) \
8019 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \
8020 target_ulong addr, uint32_t png, \
8021 uint64_t desc) \
8022 { \
8023 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
8024 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
8025 } \
8026 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \
8027 target_ulong addr, uint32_t png, \
8028 uint64_t desc) \
8029 { \
8030 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
8031 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
8032 }
8033
8034 DO_LD1_2(ld1hh, MO_16)
8035 DO_LD1_2(ld1ss, MO_32)
8036 DO_LD1_2(ld1dd, MO_64)
8037
8038 #undef DO_LD1_2
8039
8040 static inline QEMU_ALWAYS_INLINE
8041 void sve2p1_st1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr,
8042 uint32_t png, uint64_t desc64,
8043 const uintptr_t ra, const int esz,
8044 sve_ldst1_host_fn *host_fn,
8045 sve_ldst1_tlb_fn *tlb_fn)
8046 {
8047 uint32_t mtedesc = desc64 >> 32;
8048 uint32_t desc = desc64;
8049 const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2;
8050 const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4);
8051 const intptr_t reg_max = simd_oprsz(desc);
8052 const unsigned esize = 1 << esz;
8053 intptr_t count_off, count_last;
8054 intptr_t reg_off, reg_last, reg_n;
8055 SVEContLdSt info;
8056 unsigned estride, flags;
8057 void *host;
8058
8059 estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz);
8060 if (estride == 0) {
8061 /* The entire predicate was false; no store occurs. */
8062 return;
8063 }
8064
8065 /* Probe the page(s). Exit with exception for any invalid page. */
8066 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
8067
8068 /* Handle watchpoints for all active elements. */
8069 sve2p1_cont_ldst_watchpoints(&info, env, addr, estride,
8070 esize, BP_MEM_WRITE, ra);
8071
8072 /*
8073 * Handle mte checks for all active elements.
8074 * Since TBI must be set for MTE, !mtedesc => !mte_active.
8075 */
8076 if (mtedesc) {
8077 sve2p1_cont_ldst_mte_check(&info, env, addr, estride,
8078 esize, mtedesc, ra);
8079 }
8080
8081 flags = info.page[0].flags | info.page[1].flags;
8082 if (unlikely(flags != 0)) {
8083 /*
8084 * At least one page includes MMIO.
8085 * Any bus operation can fail with cpu_transaction_failed,
8086 * which for ARM will raise SyncExternal. We cannot avoid this
8087 * fault and will leave with the store incomplete.
8088 */
8089 count_off = info.reg_off_first[0];
8090 count_last = info.reg_off_last[1];
8091 if (count_last < 0) {
8092 count_last = info.reg_off_split;
8093 if (count_last < 0) {
8094 count_last = info.reg_off_last[0];
8095 }
8096 }
8097 reg_off = count_off % reg_max;
8098 reg_n = count_off / reg_max;
8099
8100 do {
8101 reg_last = MIN(count_last - count_off, reg_max - esize);
8102 do {
8103 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
8104 reg_off += estride;
8105 count_off += estride;
8106 } while (reg_off <= reg_last);
8107 reg_off = 0;
8108 reg_n++;
8109 } while (count_off <= count_last);
8110 return;
8111 }
8112
8113 /* The entire operation is in RAM, on valid pages. */
8114
8115 count_off = info.reg_off_first[0];
8116 count_last = info.reg_off_last[0];
8117 reg_off = count_off % reg_max;
8118 reg_n = count_off / reg_max;
8119 host = info.page[0].host;
8120
8121 set_helper_retaddr(ra);
8122
8123 do {
8124 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
8125 do {
8126 host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
8127 reg_off += estride;
8128 count_off += estride;
8129 } while (reg_off <= reg_last);
8130 reg_off = 0;
8131 reg_n++;
8132 } while (count_off <= count_last);
8133
8134 clear_helper_retaddr();
8135
8136 /*
8137 * Use the slow path to manage the cross-page misalignment.
8138 * But we know this is RAM and cannot trap.
8139 */
8140 count_off = info.reg_off_split;
8141 if (unlikely(count_off >= 0)) {
8142 reg_off = count_off % reg_max;
8143 reg_n = count_off / reg_max;
8144 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
8145 }
8146
8147 count_off = info.reg_off_first[1];
8148 if (unlikely(count_off >= 0)) {
8149 count_last = info.reg_off_last[1];
8150 reg_off = count_off % reg_max;
8151 reg_n = count_off / reg_max;
8152 host = info.page[1].host;
8153
8154 set_helper_retaddr(ra);
8155
8156 do {
8157 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
8158 do {
8159 host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
8160 reg_off += estride;
8161 count_off += estride;
8162 } while (reg_off <= reg_last);
8163 reg_off = 0;
8164 reg_n++;
8165 } while (count_off <= count_last);
8166
8167 clear_helper_retaddr();
8168 }
8169 }
8170
8171 void HELPER(sve2p1_st1bb_c)(CPUARMState *env, void *vd, target_ulong addr,
8172 uint32_t png, uint64_t desc)
8173 {
8174 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), MO_8,
8175 sve_st1bb_host, sve_st1bb_tlb);
8176 }
8177
8178 #define DO_ST1_2(NAME, ESZ) \
8179 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \
8180 target_ulong addr, uint32_t png, \
8181 uint64_t desc) \
8182 { \
8183 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
8184 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
8185 } \
8186 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \
8187 target_ulong addr, uint32_t png, \
8188 uint64_t desc) \
8189 { \
8190 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
8191 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
8192 }
8193
8194 DO_ST1_2(st1hh, MO_16)
8195 DO_ST1_2(st1ss, MO_32)
8196 DO_ST1_2(st1dd, MO_64)
8197
8198 #undef DO_ST1_2
8199
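/*
 * SVE2 bitwise ternary operations.  Each helper works on whole uint64_t
 * words; the element size is irrelevant for pure logical operations.
 *   EOR3:  d = n ^ m ^ k
 *   BCAX:  d = n ^ (m & ~k)
 */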
8200 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8201 {
8202 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8203 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8204
8205 for (i = 0; i < opr_sz; ++i) {
8206 d[i] = n[i] ^ m[i] ^ k[i];
8207 }
8208 }
8209
8210 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8211 {
8212 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8213 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8214
8215 for (i = 0; i < opr_sz; ++i) {
8216 d[i] = n[i] ^ (m[i] & ~k[i]);
8217 }
8218 }
8219
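/*
 * Bit-select family: k selects between the other two operands bit by bit.
 * Plain BSL is (n & k) | (m & ~k); BSL1N inverts the first operand,
 * BSL2N inverts the second, and NBSL inverts the result.
 */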
8220 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8221 {
8222 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8223 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8224
8225 for (i = 0; i < opr_sz; ++i) {
8226 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
8227 }
8228 }
8229
8230 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8231 {
8232 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8233 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8234
8235 for (i = 0; i < opr_sz; ++i) {
8236 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
8237 }
8238 }
8239
8240 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8241 {
8242 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8243 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8244
8245 for (i = 0; i < opr_sz; ++i) {
8246 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
8247 }
8248 }
8249
8250 /*
8251 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
8252 * See hasless(v,1) from
8253 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
8254 */
8255 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
8256 {
8257 int bits = 8 << esz;
8258 uint64_t ones = dup_const(esz, 1);
8259 uint64_t signs = ones << (bits - 1);
8260 uint64_t cmp0, cmp1;
8261
8262 cmp1 = dup_const(esz, n);
8263 cmp0 = cmp1 ^ m0;
8264 cmp1 = cmp1 ^ m1;
8265 cmp0 = (cmp0 - ones) & ~cmp0;
8266 cmp1 = (cmp1 - ones) & ~cmp1;
8267 return (cmp0 | cmp1) & signs;
8268 }
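/*
 * Worked example of the zero-in-word trick above, for illustration: with
 * esz == MO_8, ones == 0x0101...01 and signs == 0x8080...80.  If some byte
 * of m0 equals n, the corresponding byte of cmp0 = dup(n) ^ m0 is zero, so
 * ((cmp0 - ones) & ~cmp0) sets that byte's sign bit; for non-zero bytes the
 * sign bit is cleared.  Hence (cmp0 | cmp1) & signs is nonzero exactly when
 * a match exists in either m0 or m1.
 */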
8269
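/*
 * MATCH/NMATCH: for each active element of Zn, test whether its value
 * occurs anywhere within the corresponding 16-byte segment of Zm
 * (m0/m1 below), set the predicate result accordingly, and accumulate
 * NZCV flags as for a predicate test.
 */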
8270 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
8271 uint32_t desc, int esz, bool nmatch)
8272 {
8273 uint16_t esz_mask = pred_esz_masks[esz];
8274 intptr_t opr_sz = simd_oprsz(desc);
8275 uint32_t flags = PREDTEST_INIT;
8276 intptr_t i, j, k;
8277
8278 for (i = 0; i < opr_sz; i += 16) {
8279 uint64_t m0 = *(uint64_t *)(vm + i);
8280 uint64_t m1 = *(uint64_t *)(vm + i + 8);
8281 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
8282 uint16_t out = 0;
8283
8284 for (j = 0; j < 16; j += 8) {
8285 uint64_t n = *(uint64_t *)(vn + i + j);
8286
8287 for (k = 0; k < 8; k += 1 << esz) {
8288 if (pg & (1 << (j + k))) {
8289 bool o = do_match2(n >> (k * 8), m0, m1, esz);
8290 out |= (o ^ nmatch) << (j + k);
8291 }
8292 }
8293 }
8294 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
8295 flags = iter_predtest_fwd(out, pg, flags);
8296 }
8297 return flags;
8298 }
8299
8300 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \
8301 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
8302 { \
8303 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
8304 }
8305
8306 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
8307 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
8308
8309 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
8310 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
8311
8312 #undef DO_PPZZ_MATCH
8313
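/*
 * HISTCNT: for each active element i, count how many active elements at
 * indices j <= i have m[j] == n[i]; inactive elements produce 0.  A
 * scratch copy avoids clobbering the inputs when d aliases them.
 */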
8314 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
8315 uint32_t desc)
8316 {
8317 ARMVectorReg scratch;
8318 intptr_t i, j;
8319 intptr_t opr_sz = simd_oprsz(desc);
8320 uint32_t *d = vd, *n = vn, *m = vm;
8321 uint8_t *pg = vg;
8322
8323 if (d == n) {
8324 n = memcpy(&scratch, n, opr_sz);
8325 if (d == m) {
8326 m = n;
8327 }
8328 } else if (d == m) {
8329 m = memcpy(&scratch, m, opr_sz);
8330 }
8331
8332 for (i = 0; i < opr_sz; i += 4) {
8333 uint64_t count = 0;
8334 uint8_t pred;
8335
8336 pred = pg[H1(i >> 3)] >> (i & 7);
8337 if (pred & 1) {
8338 uint32_t nn = n[H4(i >> 2)];
8339
8340 for (j = 0; j <= i; j += 4) {
8341 pred = pg[H1(j >> 3)] >> (j & 7);
8342 if ((pred & 1) && nn == m[H4(j >> 2)]) {
8343 ++count;
8344 }
8345 }
8346 }
8347 d[H4(i >> 2)] = count;
8348 }
8349 }
8350
8351 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
8352 uint32_t desc)
8353 {
8354 ARMVectorReg scratch;
8355 intptr_t i, j;
8356 intptr_t opr_sz = simd_oprsz(desc);
8357 uint64_t *d = vd, *n = vn, *m = vm;
8358 uint8_t *pg = vg;
8359
8360 if (d == n) {
8361 n = memcpy(&scratch, n, opr_sz);
8362 if (d == m) {
8363 m = n;
8364 }
8365 } else if (d == m) {
8366 m = memcpy(&scratch, m, opr_sz);
8367 }
8368
8369 for (i = 0; i < opr_sz / 8; ++i) {
8370 uint64_t count = 0;
8371 if (pg[H1(i)] & 1) {
8372 uint64_t nn = n[i];
8373 for (j = 0; j <= i; ++j) {
8374 if ((pg[H1(j)] & 1) && nn == m[j]) {
8375 ++count;
8376 }
8377 }
8378 }
8379 d[i] = count;
8380 }
8381 }
8382
8383 /*
8384 * Returns the number of bytes in m0 and m1 that match n.
8385 * Unlike do_match2 we don't just need true/false, we need an exact count.
8386 * This requires two extra logical operations.
8387 */
8388 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
8389 {
8390 const uint64_t mask = dup_const(MO_8, 0x7f);
8391 uint64_t cmp0, cmp1;
8392
8393 cmp1 = dup_const(MO_8, n);
8394 cmp0 = cmp1 ^ m0;
8395 cmp1 = cmp1 ^ m1;
8396
8397 /*
8398 * 1: clear msb of each byte to avoid carry to next byte (& mask)
8399 * 2: carry in to msb if byte != 0 (+ mask)
8400 * 3: set msb if cmp has msb set (| cmp)
8401 * 4: set ~msb to ignore them (| mask)
8402 * We now have 0xff for byte != 0 or 0x7f for byte == 0.
8403 * 5: invert, resulting in 0x80 if and only if byte == 0.
8404 */
8405 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
8406 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
8407
8408 /*
8409 * Combine the two compares in a way that the bits do
8410 * not overlap, and so preserves the count of set bits.
8411 * If the host has an efficient instruction for ctpop,
8412 * then ctpop(x) + ctpop(y) has the same number of
8413 * operations as ctpop(x | (y >> 1)). If the host does
8414 * not have an efficient ctpop, then we only want to
8415 * use it once.
8416 */
8417 return ctpop64(cmp0 | (cmp1 >> 1));
8418 }
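/*
 * Worked example for a single byte of cmp0, for illustration:
 *   byte 0x00: (0x00 & 0x7f) + 0x7f = 0x7f; | 0x00 | 0x7f = 0x7f; ~ -> 0x80
 *   byte 0x5a: (0x5a & 0x7f) + 0x7f = 0xd9; | 0x5a | 0x7f = 0xff; ~ -> 0x00
 * so each matching byte contributes exactly one set bit, and shifting
 * cmp1 right by one keeps the two contributions from overlapping.
 */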
8419
8420 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
8421 {
8422 intptr_t i, j;
8423 intptr_t opr_sz = simd_oprsz(desc);
8424
8425 for (i = 0; i < opr_sz; i += 16) {
8426 uint64_t n0 = *(uint64_t *)(vn + i);
8427 uint64_t m0 = *(uint64_t *)(vm + i);
8428 uint64_t n1 = *(uint64_t *)(vn + i + 8);
8429 uint64_t m1 = *(uint64_t *)(vm + i + 8);
8430 uint64_t out0 = 0;
8431 uint64_t out1 = 0;
8432
8433 for (j = 0; j < 64; j += 8) {
8434 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
8435 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
8436 out0 |= cnt0 << j;
8437 out1 |= cnt1 << j;
8438 }
8439
8440 *(uint64_t *)(vd + i) = out0;
8441 *(uint64_t *)(vd + i + 8) = out1;
8442 }
8443 }
8444
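/*
 * XAR: exclusive-OR followed by rotate right by immediate.  For byte and
 * halfword elements the rotate is done on all lanes of a uint64_t at once:
 * a masked right shift keeps the bits that stay within each lane, and the
 * left-shifted remainder supplies the wrapped bits.  E.g. with shr == 3,
 * the byte 0xb1 becomes (0x16 & 0x1f) | (0x20 & ~0x1f) = 0x36 = ror8(0xb1, 3).
 */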
8445 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
8446 {
8447 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8448 int shr = simd_data(desc);
8449 int shl = 8 - shr;
8450 uint64_t mask = dup_const(MO_8, 0xff >> shr);
8451 uint64_t *d = vd, *n = vn, *m = vm;
8452
8453 for (i = 0; i < opr_sz; ++i) {
8454 uint64_t t = n[i] ^ m[i];
8455 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
8456 }
8457 }
8458
8459 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
8460 {
8461 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8462 int shr = simd_data(desc);
8463 int shl = 16 - shr;
8464 uint64_t mask = dup_const(MO_16, 0xffff >> shr);
8465 uint64_t *d = vd, *n = vn, *m = vm;
8466
8467 for (i = 0; i < opr_sz; ++i) {
8468 uint64_t t = n[i] ^ m[i];
8469 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
8470 }
8471 }
8472
8473 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
8474 {
8475 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
8476 int shr = simd_data(desc);
8477 uint32_t *d = vd, *n = vn, *m = vm;
8478
8479 for (i = 0; i < opr_sz; ++i) {
8480 d[i] = ror32(n[i] ^ m[i], shr);
8481 }
8482 }
8483
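/*
 * FMMLA: each 128-bit segment (256-bit for the double-precision form) of
 * the operands holds a 2x2 matrix in row-major order.  The result is
 * D = A + N * M^T, i.e. d[i][j] accumulates the dot product of row i of N
 * with row j of M, as spelled out element by element below.
 */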
8484 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
8485 float_status *status, uint32_t desc)
8486 {
8487 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
8488
8489 for (s = 0; s < opr_sz; ++s) {
8490 float32 *n = vn + s * sizeof(float32) * 4;
8491 float32 *m = vm + s * sizeof(float32) * 4;
8492 float32 *a = va + s * sizeof(float32) * 4;
8493 float32 *d = vd + s * sizeof(float32) * 4;
8494 float32 n00 = n[H4(0)], n01 = n[H4(1)];
8495 float32 n10 = n[H4(2)], n11 = n[H4(3)];
8496 float32 m00 = m[H4(0)], m01 = m[H4(1)];
8497 float32 m10 = m[H4(2)], m11 = m[H4(3)];
8498 float32 p0, p1;
8499
8500 /* i = 0, j = 0 */
8501 p0 = float32_mul(n00, m00, status);
8502 p1 = float32_mul(n01, m01, status);
8503 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
8504
8505 /* i = 0, j = 1 */
8506 p0 = float32_mul(n00, m10, status);
8507 p1 = float32_mul(n01, m11, status);
8508 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
8509
8510 /* i = 1, j = 0 */
8511 p0 = float32_mul(n10, m00, status);
8512 p1 = float32_mul(n11, m01, status);
8513 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
8514
8515 /* i = 1, j = 1 */
8516 p0 = float32_mul(n10, m10, status);
8517 p1 = float32_mul(n11, m11, status);
8518 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
8519 }
8520 }
8521
8522 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
8523 float_status *status, uint32_t desc)
8524 {
8525 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
8526
8527 for (s = 0; s < opr_sz; ++s) {
8528 float64 *n = vn + s * sizeof(float64) * 4;
8529 float64 *m = vm + s * sizeof(float64) * 4;
8530 float64 *a = va + s * sizeof(float64) * 4;
8531 float64 *d = vd + s * sizeof(float64) * 4;
8532 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
8533 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
8534 float64 p0, p1;
8535
8536 /* i = 0, j = 0 */
8537 p0 = float64_mul(n00, m00, status);
8538 p1 = float64_mul(n01, m01, status);
8539 d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
8540
8541 /* i = 0, j = 1 */
8542 p0 = float64_mul(n00, m10, status);
8543 p1 = float64_mul(n01, m11, status);
8544 d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
8545
8546 /* i = 1, j = 0 */
8547 p0 = float64_mul(n10, m00, status);
8548 p1 = float64_mul(n11, m01, status);
8549 d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
8550
8551 /* i = 1, j = 1 */
8552 p0 = float64_mul(n10, m10, status);
8553 p1 = float64_mul(n11, m11, status);
8554 d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
8555 }
8556 }
8557
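/*
 * FCVTNT narrows each active wide element and writes it to the top
 * (odd-numbered) narrow element of the destination, leaving the bottom
 * elements untouched; FCVTLT below widens from those top narrow elements.
 * Both walk the vector backwards, one predicate word (covering 64 vector
 * bytes) at a time.
 */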
8558 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
8559 void HELPER(NAME)(void *vd, void *vn, void *vg, \
8560 float_status *status, uint32_t desc) \
8561 { \
8562 intptr_t i = simd_oprsz(desc); \
8563 uint64_t *g = vg; \
8564 do { \
8565 uint64_t pg = g[(i - 1) >> 6]; \
8566 do { \
8567 i -= sizeof(TYPEW); \
8568 if (likely((pg >> (i & 63)) & 1)) { \
8569 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
8570 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
8571 } \
8572 } while (i & 63); \
8573 } while (i != 0); \
8574 }
8575
8576 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
8577 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
8578 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
8579
8580 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
8581 void HELPER(NAME)(void *vd, void *vn, void *vg, \
8582 float_status *status, uint32_t desc) \
8583 { \
8584 intptr_t i = simd_oprsz(desc); \
8585 uint64_t *g = vg; \
8586 do { \
8587 uint64_t pg = g[(i - 1) >> 6]; \
8588 do { \
8589 i -= sizeof(TYPEW); \
8590 if (likely((pg >> (i & 63)) & 1)) { \
8591 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
8592 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
8593 } \
8594 } while (i & 63); \
8595 } while (i != 0); \
8596 }
8597
8598 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
8599 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
8600
8601 #undef DO_FCVTLT
8602 #undef DO_FCVTNT
8603
8604 void HELPER(pext)(void *vd, uint32_t png, uint32_t desc)
8605 {
8606 int pl = FIELD_EX32(desc, PREDDESC, OPRSZ);
8607 int vl = pl * 8;
8608 unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ);
8609 int part = FIELD_EX32(desc, PREDDESC, DATA);
8610 DecodeCounter p = decode_counter(png, vl, v_esz);
8611 uint64_t mask = pred_esz_masks[v_esz + p.lg2_stride];
8612 ARMPredicateReg *d = vd;
8613
8614 /*
8615 * Convert from element count to byte count and adjust
8616 * for the portion of the 4*VL counter to be extracted.
8617 */
8618 int b_count = (p.count << v_esz) - vl * part;
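    /*
     * For example (hypothetical values): with a 32-byte vector,
     * esz == MO_8, part == 1 and a counter encoding 40 active elements
     * (not inverted), b_count = 40 - 32 = 8, so only the first 8
     * elements of this part's predicate are set below.
     */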
8619
8620 memset(d, 0, sizeof(*d));
8621 if (p.invert) {
8622 if (b_count <= 0) {
8623 do_whilel(vd, mask, vl, vl);
8624 } else if (b_count < vl) {
8625 do_whileg(vd, mask, vl - b_count, vl);
8626 }
8627 } else if (b_count > 0) {
8628 do_whilel(vd, mask, MIN(b_count, vl), vl);
8629 }
8630 }
8631