1 /*
2  * ARM SVE Operations
3  *
4  * Copyright (c) 2018 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/page-protection.h"
24 #include "exec/helper-proto.h"
25 #include "exec/target_page.h"
26 #include "exec/tlb-flags.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "fpu/softfloat.h"
29 #include "tcg/tcg.h"
30 #include "vec_internal.h"
31 #include "sve_ldst_internal.h"
32 #include "accel/tcg/cpu-ldst.h"
33 #include "accel/tcg/helper-retaddr.h"
34 #include "accel/tcg/cpu-ops.h"
35 #include "accel/tcg/probe.h"
36 #ifdef CONFIG_USER_ONLY
37 #include "user/page-protection.h"
38 #endif
39 
40 
41 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
42  *
43  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
44  * and bit 0 set if C is set.  Compare the definitions of these variables
45  * within CPUARMState.
46  */
47 
48 /* For no G bits set, NZCV = C.  */
49 #define PREDTEST_INIT  1
50 
51 /* This is an iterative function, called for each Pd and Pg word
52  * moving forward.
53  */
54 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
55 {
56     if (likely(g)) {
57         /* Compute N from first D & G.
58            Use bit 2 to signal first G bit seen.  */
59         if (!(flags & 4)) {
60             flags |= ((d & (g & -g)) != 0) << 31;
61             flags |= 4;
62         }
63 
64         /* Accumulate Z from each D & G.  */
65         flags |= ((d & g) != 0) << 1;
66 
67         /* Compute C from last !(D & G).  Replace previous.  */
68         flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
69     }
70     return flags;
71 }
72 
73 /* This is an iterative function, called for each Pd and Pg word
74  * moving backward.
75  */
76 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
77 {
78     if (likely(g)) {
79         /* Compute C from first (i.e last) !(D & G).
80            Use bit 2 to signal first G bit seen.  */
81         if (!(flags & 4)) {
82             flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
83             flags |= (d & pow2floor(g)) == 0;
84         }
85 
86         /* Accumulate Z from each D & G.  */
87         flags |= ((d & g) != 0) << 1;
88 
89         /* Compute N from last (i.e first) D & G.  Replace previous.  */
90         flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
91     }
92     return flags;
93 }
94 
95 /* The same for a single word predicate.  */
96 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
97 {
98     return iter_predtest_fwd(d, g, PREDTEST_INIT);
99 }
100 
101 /* The same for a multi-word predicate.  */
102 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
103 {
104     uint32_t flags = PREDTEST_INIT;
105     uint64_t *d = vd, *g = vg;
106     uintptr_t i = 0;
107 
108     do {
109         flags = iter_predtest_fwd(d[i], g[i], flags);
110     } while (++i < words);
111 
112     return flags;
113 }
114 
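/*
 * Illustrative example (a sketch, not part of the helper set): for a
 * single predicate word with g = 0x11 (predicate bits 0 and 4 active)
 * and d = 0x01 (only bit 0 set),
 *
 *     iter_predtest_fwd(0x01, 0x11, PREDTEST_INIT) == 0x80000007
 *
 * Bit 31 (N) is set because the first active bit of d is set, bit 1 is
 * set because Z is clear (some active bit of d is set), and bit 0 (C)
 * is set because the last active bit of d is clear.  Bit 2 is only the
 * internal "first G bit seen" marker and is ignored by consumers.
 */
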
115 /* Similarly for single word elements.  */
116 static inline uint64_t expand_pred_s(uint8_t byte)
117 {
118     static const uint64_t word[] = {
119         [0x01] = 0x00000000ffffffffull,
120         [0x10] = 0xffffffff00000000ull,
121         [0x11] = 0xffffffffffffffffull,
122     };
123     return word[byte & 0x11];
124 }
125 
126 static inline uint64_t expand_pred_d(uint8_t byte)
127 {
128     return -(uint64_t)(byte & 1);
129 }
130 
131 #define LOGICAL_PPPP(NAME, FUNC) \
132 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
133 {                                                                         \
134     uintptr_t opr_sz = simd_oprsz(desc);                                  \
135     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
136     uintptr_t i;                                                          \
137     for (i = 0; i < opr_sz / 8; ++i) {                                    \
138         d[i] = FUNC(n[i], m[i], g[i]);                                    \
139     }                                                                     \
140 }
141 
142 #define DO_AND(N, M, G)  (((N) & (M)) & (G))
143 #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
144 #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
145 #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
146 #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
147 #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
148 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
149 #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
150 
151 LOGICAL_PPPP(sve_and_pppp, DO_AND)
152 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
153 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
154 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
155 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
156 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
157 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
158 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
159 
160 #undef DO_AND
161 #undef DO_BIC
162 #undef DO_EOR
163 #undef DO_ORR
164 #undef DO_ORN
165 #undef DO_NOR
166 #undef DO_NAND
167 #undef DO_SEL
168 #undef LOGICAL_PPPP
169 
170 /* Fully general three-operand expander, controlled by a predicate.
171  * This is complicated by the host-endian storage of the register file.
172  */
173 /* ??? I don't expect the compiler could ever vectorize this itself.
174  * With some tables we can convert bit masks to byte masks, and with
175  * extra care wrt byte/word ordering we could use gcc generic vectors
176  * and do 16 bytes at a time.
177  */
178 #define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
179 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
180 {                                                                       \
181     intptr_t i, opr_sz = simd_oprsz(desc);                              \
182     for (i = 0; i < opr_sz; ) {                                         \
183         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
184         do {                                                            \
185             if (pg & 1) {                                               \
186                 TYPE nn = *(TYPE *)(vn + H(i));                         \
187                 TYPE mm = *(TYPE *)(vm + H(i));                         \
188                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
189             }                                                           \
190             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
191         } while (i & 15);                                               \
192     }                                                                   \
193 }
194 
195 /* Similarly, specialized for 64-bit operands.  */
196 #define DO_ZPZZ_D(NAME, TYPE, OP)                                \
197 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
198 {                                                               \
199     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
200     TYPE *d = vd, *n = vn, *m = vm;                             \
201     uint8_t *pg = vg;                                           \
202     for (i = 0; i < opr_sz; i += 1) {                           \
203         if (pg[H1(i)] & 1) {                                    \
204             TYPE nn = n[i], mm = m[i];                          \
205             d[i] = OP(nn, mm);                                  \
206         }                                                       \
207     }                                                           \
208 }
209 
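/*
 * Sketch of how the predicate is consumed by DO_ZPZZ (illustration
 * only): pg is loaded 16 bits at a time, covering 16 bytes of data,
 * and each iteration tests bit 0 of pg before shifting it right by
 * sizeof(TYPE).  So for 4-byte elements only predicate bits 0, 4, 8
 * and 12 of each chunk are significant -- the SVE rule that an element
 * is governed by the lowest predicate bit of its byte span.  E.g. for
 * sve_add_zpzz_s with pg == 0x0010, only the element at byte offset 4
 * is written; the other three 32-bit elements keep their previous
 * contents in vd.
 */
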
210 #define DO_AND(N, M)  (N & M)
211 #define DO_EOR(N, M)  (N ^ M)
212 #define DO_ORR(N, M)  (N | M)
213 #define DO_BIC(N, M)  (N & ~M)
214 #define DO_ORC(N, M)  (N | ~M)
215 #define DO_ADD(N, M)  (N + M)
216 #define DO_SUB(N, M)  (N - M)
217 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
218 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
219 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
220 #define DO_MUL(N, M)  (N * M)
221 
222 
223 /*
224  * We must avoid the C undefined behaviour cases: division by
225  * zero and signed division of INT_MIN by -1. Both of these
226  * have architecturally defined required results for Arm.
227  * We special case all signed divisions by -1 to avoid having
228  * to deduce the minimum integer for the type involved.
229  */
230 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
231 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
232 
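/*
 * Illustration of the corner cases handled above (32-bit values):
 *
 *     DO_SDIV(INT32_MIN, -1) == INT32_MIN     (matches the Arm SDIV result)
 *     DO_SDIV(42, 0)         == 0
 *     DO_UDIV(42u, 0u)       == 0
 *
 * The -1 case relies on the negation wrapping (QEMU builds with
 * -fwrapv), which is why it is special-cased instead of using N / M.
 */
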
233 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
234 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
235 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
236 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
237 
238 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
239 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
240 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
241 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
242 
243 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
244 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
245 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
246 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
247 
248 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
249 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
250 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
251 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
252 
253 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
254 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
255 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
256 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
257 
258 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
259 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
260 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
261 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
262 
263 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
264 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
265 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
266 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
267 
268 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
269 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
270 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
271 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
272 
273 DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
274 DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
275 DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
276 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
277 
278 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
279 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
280 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
281 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
282 
283 DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
284 DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
285 DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
286 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
287 
288 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
289 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
290 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
291 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
292 
293 /* Because the computation type is at least twice as large as required,
294    these work for both signed and unsigned source types.  */
295 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
296 {
297     return (n * m) >> 8;
298 }
299 
300 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
301 {
302     return (n * m) >> 16;
303 }
304 
305 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
306 {
307     return (n * m) >> 32;
308 }
309 
310 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
311 {
312     uint64_t lo, hi;
313     muls64(&lo, &hi, n, m);
314     return hi;
315 }
316 
317 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
318 {
319     uint64_t lo, hi;
320     mulu64(&lo, &hi, n, m);
321     return hi;
322 }
323 
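/*
 * Illustration (a sketch): because the product is computed in a wider
 * type, the same helper yields the correct high half for both
 * signednesses of the byte operands:
 *
 *     do_mulh_b(-2, 3)    ->  (-6) >> 8 == -1, stored as 0xff   (smulh)
 *     do_mulh_b(0xfe, 3)  ->  762 >> 8  == 2                    (umulh)
 *
 * The signedness of TYPE in the instantiations below selects which
 * interpretation the element loads provide.
 */
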
324 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
325 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
326 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
327 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
328 
329 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
330 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
331 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
332 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
333 
334 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
335 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
336 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
337 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
338 
339 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
340 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
341 
342 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
343 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
344 
345 /* Note that all bits of the shift are significant
346    and not modulo the element size.  */
347 #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
348 #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
349 #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
350 
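/*
 * Illustration of the non-modulo behaviour (a sketch, byte elements):
 *
 *     DO_LSR((uint8_t)0x80, 8)  == 0    (a modulo-8 shift would give 0x80)
 *     DO_LSL((uint8_t)0x01, 63) == 0
 *     DO_ASR((int8_t)-4, 100)   == -1   (count clamped to 7; sign fills)
 */
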
351 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
352 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
353 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
354 
355 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
356 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
357 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
358 
359 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
360 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
361 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
362 
363 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
364 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
365 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
366 
367 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
368 {
369     int8_t n1 = n, n2 = n >> 8;
370     return m + n1 + n2;
371 }
372 
373 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
374 {
375     int16_t n1 = n, n2 = n >> 16;
376     return m + n1 + n2;
377 }
378 
379 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
380 {
381     int32_t n1 = n, n2 = n >> 32;
382     return m + n1 + n2;
383 }
384 
385 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
386 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
387 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
388 
389 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
390 {
391     uint8_t n1 = n, n2 = n >> 8;
392     return m + n1 + n2;
393 }
394 
395 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
396 {
397     uint16_t n1 = n, n2 = n >> 16;
398     return m + n1 + n2;
399 }
400 
401 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
402 {
403     uint32_t n1 = n, n2 = n >> 32;
404     return m + n1 + n2;
405 }
406 
407 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
408 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
409 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
410 
411 #define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
412 #define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
413 #define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
414 #define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)
415 
416 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
417 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
418 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
419 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
420 
421 #define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
422 #define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
423 #define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
424 #define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)
425 
426 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
427 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
428 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
429 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
430 
431 /*
432  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
433  * We pass in a pointer to a dummy saturation field to trigger
434  * the saturating arithmetic but discard the information about
435  * whether it has occurred.
436  */
437 #define do_sqshl_b(n, m) \
438    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
439 #define do_sqshl_h(n, m) \
440    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
441 #define do_sqshl_s(n, m) \
442    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
443 #define do_sqshl_d(n, m) \
444    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
445 
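/*
 * For example (illustrative): do_sqshl_b(100, 2) would be 400, which
 * does not fit in int8_t, so the result saturates to 127; the
 * saturation indication lands in the local discard variable and is
 * dropped, since SVE2 has no QC flag to accumulate it into.
 */
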
446 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
447 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
448 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
449 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
450 
451 #define do_uqshl_b(n, m) \
452    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
453 #define do_uqshl_h(n, m) \
454    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
455 #define do_uqshl_s(n, m) \
456    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
457 #define do_uqshl_d(n, m) \
458    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
459 
460 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
461 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
462 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
463 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
464 
465 #define do_sqrshl_b(n, m) \
466    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
467 #define do_sqrshl_h(n, m) \
468    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
469 #define do_sqrshl_s(n, m) \
470    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
471 #define do_sqrshl_d(n, m) \
472    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
473 
474 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
475 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
476 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
477 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
478 
479 #undef do_sqrshl_d
480 
481 #define do_uqrshl_b(n, m) \
482    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
483 #define do_uqrshl_h(n, m) \
484    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
485 #define do_uqrshl_s(n, m) \
486    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
487 #define do_uqrshl_d(n, m) \
488    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
489 
490 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
491 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
492 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
493 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
494 
495 #undef do_uqrshl_d
496 
497 #define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
498 #define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
499 
500 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
501 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
502 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
503 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
504 
505 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
506 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
507 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
508 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
509 
510 #define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
511 #define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))
512 
513 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
514 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
515 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
516 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
517 
518 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
519 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
520 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
521 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
522 
523 #define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
524 #define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
525 
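/*
 * The _D forms above avoid a 65-bit intermediate by splitting the
 * operands; as a sketch of the identities being used:
 *
 *     (n >> 1) + (m >> 1) + (n & m & 1)    == (n + m) >> 1
 *     (n >> 1) + (m >> 1) + ((n | m) & 1)  == (n + m + 1) >> 1
 *     (n >> 1) - (m >> 1) - (~n & m & 1)   == (n - m) >> 1
 *
 * e.g. n = 7, m = 9:  3 + 4 + 1 == 8 == (7 + 9) >> 1.
 */
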
526 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
527 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
528 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
529 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
530 
531 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
532 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
533 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
534 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
535 
536 #define DO_SQADD_B(n, m) do_ssat_b((int64_t)n + m)
537 #define DO_SQADD_H(n, m) do_ssat_h((int64_t)n + m)
538 #define DO_SQADD_S(n, m) do_ssat_s((int64_t)n + m)
539 
540 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
541 {
542     int64_t r = n + m;
543     if (((r ^ n) & ~(n ^ m)) < 0) {
544         /* Signed overflow.  */
545         return r < 0 ? INT64_MAX : INT64_MIN;
546     }
547     return r;
548 }
549 
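/*
 * The test above is the usual sign-bit trick (sketch): overflow can
 * only occur when n and m have the same sign and r has the other one,
 * i.e. when (r ^ n) has the sign bit set while (n ^ m) does not.  For
 * example n = m = 0x4000000000000000 gives r = 0x8000000000000000;
 * the expression is negative and, since r < 0, the result saturates
 * to INT64_MAX.
 */
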
550 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
551 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
552 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
553 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
554 
555 #define DO_UQADD_B(n, m) do_usat_b((int64_t)n + m)
556 #define DO_UQADD_H(n, m) do_usat_h((int64_t)n + m)
557 #define DO_UQADD_S(n, m) do_usat_s((int64_t)n + m)
558 
559 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
560 {
561     uint64_t r = n + m;
562     return r < n ? UINT64_MAX : r;
563 }
564 
565 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
566 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
567 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
568 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
569 
570 #define DO_SQSUB_B(n, m) do_ssat_b((int64_t)n - m)
571 #define DO_SQSUB_H(n, m) do_ssat_h((int64_t)n - m)
572 #define DO_SQSUB_S(n, m) do_ssat_s((int64_t)n - m)
573 
574 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
575 {
576     int64_t r = n - m;
577     if (((r ^ n) & (n ^ m)) < 0) {
578         /* Signed overflow.  */
579         return r < 0 ? INT64_MAX : INT64_MIN;
580     }
581     return r;
582 }
583 
584 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
585 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
586 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
587 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
588 
589 #define DO_UQSUB_B(n, m) do_usat_b((int64_t)n - m)
590 #define DO_UQSUB_H(n, m) do_usat_h((int64_t)n - m)
591 #define DO_UQSUB_S(n, m) do_usat_s((int64_t)n - m)
592 
593 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
594 {
595     return n > m ? n - m : 0;
596 }
597 
598 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
599 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
600 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
601 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
602 
603 #define DO_SUQADD_B(n, m) do_ssat_b((int64_t)(int8_t)n + m)
604 #define DO_SUQADD_H(n, m) do_ssat_h((int64_t)(int16_t)n + m)
605 #define DO_SUQADD_S(n, m) do_ssat_s((int64_t)(int32_t)n + m)
606 
607 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
608 {
609     uint64_t r = n + m;
610 
611     if (n < 0) {
612         /* Note that m - abs(n) cannot underflow. */
613         if (r > INT64_MAX) {
614             /* Result is either very large positive or negative. */
615             if (m > -n) {
616                 /* m > abs(n), so r is a very large positive. */
617                 return INT64_MAX;
618             }
619             /* Result is negative. */
620         }
621     } else {
622         /* Both inputs are positive: check for overflow.  */
623         if (r < m || r > INT64_MAX) {
624             return INT64_MAX;
625         }
626     }
627     return r;
628 }
629 
630 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
631 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
632 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
633 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
634 
635 #define DO_USQADD_B(n, m) do_usat_b((int64_t)n + (int8_t)m)
636 #define DO_USQADD_H(n, m) do_usat_h((int64_t)n + (int16_t)m)
637 #define DO_USQADD_S(n, m) do_usat_s((int64_t)n + (int32_t)m)
638 
639 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
640 {
641     uint64_t r = n + m;
642 
643     if (m < 0) {
644         return n < -m ? 0 : r;
645     }
646     return r < n ? UINT64_MAX : r;
647 }
648 
649 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
650 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
651 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
652 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
653 
654 #undef DO_ZPZZ
655 #undef DO_ZPZZ_D
656 
657 /*
658  * Three operand expander, operating on element pairs.
659  * If the slot I is even, the elements come from VN {I, I+1}.
660  * If the slot I is odd, the elements come from VM {I-1, I}.
661  * Load all of the input elements in each pair before overwriting output.
662  */
663 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
664 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
665 {                                                               \
666     intptr_t i, opr_sz = simd_oprsz(desc);                      \
667     for (i = 0; i < opr_sz; ) {                                 \
668         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
669         do {                                                    \
670             TYPE n0 = *(TYPE *)(vn + H(i));                     \
671             TYPE m0 = *(TYPE *)(vm + H(i));                     \
672             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
673             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
674             if (pg & 1) {                                       \
675                 *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
676             }                                                   \
677             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
678             if (pg & 1) {                                       \
679                 *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
680             }                                                   \
681             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
682         } while (i & 15);                                       \
683     }                                                           \
684 }
685 
686 /* Similarly, specialized for 64-bit operands.  */
687 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
688 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
689 {                                                               \
690     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
691     TYPE *d = vd, *n = vn, *m = vm;                             \
692     uint8_t *pg = vg;                                           \
693     for (i = 0; i < opr_sz; i += 2) {                           \
694         TYPE n0 = n[i], n1 = n[i + 1];                          \
695         TYPE m0 = m[i], m1 = m[i + 1];                          \
696         if (pg[H1(i)] & 1) {                                    \
697             d[i] = OP(n0, n1);                                  \
698         }                                                       \
699         if (pg[H1(i + 1)] & 1) {                                \
700             d[i + 1] = OP(m0, m1);                              \
701         }                                                       \
702     }                                                           \
703 }
704 
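/*
 * Layout example for the pairwise operations (illustration): with byte
 * elements and an all-true predicate, ADDP produces
 *
 *     d[0] = n[0] + n[1],  d[1] = m[0] + m[1],
 *     d[2] = n[2] + n[3],  d[3] = m[2] + m[3],  ...
 *
 * All four inputs of a pair are loaded before either output is
 * written, so the helpers stay correct when vd aliases vn or vm.
 */
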
705 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
706 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
707 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
708 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
709 
710 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
711 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
712 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
713 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
714 
715 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
716 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
717 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
718 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
719 
720 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
721 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
722 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
723 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
724 
725 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
726 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
727 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
728 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
729 
730 #undef DO_ZPZZ_PAIR
731 #undef DO_ZPZZ_PAIR_D
732 
733 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
734 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
735                   float_status *status, uint32_t desc)                  \
736 {                                                                       \
737     intptr_t i, opr_sz = simd_oprsz(desc);                              \
738     for (i = 0; i < opr_sz; ) {                                         \
739         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
740         do {                                                            \
741             TYPE n0 = *(TYPE *)(vn + H(i));                             \
742             TYPE m0 = *(TYPE *)(vm + H(i));                             \
743             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
744             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
745             if (pg & 1) {                                               \
746                 *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
747             }                                                           \
748             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
749             if (pg & 1) {                                               \
750                 *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
751             }                                                           \
752             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
753         } while (i & 15);                                               \
754     }                                                                   \
755 }
756 
757 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
758 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
759 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
760 
761 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
762 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
763 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
764 
765 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
766 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
767 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
768 
769 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
770 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
771 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
772 
773 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
774 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
775 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
776 
777 #undef DO_ZPZZ_PAIR_FP
778 
779 /* Three-operand expander, controlled by a predicate, in which the
780  * third operand is "wide".  That is, for D = N op M, the same 64-bit
781  * value of M is used with all of the narrower values of N.
782  */
783 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
784 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
785 {                                                                       \
786     intptr_t i, opr_sz = simd_oprsz(desc);                              \
787     for (i = 0; i < opr_sz; ) {                                         \
788         uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
789         TYPEW mm = *(TYPEW *)(vm + i);                                  \
790         do {                                                            \
791             if (pg & 1) {                                               \
792                 TYPE nn = *(TYPE *)(vn + H(i));                         \
793                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
794             }                                                           \
795             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
796         } while (i & 7);                                                \
797     }                                                                   \
798 }
799 
800 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
801 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
802 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
803 
804 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
805 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
806 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
807 
808 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
809 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
810 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
811 
812 #undef DO_ZPZW
813 
814 /* Fully general two-operand expander, controlled by a predicate.
815  */
816 #define DO_ZPZ(NAME, TYPE, H, OP)                               \
817 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
818 {                                                               \
819     intptr_t i, opr_sz = simd_oprsz(desc);                      \
820     for (i = 0; i < opr_sz; ) {                                 \
821         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
822         do {                                                    \
823             if (pg & 1) {                                       \
824                 TYPE nn = *(TYPE *)(vn + H(i));                 \
825                 *(TYPE *)(vd + H(i)) = OP(nn);                  \
826             }                                                   \
827             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
828         } while (i & 15);                                       \
829     }                                                           \
830 }
831 
832 /* Similarly, specialized for 64-bit operands.  */
833 #define DO_ZPZ_D(NAME, TYPE, OP)                                \
834 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
835 {                                                               \
836     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
837     TYPE *d = vd, *n = vn;                                      \
838     uint8_t *pg = vg;                                           \
839     for (i = 0; i < opr_sz; i += 1) {                           \
840         if (pg[H1(i)] & 1) {                                    \
841             TYPE nn = n[i];                                     \
842             d[i] = OP(nn);                                      \
843         }                                                       \
844     }                                                           \
845 }
846 
847 #define DO_CLS_B(N)   (clrsb32(N) - 24)
848 #define DO_CLS_H(N)   (clrsb32(N) - 16)
849 
850 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
851 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
852 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
853 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
854 
855 #define DO_CLZ_B(N)   (clz32(N) - 24)
856 #define DO_CLZ_H(N)   (clz32(N) - 16)
857 
858 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
859 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
860 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
861 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
862 
863 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
864 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
865 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
866 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
867 
868 #define DO_CNOT(N)    (N == 0)
869 
870 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
871 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
872 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
873 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
874 
875 #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
876 
877 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
878 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
879 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
880 
881 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N))
882 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N))
883 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N))
884 
885 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
886 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
887 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)
888 
889 #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
890 
891 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
892 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
893 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
894 
895 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
896 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
897 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N))
898 
899 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
900 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
901 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)
902 
903 #define DO_NOT(N)    (~N)
904 
905 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
906 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
907 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
908 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
909 
910 #define DO_SXTB(N)    ((int8_t)N)
911 #define DO_SXTH(N)    ((int16_t)N)
912 #define DO_SXTS(N)    ((int32_t)N)
913 #define DO_UXTB(N)    ((uint8_t)N)
914 #define DO_UXTH(N)    ((uint16_t)N)
915 #define DO_UXTS(N)    ((uint32_t)N)
916 
917 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
918 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
919 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
920 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
921 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
922 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
923 
924 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
925 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
926 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
927 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
928 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
929 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
930 
931 #define DO_ABS(N)    (N < 0 ? -N : N)
932 
933 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
934 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
935 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
936 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
937 
938 #define DO_NEG(N)    (-N)
939 
940 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
941 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
942 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
943 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
944 
945 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
946 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
947 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
948 
949 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
950 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
951 
952 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
953 
954 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
955 {
956     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
957     uint64_t *d = vd, *n = vn;
958     uint8_t *pg = vg;
959 
960     for (i = 0; i < opr_sz; i += 2) {
961         if (pg[H1(i)] & 1) {
962             uint64_t n0 = n[i + 0];
963             uint64_t n1 = n[i + 1];
964             d[i + 0] = n1;
965             d[i + 1] = n0;
966         }
967     }
968 }
969 
970 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
971 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
972 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
973 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
974 
975 #define DO_SQABS(X) \
976     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
977        x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
978 
979 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
980 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
981 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
982 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
983 
984 #define DO_SQNEG(X) \
985     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
986        x_ == min_ ? -min_ - 1 : -x_; })
987 
988 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
989 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
990 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
991 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
992 
993 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
994 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
995 
996 /* Three-operand expander, unpredicated, in which the third operand is "wide".
997  */
998 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
999 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1000 {                                                              \
1001     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1002     for (i = 0; i < opr_sz; ) {                                \
1003         TYPEW mm = *(TYPEW *)(vm + i);                         \
1004         do {                                                   \
1005             TYPE nn = *(TYPE *)(vn + H(i));                    \
1006             *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
1007             i += sizeof(TYPE);                                 \
1008         } while (i & 7);                                       \
1009     }                                                          \
1010 }
1011 
1012 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1013 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1014 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1015 
1016 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1017 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1018 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1019 
1020 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1021 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1022 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1023 
1024 #undef DO_ZZW
1025 
1026 #undef DO_CLS_B
1027 #undef DO_CLS_H
1028 #undef DO_CLZ_B
1029 #undef DO_CLZ_H
1030 #undef DO_CNOT
1031 #undef DO_FABS
1032 #undef DO_FNEG
1033 #undef DO_ABS
1034 #undef DO_NEG
1035 #undef DO_ZPZ
1036 #undef DO_ZPZ_D
1037 
1038 /*
1039  * Three-operand expander, unpredicated, in which the two inputs are
1040  * selected from the top or bottom half of the wide column.
1041  */
1042 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1043 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1044 {                                                                       \
1045     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1046     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1047     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1048     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1049         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1050         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1051         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
1052     }                                                                   \
1053 }
1054 
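/*
 * Selection example (a sketch): the bottom forms such as SADDLB pass 0
 * for both desc bits, so sel1 == sel2 == 0 and the even (bottom)
 * narrow elements of each wide column are used; the top forms such as
 * SADDLT set both bits and use the odd (top) elements.  Mixed settings
 * serve the bottom/top combinations elsewhere in SVE2.
 */
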
1055 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1056 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1057 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1058 
1059 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1060 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1061 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1062 
1063 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1064 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1065 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1066 
1067 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1068 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1069 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1070 
1071 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1072 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1073 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1074 
1075 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1076 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1077 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1078 
1079 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1080 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1081 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1082 
1083 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1084 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1085 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1086 
1087 /* Note that the multiply cannot overflow, but the doubling can. */
1088 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1089 {
1090     int16_t val = n * m;
1091     return DO_SQADD_H(val, val);
1092 }
1093 
1094 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1095 {
1096     int32_t val = n * m;
1097     return DO_SQADD_S(val, val);
1098 }
1099 
1100 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1101 {
1102     int64_t val = n * m;
1103     return do_sqadd_d(val, val);
1104 }
1105 
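/*
 * Worked example (illustrative), for the byte-to-halfword form: with
 * n == m == -128 the product is 16384, which fits in int16_t, but
 * doubling it would be 32768, so DO_SQADD_H saturates the result to
 * INT16_MAX.  This is the only operand combination for which the
 * byte-to-halfword SQDMULL must saturate.
 */
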
1106 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1107 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1108 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1109 
1110 #undef DO_ZZZ_TB
1111 
1112 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1113 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1114 {                                                              \
1115     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1116     int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1117     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1118         TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
1119         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
1120         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
1121     }                                                          \
1122 }
1123 
1124 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1125 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1126 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1127 
1128 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1129 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1130 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1131 
1132 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1133 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1134 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1135 
1136 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1137 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1138 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1139 
1140 #undef DO_ZZZ_WTB
1141 
1142 #define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
1143 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1144 {                                                                       \
1145     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1146     intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1147     intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1148     for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
1149         TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
1150         TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
1151         *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
1152     }                                                                   \
1153 }
1154 
1155 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1156 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1157 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1158 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1159 
1160 #undef DO_ZZZ_NTB
1161 
1162 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1163 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1164 {                                                               \
1165     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1166     intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
1167     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
1168         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
1169         TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
1170         TYPEW aa = *(TYPEW *)(va + HW(i));                      \
1171         *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
1172     }                                                           \
1173 }
1174 
1175 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1176 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1177 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1178 
1179 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1180 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1181 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1182 
1183 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1184 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1185 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1186 
1187 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1188 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1189 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1190 
1191 #define DO_NMUL(N, M)  -(N * M)
1192 
1193 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1194 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1195 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1196 
1197 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1198 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1199 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1200 
1201 #undef DO_ZZZW_ACC
1202 
1203 #define DO_XTNB(NAME, TYPE, OP) \
1204 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
1205 {                                                            \
1206     intptr_t i, opr_sz = simd_oprsz(desc);                   \
1207     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
1208         TYPE nn = *(TYPE *)(vn + i);                         \
1209         nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
1210         *(TYPE *)(vd + i) = nn;                              \
1211     }                                                        \
1212 }
1213 
1214 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
1215 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
1216 {                                                                       \
1217     intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
1218     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
1219         TYPE nn = *(TYPE *)(vn + i);                                    \
1220         *(TYPEN *)(vd + i + odd) = OP(nn);                              \
1221     }                                                                   \
1222 }
1223 
1224 DO_XTNB(sve2_sqxtnb_h, int16_t, do_ssat_b)
1225 DO_XTNB(sve2_sqxtnb_s, int32_t, do_ssat_h)
1226 DO_XTNB(sve2_sqxtnb_d, int64_t, do_ssat_s)
1227 
1228 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, do_ssat_b)
1229 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, do_ssat_h)
1230 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, do_ssat_s)
1231 
1232 DO_XTNB(sve2_uqxtnb_h, uint16_t, do_usat_b)
1233 DO_XTNB(sve2_uqxtnb_s, uint32_t, do_usat_h)
1234 DO_XTNB(sve2_uqxtnb_d, uint64_t, do_usat_s)
1235 
1236 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, do_usat_b)
1237 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, do_usat_h)
1238 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, do_usat_s)
1239 
1240 DO_XTNB(sve2_sqxtunb_h, int16_t, do_usat_b)
1241 DO_XTNB(sve2_sqxtunb_s, int32_t, do_usat_h)
1242 DO_XTNB(sve2_sqxtunb_d, int64_t, do_usat_s)
1243 
1244 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, do_usat_b)
1245 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, do_usat_h)
1246 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, do_usat_s)
1247 
1248 #undef DO_XTNB
1249 #undef DO_XTNT
1250 
1251 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1252 {
1253     intptr_t i, opr_sz = simd_oprsz(desc);
1254     int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1255     uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1256     uint32_t *a = va, *n = vn;
1257     uint64_t *d = vd, *m = vm;
1258 
1259     for (i = 0; i < opr_sz / 8; ++i) {
1260         uint32_t e1 = a[2 * i + H4(0)];
1261         uint32_t e2 = n[2 * i + sel] ^ inv;
1262         uint64_t c = extract64(m[i], 32, 1);
1263         /* Compute and store the entire 33-bit result at once. */
1264         d[i] = c + e1 + e2;
1265     }
1266 }
1267 
1268 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1269 {
1270     intptr_t i, opr_sz = simd_oprsz(desc);
1271     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1272     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1273     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1274 
1275     for (i = 0; i < opr_sz / 8; i += 2) {
1276         Int128 e1 = int128_make64(a[i]);
1277         Int128 e2 = int128_make64(n[i + sel] ^ inv);
1278         Int128 c = int128_make64(m[i + 1] & 1);
1279         Int128 r = int128_add(int128_add(e1, e2), c);
1280         d[i + 0] = int128_getlo(r);
1281         d[i + 1] = int128_gethi(r);
1282     }
1283 }
1284 
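/*
 * Representation used by the adcl helpers above (sketch): in the
 * 32-bit form, each 64-bit lane of vm supplies the incoming carry in
 * bit 32, and each 64-bit lane of vd receives the 32-bit sum in its
 * low half with the carry-out in bit 32.  For example e1 = 0xffffffff,
 * e2 = 1, c = 0 gives d[i] = 0x100000000: a sum of 0 with a carry-out
 * of 1, ready for the next instruction in the carry chain.
 */
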
1285 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1287 {                                                                       \
1288     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1289     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1290     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1291     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1292         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1293         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1294         TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1295         *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1296     }                                                                   \
1297 }
1298 
1299 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1300            do_sqdmull_h, DO_SQADD_H)
1301 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1302            do_sqdmull_s, DO_SQADD_S)
1303 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1304            do_sqdmull_d, do_sqadd_d)
1305 
1306 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1307            do_sqdmull_h, DO_SQSUB_H)
1308 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1309            do_sqdmull_s, DO_SQSUB_S)
1310 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1311            do_sqdmull_d, do_sqsub_d)
1312 
1313 #undef DO_SQDMLAL
1314 
1315 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1316 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1317 {                                                               \
1318     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
1319     int rot = simd_data(desc);                                  \
1320     int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
1321     bool sub_r = rot == 1 || rot == 2;                          \
1322     bool sub_i = rot >= 2;                                      \
1323     TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
1324     for (i = 0; i < opr_sz; i += 2) {                           \
1325         TYPE elt1_a = n[H(i + sel_a)];                          \
1326         TYPE elt2_a = m[H(i + sel_a)];                          \
1327         TYPE elt2_b = m[H(i + sel_b)];                          \
1328         d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
1329         d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
1330     }                                                           \
1331 }
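/*
 * Illustrative decode of the rotation immediate consumed above:
 *     rot 0 (  0 deg): sel_a = 0, no negation:  d_r += n_r*m_r, d_i += n_r*m_i
 *     rot 1 ( 90 deg): sel_a = 1, negate real:  d_r -= n_i*m_i, d_i += n_i*m_r
 *     rot 2 (180 deg): sel_a = 0, negate both:  d_r -= n_r*m_r, d_i -= n_r*m_i
 *     rot 3 (270 deg): sel_a = 1, negate imag:  d_r += n_i*m_i, d_i -= n_i*m_r
 * Issuing rot 0 then rot 1 on the same accumulator builds up a full
 * complex multiply-add.
 */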
1332 
1333 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1334 
1335 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1336 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1337 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1338 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1339 
1340 #define DO_SQRDMLAH_B(N, M, A, S) \
1341     do_sqrdmlah_b(N, M, A, S, true)
1342 #define DO_SQRDMLAH_H(N, M, A, S) \
1343     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1344 #define DO_SQRDMLAH_S(N, M, A, S) \
1345     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1346 #define DO_SQRDMLAH_D(N, M, A, S) \
1347     do_sqrdmlah_d(N, M, A, S, true)
1348 
1349 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1350 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1351 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1352 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1353 
1354 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1355 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1356 {                                                                           \
1357     intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1358     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1359     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1360     int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1361     bool sub_r = rot == 1 || rot == 2;                                      \
1362     bool sub_i = rot >= 2;                                                  \
1363     TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1364     for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1365         TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1366         TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1367         for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1368             TYPE elt1_a = n[H(i + j + sel_a)];                              \
1369             d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);          \
1370             d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);  \
1371         }                                                                   \
1372     }                                                                       \
1373 }
1374 
1375 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1376 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1377 
1378 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1379 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1380 
1381 #undef DO_CMLA
1382 #undef DO_CMLA_FUNC
1383 #undef DO_CMLA_IDX_FUNC
1384 #undef DO_SQRDMLAH_B
1385 #undef DO_SQRDMLAH_H
1386 #undef DO_SQRDMLAH_S
1387 #undef DO_SQRDMLAH_D
1388 
1389 /* Note N and M are 4 elements bundled into one unit. */
1390 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1391                          int sel_a, int sel_b, int sub_i)
1392 {
1393     for (int i = 0; i <= 1; i++) {
1394         int32_t elt1_r = (int8_t)(n >> (16 * i));
1395         int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1396         int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1397         int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1398 
1399         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1400     }
1401     return a;
1402 }
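/*
 * For illustration, with rot = 0 (sel_a = 0, sel_b = 1, sub_i = -1) each
 * (real, imag) pair above contributes
 *     a += n_r * m_r - n_i * m_i
 * i.e. the real part of the complex product; the other rot values swap
 * and/or negate the M elements to realise the remaining rotations.
 */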
1403 
1404 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1405                          int sel_a, int sel_b, int sub_i)
1406 {
1407     for (int i = 0; i <= 1; i++) {
1408         int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1409         int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1410         int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1411         int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1412 
1413         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1414     }
1415     return a;
1416 }
1417 
1418 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1419                               void *va, uint32_t desc)
1420 {
1421     int opr_sz = simd_oprsz(desc);
1422     int rot = simd_data(desc);
1423     int sel_a = rot & 1;
1424     int sel_b = sel_a ^ 1;
1425     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1426     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1427 
1428     for (int e = 0; e < opr_sz / 4; e++) {
1429         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1430     }
1431 }
1432 
1433 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1434                               void *va, uint32_t desc)
1435 {
1436     int opr_sz = simd_oprsz(desc);
1437     int rot = simd_data(desc);
1438     int sel_a = rot & 1;
1439     int sel_b = sel_a ^ 1;
1440     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1441     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1442 
1443     for (int e = 0; e < opr_sz / 8; e++) {
1444         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1445     }
1446 }
1447 
1448 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1449                              void *va, uint32_t desc)
1450 {
1451     int opr_sz = simd_oprsz(desc);
1452     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1453     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1454     int sel_a = rot & 1;
1455     int sel_b = sel_a ^ 1;
1456     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1457     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1458 
1459     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1460         uint32_t seg_m = m[seg + idx];
1461         for (int e = 0; e < 4; e++) {
1462             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1463                                    sel_a, sel_b, sub_i);
1464         }
1465     }
1466 }
1467 
1468 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1469                              void *va, uint32_t desc)
1470 {
1471     int seg, opr_sz = simd_oprsz(desc);
1472     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1473     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1474     int sel_a = rot & 1;
1475     int sel_b = sel_a ^ 1;
1476     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1477     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1478 
1479     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1480         uint64_t seg_m = m[seg + idx];
1481         for (int e = 0; e < 2; e++) {
1482             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1483                                    sel_a, sel_b, sub_i);
1484         }
1485     }
1486 }
1487 
1488 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1489 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1490 {                                                                       \
1491     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1492     intptr_t i, j, idx = simd_data(desc);                               \
1493     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1494     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1495         TYPE mm = m[i];                                                 \
1496         for (j = 0; j < segment; j++) {                                 \
1497             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1498         }                                                               \
1499     }                                                                   \
1500 }
1501 
1502 #define DO_SQRDMLAH_H(N, M, A) \
1503     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1504 #define DO_SQRDMLAH_S(N, M, A) \
1505     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1506 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1507 
1508 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1509 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1510 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1511 
1512 #define DO_SQRDMLSH_H(N, M, A) \
1513     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1514 #define DO_SQRDMLSH_S(N, M, A) \
1515     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1516 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1517 
1518 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1519 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1520 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1521 
1522 #undef DO_ZZXZ
1523 
1524 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1525 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1526 {                                                                         \
1527     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1528     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1529     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1530     for (i = 0; i < oprsz; i += 16) {                                     \
1531         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1532         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1533             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1534             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1535             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1536         }                                                                 \
1537     }                                                                     \
1538 }
1539 
1540 #define DO_MLA(N, M, A)  (A + N * M)
1541 
1542 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1543 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1544 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1545 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1546 
1547 #define DO_MLS(N, M, A)  (A - N * M)
1548 
1549 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1550 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1551 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1552 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1553 
1554 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1555 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1556 
1557 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1558 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1559 
1560 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1561 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1562 
1563 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1564 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1565 
1566 #undef DO_MLA
1567 #undef DO_MLS
1568 #undef DO_ZZXW
1569 
1570 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1571 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1572 {                                                                         \
1573     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1574     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1575     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1576     for (i = 0; i < oprsz; i += 16) {                                     \
1577         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1578         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1579             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1580             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1581         }                                                                 \
1582     }                                                                     \
1583 }
1584 
1585 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1586 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1587 
1588 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1589 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1590 
1591 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1592 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1593 
1594 #undef DO_ZZX
1595 
1596 #define DO_BITPERM(NAME, TYPE, OP) \
1597 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1598 {                                                              \
1599     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1600     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1601         TYPE nn = *(TYPE *)(vn + i);                           \
1602         TYPE mm = *(TYPE *)(vm + i);                           \
1603         *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1604     }                                                          \
1605 }
1606 
1607 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1608 {
1609     uint64_t res = 0;
1610     int db, rb = 0;
1611 
1612     for (db = 0; db < n; ++db) {
1613         if ((mask >> db) & 1) {
1614             res |= ((data >> db) & 1) << rb;
1615             ++rb;
1616         }
1617     }
1618     return res;
1619 }
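/* Illustrative example, n = 8: bitextract(0b10110110, 0b00001111, 8)
 * gathers data bits 0..3 (values 0,1,1,0) into the low result bits,
 * giving 0b0110.
 */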
1620 
1621 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1622 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1623 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1624 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1625 
1626 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1627 {
1628     uint64_t res = 0;
1629     int rb, db = 0;
1630 
1631     for (rb = 0; rb < n; ++rb) {
1632         if ((mask >> rb) & 1) {
1633             res |= ((data >> db) & 1) << rb;
1634             ++db;
1635         }
1636     }
1637     return res;
1638 }
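/* Illustrative example, n = 8: bitdeposit(0b00000110, 0b11110000, 8)
 * scatters the low data bits (0,1,1,0) into mask bits 4..7, giving
 * 0b01100000.
 */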
1639 
1640 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1641 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1642 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1643 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1644 
1645 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1646 {
1647     uint64_t resm = 0, resu = 0;
1648     int db, rbm = 0, rbu = 0;
1649 
1650     for (db = 0; db < n; ++db) {
1651         uint64_t val = (data >> db) & 1;
1652         if ((mask >> db) & 1) {
1653             resm |= val << rbm++;
1654         } else {
1655             resu |= val << rbu++;
1656         }
1657     }
1658 
1659     return resm | (resu << rbm);
1660 }
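/* Illustrative example, n = 8: bitgroup(0b10110110, 0b01010101, 8) packs
 * the masked (even) bits 0,1,1,0 into the low half as 0b0110 and the
 * unmasked (odd) bits 1,0,1,1 above them as 0b1101, giving 0b11010110.
 */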
1661 
1662 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1663 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1664 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1665 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1666 
1667 #undef DO_BITPERM
1668 
1669 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1670 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1671 {                                                               \
1672     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1673     int sub_r = simd_data(desc);                                \
1674     if (sub_r) {                                                \
1675         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1676             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1677             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1678             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1679             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1680             acc_r = ADD_OP(acc_r, el2_i);                       \
1681             acc_i = SUB_OP(acc_i, el2_r);                       \
1682             *(TYPE *)(vd + H(i)) = acc_r;                       \
1683             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1684         }                                                       \
1685     } else {                                                    \
1686         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1687             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1688             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1689             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1690             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1691             acc_r = SUB_OP(acc_r, el2_i);                       \
1692             acc_i = ADD_OP(acc_i, el2_r);                       \
1693             *(TYPE *)(vd + H(i)) = acc_r;                       \
1694             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1695         }                                                       \
1696     }                                                           \
1697 }
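/*
 * Illustrative rotation decode for the complex add above: with sub_r
 * clear, M is added rotated by 90 degrees,
 *     (d_r, d_i) = (n_r - m_i, n_i + m_r)
 * and with sub_r set, by 270 degrees,
 *     (d_r, d_i) = (n_r + m_i, n_i - m_r)
 */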
1698 
1699 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1700 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1701 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1702 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1703 
1704 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1705 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1706 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1707 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1708 
1709 #undef DO_CADD
1710 
1711 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1712 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1713 {                                                              \
1714     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1715     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1716     int shift = simd_data(desc) >> 1;                          \
1717     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1718         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1719         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1720     }                                                          \
1721 }
1722 
1723 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1724 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1725 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1726 
1727 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1728 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1729 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1730 
1731 #undef DO_ZZI_SHLL
1732 
1733 /* Two-operand reduction expander, controlled by a predicate.
1734  * The difference between TYPERED and TYPERET has to do with
1735  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1736  * but TYPERET must be unsigned so that e.g. a 32-bit value
1737  * is not sign-extended to the ABI uint64_t return type.
1738  */
1739 /* ??? If we were to vectorize this by hand the reduction ordering
1740  * would change.  For integer operands, this is perfectly fine.
1741  */
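/* For example, sve_smaxv_s reduces as int32_t but returns uint32_t: a
 * result of -1 comes back as 0x00000000ffffffff in the uint64_t return
 * slot rather than being sign-extended to all-ones.
 */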
1742 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1743 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1744 {                                                          \
1745     intptr_t i, opr_sz = simd_oprsz(desc);                 \
1746     TYPERED ret = INIT;                                    \
1747     for (i = 0; i < opr_sz; ) {                            \
1748         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1749         do {                                               \
1750             if (pg & 1) {                                  \
1751                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1752                 ret = OP(ret, nn);                         \
1753             }                                              \
1754             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1755         } while (i & 15);                                  \
1756     }                                                      \
1757     return (TYPERET)ret;                                   \
1758 }
1759 
1760 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1761 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1762 {                                                          \
1763     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1764     TYPEE *n = vn;                                         \
1765     uint8_t *pg = vg;                                      \
1766     TYPER ret = INIT;                                      \
1767     for (i = 0; i < opr_sz; i += 1) {                      \
1768         if (pg[H1(i)] & 1) {                               \
1769             TYPEE nn = n[i];                               \
1770             ret = OP(ret, nn);                             \
1771         }                                                  \
1772     }                                                      \
1773     return ret;                                            \
1774 }
1775 
1776 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1777 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1778 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1779 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1780 
1781 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1782 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1783 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1784 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1785 
1786 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1787 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1788 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1789 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1790 
1791 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1792 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1793 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1794 
1795 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1796 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1797 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1798 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1799 
1800 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1801 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1802 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1803 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1804 
1805 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1806 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1807 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1808 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1809 
1810 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1811 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1812 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1813 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1814 
1815 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1816 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1817 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1818 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1819 
1820 #undef DO_VPZ
1821 #undef DO_VPZ_D
1822 
1823 #define DO_VPQ(NAME, TYPE, H, INIT, OP) \
1824 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)          \
1825 {                                                                       \
1826     TYPE tmp[16 / sizeof(TYPE)] = { [0 ... 16 / sizeof(TYPE) - 1] = INIT }; \
1827     TYPE *n = vn; uint16_t *g = vg;                                     \
1828     uintptr_t oprsz = simd_oprsz(desc);                                 \
1829     uintptr_t nseg = oprsz / 16, nsegelt = 16 / sizeof(TYPE);           \
1830     for (uintptr_t s = 0; s < nseg; s++) {                              \
1831         uint16_t pg = g[H2(s)];                                         \
1832         for (uintptr_t e = 0; e < nsegelt; e++, pg >>= sizeof(TYPE)) {  \
1833             if (pg & 1) {                                               \
1834                 tmp[e] = OP(tmp[H(e)], n[s * nsegelt + H(e)]);          \
1835             }                                                           \
1836         }                                                               \
1837     }                                                                   \
1838     memcpy(vd, tmp, 16);                                                \
1839     clear_tail(vd, 16, simd_maxsz(desc));                               \
1840 }
1841 
1842 DO_VPQ(sve2p1_addqv_b, uint8_t, H1, 0, DO_ADD)
1843 DO_VPQ(sve2p1_addqv_h, uint16_t, H2, 0, DO_ADD)
1844 DO_VPQ(sve2p1_addqv_s, uint32_t, H4, 0, DO_ADD)
1845 DO_VPQ(sve2p1_addqv_d, uint64_t, H8, 0, DO_ADD)
1846 
1847 DO_VPQ(sve2p1_smaxqv_b, int8_t, H1, INT8_MIN, DO_MAX)
1848 DO_VPQ(sve2p1_smaxqv_h, int16_t, H2, INT16_MIN, DO_MAX)
1849 DO_VPQ(sve2p1_smaxqv_s, int32_t, H4, INT32_MIN, DO_MAX)
1850 DO_VPQ(sve2p1_smaxqv_d, int64_t, H8, INT64_MIN, DO_MAX)
1851 
1852 DO_VPQ(sve2p1_sminqv_b, int8_t, H1, INT8_MAX, DO_MIN)
1853 DO_VPQ(sve2p1_sminqv_h, int16_t, H2, INT16_MAX, DO_MIN)
1854 DO_VPQ(sve2p1_sminqv_s, int32_t, H4, INT32_MAX, DO_MIN)
1855 DO_VPQ(sve2p1_sminqv_d, int64_t, H8, INT64_MAX, DO_MIN)
1856 
1857 DO_VPQ(sve2p1_umaxqv_b, uint8_t, H1, 0, DO_MAX)
1858 DO_VPQ(sve2p1_umaxqv_h, uint16_t, H2, 0, DO_MAX)
1859 DO_VPQ(sve2p1_umaxqv_s, uint32_t, H4, 0, DO_MAX)
1860 DO_VPQ(sve2p1_umaxqv_d, uint64_t, H8, 0, DO_MAX)
1861 
1862 DO_VPQ(sve2p1_uminqv_b, uint8_t, H1, -1, DO_MIN)
1863 DO_VPQ(sve2p1_uminqv_h, uint16_t, H2, -1, DO_MIN)
1864 DO_VPQ(sve2p1_uminqv_s, uint32_t, H4, -1, DO_MIN)
1865 DO_VPQ(sve2p1_uminqv_d, uint64_t, H8, -1, DO_MIN)
1866 
1867 #undef DO_VPQ
1868 
1869 /* Two vector operand, one scalar operand, unpredicated.  */
1870 #define DO_ZZI(NAME, TYPE, OP)                                       \
1871 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1872 {                                                                    \
1873     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1874     TYPE s = s64, *d = vd, *n = vn;                                  \
1875     for (i = 0; i < opr_sz; ++i) {                                   \
1876         d[i] = OP(n[i], s);                                          \
1877     }                                                                \
1878 }
1879 
1880 #define DO_SUBR(X, Y)   (Y - X)
1881 
1882 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1883 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1884 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1885 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1886 
1887 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1888 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1889 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1890 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1891 
1892 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1893 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1894 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1895 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1896 
1897 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1898 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1899 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1900 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1901 
1902 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1903 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1904 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1905 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1906 
1907 #undef DO_ZZI
1908 
1909 #define DO_LOGIC_QV(NAME, SUFF, INIT, VOP, POP)                         \
1910 void HELPER(NAME ## _ ## SUFF)(void *vd, void *vn, void *vg, uint32_t desc) \
1911 {                                                                       \
1912     unsigned seg = simd_oprsz(desc) / 16;                               \
1913     uint64_t r0 = INIT, r1 = INIT;                                      \
1914     for (unsigned s = 0; s < seg; s++) {                                \
1915         uint64_t p0 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2))); \
1916         uint64_t p1 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2 + 1))); \
1917         uint64_t v0 = *(uint64_t *)(vn + s * 16);                       \
1918         uint64_t v1 = *(uint64_t *)(vn + s * 16 + 8);                   \
1919         v0 = POP(v0, p0), v1 = POP(v1, p1);                             \
1920         r0 = VOP(r0, v0), r1 = VOP(r1, v1);                             \
1921     }                                                                   \
1922     *(uint64_t *)(vd + 0) = r0;                                         \
1923     *(uint64_t *)(vd + 8) = r1;                                         \
1924     clear_tail(vd, 16, simd_maxsz(desc));                               \
1925 }
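/*
 * POP folds the predicate into the vector operand so that inactive
 * elements contribute the identity of VOP: AND with the expanded
 * predicate gives 0 for ORQV/EORQV, while ORC (OR with the complement
 * of the predicate) gives all-ones for ANDQV.
 */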
1926 
1927 DO_LOGIC_QV(sve2p1_orqv, b, 0, DO_ORR, DO_AND)
1928 DO_LOGIC_QV(sve2p1_orqv, h, 0, DO_ORR, DO_AND)
1929 DO_LOGIC_QV(sve2p1_orqv, s, 0, DO_ORR, DO_AND)
1930 DO_LOGIC_QV(sve2p1_orqv, d, 0, DO_ORR, DO_AND)
1931 
1932 DO_LOGIC_QV(sve2p1_eorqv, b, 0, DO_EOR, DO_AND)
1933 DO_LOGIC_QV(sve2p1_eorqv, h, 0, DO_EOR, DO_AND)
1934 DO_LOGIC_QV(sve2p1_eorqv, s, 0, DO_EOR, DO_AND)
1935 DO_LOGIC_QV(sve2p1_eorqv, d, 0, DO_EOR, DO_AND)
1936 
1937 DO_LOGIC_QV(sve2p1_andqv, b, -1, DO_AND, DO_ORC)
1938 DO_LOGIC_QV(sve2p1_andqv, h, -1, DO_AND, DO_ORC)
1939 DO_LOGIC_QV(sve2p1_andqv, s, -1, DO_AND, DO_ORC)
1940 DO_LOGIC_QV(sve2p1_andqv, d, -1, DO_AND, DO_ORC)
1941 
1942 #undef DO_LOGIC_QV
1943 
1944 #undef DO_AND
1945 #undef DO_ORR
1946 #undef DO_EOR
1947 #undef DO_BIC
1948 #undef DO_ORC
1949 #undef DO_ADD
1950 #undef DO_SUB
1951 #undef DO_MAX
1952 #undef DO_MIN
1953 #undef DO_ABD
1954 #undef DO_MUL
1955 #undef DO_DIV
1956 #undef DO_ASR
1957 #undef DO_LSR
1958 #undef DO_LSL
1959 #undef DO_SUBR
1960 
1961 /* Similar to the ARM LastActiveElement pseudocode function, except the
1962    result is multiplied by the element size.  This includes the not found
1963    indication; e.g. not found for esz=3 is -8.  */
1964 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1965 {
1966     uint64_t mask = pred_esz_masks[esz];
1967     intptr_t i = words;
1968 
1969     do {
1970         uint64_t this_g = g[--i] & mask;
1971         if (this_g) {
1972             return i * 64 + (63 - clz64(this_g));
1973         }
1974     } while (i > 0);
1975     return (intptr_t)-1 << esz;
1976 }
1977 
1978 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1979 {
1980     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1981     uint32_t flags = PREDTEST_INIT;
1982     uint64_t *d = vd, *g = vg;
1983     intptr_t i = 0;
1984 
1985     do {
1986         uint64_t this_d = d[i];
1987         uint64_t this_g = g[i];
1988 
1989         if (this_g) {
1990             if (!(flags & 4)) {
1991                 /* Set in D the first bit of G.  */
1992                 this_d |= this_g & -this_g;
1993                 d[i] = this_d;
1994             }
1995             flags = iter_predtest_fwd(this_d, this_g, flags);
1996         }
1997     } while (++i < words);
1998 
1999     return flags;
2000 }
2001 
2002 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
2003 {
2004     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2005     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2006     uint32_t flags = PREDTEST_INIT;
2007     uint64_t *d = vd, *g = vg, esz_mask;
2008     intptr_t i, next;
2009 
2010     next = last_active_element(vd, words, esz) + (1 << esz);
2011     esz_mask = pred_esz_masks[esz];
2012 
2013     /* Similar to the pseudocode for pnext, but scaled by ESZ
2014        so that we find the correct bit.  */
2015     if (next < words * 64) {
2016         uint64_t mask = -1;
2017 
2018         if (next & 63) {
2019             mask = ~((1ull << (next & 63)) - 1);
2020             next &= -64;
2021         }
2022         do {
2023             uint64_t this_g = g[next / 64] & esz_mask & mask;
2024             if (this_g != 0) {
2025                 next = (next & -64) + ctz64(this_g);
2026                 break;
2027             }
2028             next += 64;
2029             mask = -1;
2030         } while (next < words * 64);
2031     }
2032 
2033     i = 0;
2034     do {
2035         uint64_t this_d = 0;
2036         if (i == next / 64) {
2037             this_d = 1ull << (next & 63);
2038         }
2039         d[i] = this_d;
2040         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
2041     } while (++i < words);
2042 
2043     return flags;
2044 }
2045 
2046 /*
2047  * Copy Zn into Zd, and store zero into inactive elements.
2048  * If inv, store zeros into the active elements.
2049  */
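/* Roughly: expand_pred_* widens one predicate bit per element into a
 * per-byte 0x00/0xff mask; XORing with inv (all-ones when inverted)
 * swaps which elements survive the AND with Zn.
 */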
2050 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
2051 {
2052     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2053     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2054     uint64_t *d = vd, *n = vn;
2055     uint8_t *pg = vg;
2056 
2057     for (i = 0; i < opr_sz; i += 1) {
2058         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
2059     }
2060 }
2061 
2062 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
2063 {
2064     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2065     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2066     uint64_t *d = vd, *n = vn;
2067     uint8_t *pg = vg;
2068 
2069     for (i = 0; i < opr_sz; i += 1) {
2070         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
2071     }
2072 }
2073 
2074 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2075 {
2076     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2077     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2078     uint64_t *d = vd, *n = vn;
2079     uint8_t *pg = vg;
2080 
2081     for (i = 0; i < opr_sz; i += 1) {
2082         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2083     }
2084 }
2085 
2086 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2087 {
2088     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2089     uint64_t *d = vd, *n = vn;
2090     uint8_t *pg = vg;
2091     uint8_t inv = simd_data(desc);
2092 
2093     for (i = 0; i < opr_sz; i += 1) {
2094         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2095     }
2096 }
2097 
2098 /* Three-operand expander, immediate operand, controlled by a predicate.
2099  */
2100 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
2101 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2102 {                                                               \
2103     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2104     TYPE imm = simd_data(desc);                                 \
2105     for (i = 0; i < opr_sz; ) {                                 \
2106         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2107         do {                                                    \
2108             if (pg & 1) {                                       \
2109                 TYPE nn = *(TYPE *)(vn + H(i));                 \
2110                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2111             }                                                   \
2112             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2113         } while (i & 15);                                       \
2114     }                                                           \
2115 }
2116 
2117 /* Similarly, specialized for 64-bit operands.  */
2118 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
2119 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2120 {                                                               \
2121     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2122     TYPE *d = vd, *n = vn;                                      \
2123     TYPE imm = simd_data(desc);                                 \
2124     uint8_t *pg = vg;                                           \
2125     for (i = 0; i < opr_sz; i += 1) {                           \
2126         if (pg[H1(i)] & 1) {                                    \
2127             TYPE nn = n[i];                                     \
2128             d[i] = OP(nn, imm);                                 \
2129         }                                                       \
2130     }                                                           \
2131 }
2132 
2133 #define DO_SHR(N, M)  (N >> M)
2134 #define DO_SHL(N, M)  (N << M)
2135 
2136 /* Arithmetic shift right for division.  This rounds negative numbers
2137    toward zero as per signed division.  Therefore before shifting,
2138    when N is negative, add 2**M-1.  */
2139 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
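/* E.g. N = -7, M = 2: (-7 + 3) >> 2 = -1, matching -7 / 4 truncated
   toward zero, where a plain arithmetic shift would give -2.  */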
2140 
2141 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2142 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2143 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2144 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2145 
2146 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2147 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2148 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2149 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2150 
2151 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2152 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2153 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2154 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2155 
2156 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2157 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2158 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2159 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2160 
2161 /* SVE2 bitwise shift by immediate */
2162 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2163 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2164 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2165 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2166 
2167 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2168 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2169 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2170 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2171 
2172 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2173 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2174 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2175 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2176 
2177 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2178 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2179 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2180 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2181 
2182 #define do_suqrshl_b(n, m) \
2183    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2184 #define do_suqrshl_h(n, m) \
2185    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2186 #define do_suqrshl_s(n, m) \
2187    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2188 #define do_suqrshl_d(n, m) \
2189    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2190 
2191 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2192 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2193 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2194 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2195 
2196 #undef DO_ASRD
2197 #undef DO_ZPZI
2198 #undef DO_ZPZI_D
2199 
2200 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2201 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2202 {                                                            \
2203     intptr_t i, opr_sz = simd_oprsz(desc);                   \
2204     int shift = simd_data(desc);                             \
2205     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2206         TYPEW nn = *(TYPEW *)(vn + i);                       \
2207         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2208     }                                                        \
2209 }
2210 
2211 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2212 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2213 {                                                                 \
2214     intptr_t i, opr_sz = simd_oprsz(desc);                        \
2215     int shift = simd_data(desc);                                  \
2216     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2217         TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2218         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2219     }                                                             \
2220 }
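/* Roughly: the *NB (bottom) forms store the narrowed result zero-extended
 * across the whole wide element, clearing the odd narrow slots, while the
 * *NT (top) forms store only into the odd narrow slots, merging with the
 * even slots already in Zd.
 */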
2221 
2222 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2223 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2224 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2225 
2226 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2227 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2228 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2229 
2230 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2231 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2232 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2233 
2234 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2235 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2236 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2237 
2238 #define DO_SQSHRUN_H(x, sh) do_usat_b((int64_t)(x) >> sh)
2239 #define DO_SQSHRUN_S(x, sh) do_usat_h((int64_t)(x) >> sh)
2240 #define DO_SQSHRUN_D(x, sh) do_usat_s((int64_t)(x) >> (sh < 64 ? sh : 63))
2241 
2242 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2243 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2244 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2245 
2246 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2247 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2248 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2249 
2250 #define DO_SQRSHRUN_H(x, sh) do_usat_b(do_srshr(x, sh))
2251 #define DO_SQRSHRUN_S(x, sh) do_usat_h(do_srshr(x, sh))
2252 #define DO_SQRSHRUN_D(x, sh) do_usat_s(do_srshr(x, sh))
2253 
2254 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2255 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2256 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2257 
2258 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2259 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2260 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2261 
2262 #define DO_SQSHRN_H(x, sh) do_ssat_b(x >> sh)
2263 #define DO_SQSHRN_S(x, sh) do_ssat_h(x >> sh)
2264 #define DO_SQSHRN_D(x, sh) do_ssat_s(x >> sh)
2265 
2266 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2267 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2268 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2269 
2270 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2271 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2272 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2273 
2274 #define DO_SQRSHRN_H(x, sh) do_ssat_b(do_srshr(x, sh))
2275 #define DO_SQRSHRN_S(x, sh) do_ssat_h(do_srshr(x, sh))
2276 #define DO_SQRSHRN_D(x, sh) do_ssat_s(do_srshr(x, sh))
2277 
2278 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2279 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2280 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2281 
2282 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2283 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2284 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2285 
2286 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2287 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2288 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2289 
2290 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2291 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2292 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2293 
2294 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2295 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2296 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2297 
2298 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2299 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2300 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2301 
2302 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2303 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2304 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2305 
2306 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2307 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2308 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2309 
2310 #undef DO_SHRNB
2311 #undef DO_SHRNT
2312 
2313 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2314 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2315 {                                                                           \
2316     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2317     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2318         TYPEW nn = *(TYPEW *)(vn + i);                                      \
2319         TYPEW mm = *(TYPEW *)(vm + i);                                      \
2320         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2321     }                                                                       \
2322 }
2323 
2324 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2325 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2326 {                                                                           \
2327     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2328     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2329         TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2330         TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2331         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2332     }                                                                       \
2333 }
2334 
2335 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2336 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2337 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2338 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
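/* E.g. for the _h forms (SHIFT = 8): ADDHNB keeps the high byte of the
 * 16-bit sum, (N + M) >> 8; the R (rounding) variants add 1 << 7 first
 * so the discarded low half rounds to nearest.
 */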
2339 
2340 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2341 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2342 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2343 
2344 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2345 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2346 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2347 
2348 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2349 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2350 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2351 
2352 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2353 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2354 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2355 
2356 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2357 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2358 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2359 
2360 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2361 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2362 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2363 
2364 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2365 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2366 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2367 
2368 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2369 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2370 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2371 
2372 #undef DO_RSUBHN
2373 #undef DO_SUBHN
2374 #undef DO_RADDHN
2375 #undef DO_ADDHN
2376 
2377 #undef DO_BINOPNB
2378 
2379 /* Fully general four-operand expander, controlled by a predicate.
2380  */
2381 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2382 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2383                   void *vg, uint32_t desc)                    \
2384 {                                                             \
2385     intptr_t i, opr_sz = simd_oprsz(desc);                    \
2386     for (i = 0; i < opr_sz; ) {                               \
2387         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2388         do {                                                  \
2389             if (pg & 1) {                                     \
2390                 TYPE nn = *(TYPE *)(vn + H(i));               \
2391                 TYPE mm = *(TYPE *)(vm + H(i));               \
2392                 TYPE aa = *(TYPE *)(va + H(i));               \
2393                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2394             }                                                 \
2395             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2396         } while (i & 15);                                     \
2397     }                                                         \
2398 }
2399 
2400 /* Similarly, specialized for 64-bit operands.  */
2401 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2402 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2403                   void *vg, uint32_t desc)                    \
2404 {                                                             \
2405     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2406     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2407     uint8_t *pg = vg;                                         \
2408     for (i = 0; i < opr_sz; i += 1) {                         \
2409         if (pg[H1(i)] & 1) {                                  \
2410             TYPE aa = a[i], nn = n[i], mm = m[i];             \
2411             d[i] = OP(aa, nn, mm);                            \
2412         }                                                     \
2413     }                                                         \
2414 }
2415 
2416 #define DO_MLA(A, N, M)  (A + N * M)
2417 #define DO_MLS(A, N, M)  (A - N * M)
2418 
2419 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2420 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2421 
2422 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2423 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2424 
2425 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2426 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2427 
2428 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2429 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2430 
2431 #undef DO_MLA
2432 #undef DO_MLS
2433 #undef DO_ZPZZZ
2434 #undef DO_ZPZZZ_D
2435 
2436 void HELPER(sve_index_b)(void *vd, uint32_t start,
2437                          uint32_t incr, uint32_t desc)
2438 {
2439     intptr_t i, opr_sz = simd_oprsz(desc);
2440     uint8_t *d = vd;
2441     for (i = 0; i < opr_sz; i += 1) {
2442         d[H1(i)] = start + i * incr;
2443     }
2444 }
2445 
2446 void HELPER(sve_index_h)(void *vd, uint32_t start,
2447                          uint32_t incr, uint32_t desc)
2448 {
2449     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2450     uint16_t *d = vd;
2451     for (i = 0; i < opr_sz; i += 1) {
2452         d[H2(i)] = start + i * incr;
2453     }
2454 }
2455 
2456 void HELPER(sve_index_s)(void *vd, uint32_t start,
2457                          uint32_t incr, uint32_t desc)
2458 {
2459     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2460     uint32_t *d = vd;
2461     for (i = 0; i < opr_sz; i += 1) {
2462         d[H4(i)] = start + i * incr;
2463     }
2464 }
2465 
2466 void HELPER(sve_index_d)(void *vd, uint64_t start,
2467                          uint64_t incr, uint32_t desc)
2468 {
2469     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2470     uint64_t *d = vd;
2471     for (i = 0; i < opr_sz; i += 1) {
2472         d[i] = start + i * incr;
2473     }
2474 }
2475 
2476 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2477 {
2478     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2479     uint32_t sh = simd_data(desc);
2480     uint32_t *d = vd, *n = vn, *m = vm;
2481     for (i = 0; i < opr_sz; i += 1) {
2482         d[i] = n[i] + (m[i] << sh);
2483     }
2484 }
2485 
2486 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2487 {
2488     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2489     uint64_t sh = simd_data(desc);
2490     uint64_t *d = vd, *n = vn, *m = vm;
2491     for (i = 0; i < opr_sz; i += 1) {
2492         d[i] = n[i] + (m[i] << sh);
2493     }
2494 }
2495 
2496 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2497 {
2498     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2499     uint64_t sh = simd_data(desc);
2500     uint64_t *d = vd, *n = vn, *m = vm;
2501     for (i = 0; i < opr_sz; i += 1) {
2502         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2503     }
2504 }
2505 
2506 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2507 {
2508     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2509     uint64_t sh = simd_data(desc);
2510     uint64_t *d = vd, *n = vn, *m = vm;
2511     for (i = 0; i < opr_sz; i += 1) {
2512         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2513     }
2514 }
2515 
2516 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2517 {
2518     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
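    /* coeff[i] holds the fraction bits of 2^(i/32) (2^(i/64) for the
       32-bit and 64-bit variants below); the remaining input bits are
       copied straight into the exponent field.  */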
2519     static const uint16_t coeff[] = {
2520         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2521         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2522         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2523         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2524     };
2525     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2526     uint16_t *d = vd, *n = vn;
2527 
2528     for (i = 0; i < opr_sz; i++) {
2529         uint16_t nn = n[i];
2530         intptr_t idx = extract32(nn, 0, 5);
2531         uint16_t exp = extract32(nn, 5, 5);
2532         d[i] = coeff[idx] | (exp << 10);
2533     }
2534 }
2535 
2536 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2537 {
2538     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2539     static const uint32_t coeff[] = {
2540         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2541         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2542         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2543         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2544         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2545         0x1ef532, 0x20b051, 0x227043, 0x243516,
2546         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2547         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2548         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2549         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2550         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2551         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2552         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2553         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2554         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2555         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2556     };
2557     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2558     uint32_t *d = vd, *n = vn;
2559 
2560     for (i = 0; i < opr_sz; i++) {
2561         uint32_t nn = n[i];
2562         intptr_t idx = extract32(nn, 0, 6);
2563         uint32_t exp = extract32(nn, 6, 8);
2564         d[i] = coeff[idx] | (exp << 23);
2565     }
2566 }
2567 
2568 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2569 {
2570     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2571     static const uint64_t coeff[] = {
2572         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2573         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2574         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2575         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2576         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2577         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2578         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2579         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2580         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2581         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2582         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2583         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2584         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2585         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2586         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2587         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2588         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2589         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2590         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2591         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2592         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2593         0xFA7C1819E90D8ull,
2594     };
2595     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2596     uint64_t *d = vd, *n = vn;
2597 
2598     for (i = 0; i < opr_sz; i++) {
2599         uint64_t nn = n[i];
2600         intptr_t idx = extract32(nn, 0, 6);
2601         uint64_t exp = extract32(nn, 6, 11);
2602         d[i] = coeff[idx] | (exp << 52);
2603     }
2604 }
2605 
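/*
 * FTSSEL: bit 0 of each M element substitutes 1.0 for the N element;
 * bit 1 flips the sign via float*_maybe_ah_chs, which honours FPCR.AH.
 */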
2606 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2607 {
2608     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2609     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2610     uint16_t *d = vd, *n = vn, *m = vm;
2611     for (i = 0; i < opr_sz; i += 1) {
2612         uint16_t nn = n[i];
2613         uint16_t mm = m[i];
2614         if (mm & 1) {
2615             nn = float16_one;
2616         }
2617         if (mm & 2) {
2618             nn = float16_maybe_ah_chs(nn, fpcr_ah);
2619         }
2620         d[i] = nn;
2621     }
2622 }
2623 
2624 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2625 {
2626     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2627     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2628     uint32_t *d = vd, *n = vn, *m = vm;
2629     for (i = 0; i < opr_sz; i += 1) {
2630         uint32_t nn = n[i];
2631         uint32_t mm = m[i];
2632         if (mm & 1) {
2633             nn = float32_one;
2634         }
2635         if (mm & 2) {
2636             nn = float32_maybe_ah_chs(nn, fpcr_ah);
2637         }
2638         d[i] = nn;
2639     }
2640 }
2641 
2642 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2643 {
2644     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2645     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2646     uint64_t *d = vd, *n = vn, *m = vm;
2647     for (i = 0; i < opr_sz; i += 1) {
2648         uint64_t nn = n[i];
2649         uint64_t mm = m[i];
2650         if (mm & 1) {
2651             nn = float64_one;
2652         }
2653         if (mm & 2) {
2654             nn = float64_maybe_ah_chs(nn, fpcr_ah);
2655         }
2656         d[i] = nn;
2657     }
2658 }
2659 
2660 /*
2661  * Signed saturating addition with scalar operand.
2662  */
2663 
2664 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2665 {
2666     intptr_t i, oprsz = simd_oprsz(desc);
2667 
2668     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2669         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2670     }
2671 }
2672 
2673 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2674 {
2675     intptr_t i, oprsz = simd_oprsz(desc);
2676 
2677     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2678         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2679     }
2680 }
2681 
2682 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2683 {
2684     intptr_t i, oprsz = simd_oprsz(desc);
2685 
2686     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2687         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2688     }
2689 }
2690 
2691 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2692 {
2693     intptr_t i, oprsz = simd_oprsz(desc);
2694 
2695     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2696         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2697     }
2698 }
2699 
2700 /*
2701  * Unsigned saturating addition with scalar operand.
2702  */
2703 
2704 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2705 {
2706     intptr_t i, oprsz = simd_oprsz(desc);
2707 
2708     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2709         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2710     }
2711 }
2712 
2713 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2714 {
2715     intptr_t i, oprsz = simd_oprsz(desc);
2716 
2717     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2718         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2719     }
2720 }
2721 
2722 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2723 {
2724     intptr_t i, oprsz = simd_oprsz(desc);
2725 
2726     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2727         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2728     }
2729 }
2730 
2731 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2732 {
2733     intptr_t i, oprsz = simd_oprsz(desc);
2734 
2735     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2736         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2737     }
2738 }
2739 
2740 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2741 {
2742     intptr_t i, oprsz = simd_oprsz(desc);
2743 
2744     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2745         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2746     }
2747 }
2748 
2749 /* Two operand predicated copy immediate with merge.  All valid immediates
2750  * can fit within 17 signed bits in the simd_data field.
2751  */
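/* expand_pred_b() and friends widen each predicate bit into a full
 * 0x00/0xff element mask, so the merge below is a plain bitwise select.
 */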
2752 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2753                          uint64_t mm, uint32_t desc)
2754 {
2755     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2756     uint64_t *d = vd, *n = vn;
2757     uint8_t *pg = vg;
2758 
2759     mm = dup_const(MO_8, mm);
2760     for (i = 0; i < opr_sz; i += 1) {
2761         uint64_t nn = n[i];
2762         uint64_t pp = expand_pred_b(pg[H1(i)]);
2763         d[i] = (mm & pp) | (nn & ~pp);
2764     }
2765 }
2766 
2767 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2768                          uint64_t mm, uint32_t desc)
2769 {
2770     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2771     uint64_t *d = vd, *n = vn;
2772     uint8_t *pg = vg;
2773 
2774     mm = dup_const(MO_16, mm);
2775     for (i = 0; i < opr_sz; i += 1) {
2776         uint64_t nn = n[i];
2777         uint64_t pp = expand_pred_h(pg[H1(i)]);
2778         d[i] = (mm & pp) | (nn & ~pp);
2779     }
2780 }
2781 
2782 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2783                          uint64_t mm, uint32_t desc)
2784 {
2785     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2786     uint64_t *d = vd, *n = vn;
2787     uint8_t *pg = vg;
2788 
2789     mm = dup_const(MO_32, mm);
2790     for (i = 0; i < opr_sz; i += 1) {
2791         uint64_t nn = n[i];
2792         uint64_t pp = expand_pred_s(pg[H1(i)]);
2793         d[i] = (mm & pp) | (nn & ~pp);
2794     }
2795 }
2796 
2797 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2798                          uint64_t mm, uint32_t desc)
2799 {
2800     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2801     uint64_t *d = vd, *n = vn;
2802     uint8_t *pg = vg;
2803 
2804     for (i = 0; i < opr_sz; i += 1) {
2805         uint64_t nn = n[i];
2806         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2807     }
2808 }
2809 
2810 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2811 {
2812     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2813     uint64_t *d = vd;
2814     uint8_t *pg = vg;
2815 
2816     val = dup_const(MO_8, val);
2817     for (i = 0; i < opr_sz; i += 1) {
2818         d[i] = val & expand_pred_b(pg[H1(i)]);
2819     }
2820 }
2821 
2822 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2823 {
2824     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2825     uint64_t *d = vd;
2826     uint8_t *pg = vg;
2827 
2828     val = dup_const(MO_16, val);
2829     for (i = 0; i < opr_sz; i += 1) {
2830         d[i] = val & expand_pred_h(pg[H1(i)]);
2831     }
2832 }
2833 
2834 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2835 {
2836     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2837     uint64_t *d = vd;
2838     uint8_t *pg = vg;
2839 
2840     val = dup_const(MO_32, val);
2841     for (i = 0; i < opr_sz; i += 1) {
2842         d[i] = val & expand_pred_s(pg[H1(i)]);
2843     }
2844 }
2845 
2846 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2847 {
2848     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2849     uint64_t *d = vd;
2850     uint8_t *pg = vg;
2851 
2852     for (i = 0; i < opr_sz; i += 1) {
2853         d[i] = (pg[H1(i)] & 1 ? val : 0);
2854     }
2855 }
2856 
2857 /* Big-endian hosts need to frob the byte indices.  If the copy
2858  * happens to be 8-byte aligned, then no frobbing necessary.
2859  */
2860 static void swap_memmove(void *vd, void *vs, size_t n)
2861 {
2862     uintptr_t d = (uintptr_t)vd;
2863     uintptr_t s = (uintptr_t)vs;
2864     uintptr_t o = (d | s | n) & 7;
2865     size_t i;
2866 
2867 #if !HOST_BIG_ENDIAN
2868     o = 0;
2869 #endif
2870     switch (o) {
2871     case 0:
2872         memmove(vd, vs, n);
2873         break;
2874 
2875     case 4:
2876         if (d < s || d >= s + n) {
2877             for (i = 0; i < n; i += 4) {
2878                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2879             }
2880         } else {
2881             for (i = n; i > 0; ) {
2882                 i -= 4;
2883                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2884             }
2885         }
2886         break;
2887 
2888     case 2:
2889     case 6:
2890         if (d < s || d >= s + n) {
2891             for (i = 0; i < n; i += 2) {
2892                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2893             }
2894         } else {
2895             for (i = n; i > 0; ) {
2896                 i -= 2;
2897                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2898             }
2899         }
2900         break;
2901 
2902     default:
2903         if (d < s || d >= s + n) {
2904             for (i = 0; i < n; i++) {
2905                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2906             }
2907         } else {
2908             for (i = n; i > 0; ) {
2909                 i -= 1;
2910                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2911             }
2912         }
2913         break;
2914     }
2915 }
2916 
2917 /* Similarly for memset of 0.  */
2918 static void swap_memzero(void *vd, size_t n)
2919 {
2920     uintptr_t d = (uintptr_t)vd;
2921     uintptr_t o = (d | n) & 7;
2922     size_t i;
2923 
2924     /* Usually, the first bit of a predicate is set, so N is 0.  */
2925     if (likely(n == 0)) {
2926         return;
2927     }
2928 
2929 #if !HOST_BIG_ENDIAN
2930     o = 0;
2931 #endif
2932     switch (o) {
2933     case 0:
2934         memset(vd, 0, n);
2935         break;
2936 
2937     case 4:
2938         for (i = 0; i < n; i += 4) {
2939             *(uint32_t *)H1_4(d + i) = 0;
2940         }
2941         break;
2942 
2943     case 2:
2944     case 6:
2945         for (i = 0; i < n; i += 2) {
2946             *(uint16_t *)H1_2(d + i) = 0;
2947         }
2948         break;
2949 
2950     default:
2951         for (i = 0; i < n; i++) {
2952             *(uint8_t *)H1(d + i) = 0;
2953         }
2954         break;
2955     }
2956 }
2957 
2958 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2959 {
2960     intptr_t opr_sz = simd_oprsz(desc);
2961     size_t n_ofs = simd_data(desc);
2962     size_t n_siz = opr_sz - n_ofs;
2963 
2964     if (vd != vm) {
2965         swap_memmove(vd, vn + n_ofs, n_siz);
2966         swap_memmove(vd + n_siz, vm, n_ofs);
2967     } else if (vd != vn) {
2968         swap_memmove(vd + n_siz, vd, n_ofs);
2969         swap_memmove(vd, vn + n_ofs, n_siz);
2970     } else {
2971         /* vd == vn == vm.  Need temp space.  */
2972         ARMVectorReg tmp;
2973         swap_memmove(&tmp, vm, n_ofs);
2974         swap_memmove(vd, vd + n_ofs, n_siz);
2975         memcpy(vd + n_siz, &tmp, n_ofs);
2976     }
2977 }
2978 
2979 #define DO_INSR(NAME, TYPE, H) \
2980 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2981 {                                                                  \
2982     intptr_t opr_sz = simd_oprsz(desc);                            \
2983     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2984     *(TYPE *)(vd + H(0)) = val;                                    \
2985 }
2986 
2987 DO_INSR(sve_insr_b, uint8_t, H1)
2988 DO_INSR(sve_insr_h, uint16_t, H1_2)
2989 DO_INSR(sve_insr_s, uint32_t, H1_4)
2990 DO_INSR(sve_insr_d, uint64_t, H1_8)
2991 
2992 #undef DO_INSR
2993 
2994 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2995 {
2996     intptr_t i, j, opr_sz = simd_oprsz(desc);
2997     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2998         uint64_t f = *(uint64_t *)(vn + i);
2999         uint64_t b = *(uint64_t *)(vn + j);
3000         *(uint64_t *)(vd + i) = bswap64(b);
3001         *(uint64_t *)(vd + j) = bswap64(f);
3002     }
3003 }
3004 
3005 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
3006 {
3007     intptr_t i, j, opr_sz = simd_oprsz(desc);
3008     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3009         uint64_t f = *(uint64_t *)(vn + i);
3010         uint64_t b = *(uint64_t *)(vn + j);
3011         *(uint64_t *)(vd + i) = hswap64(b);
3012         *(uint64_t *)(vd + j) = hswap64(f);
3013     }
3014 }
3015 
3016 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
3017 {
3018     intptr_t i, j, opr_sz = simd_oprsz(desc);
3019     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3020         uint64_t f = *(uint64_t *)(vn + i);
3021         uint64_t b = *(uint64_t *)(vn + j);
3022         *(uint64_t *)(vd + i) = rol64(b, 32);
3023         *(uint64_t *)(vd + j) = rol64(f, 32);
3024     }
3025 }
3026 
3027 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
3028 {
3029     intptr_t i, j, opr_sz = simd_oprsz(desc);
3030     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3031         uint64_t f = *(uint64_t *)(vn + i);
3032         uint64_t b = *(uint64_t *)(vn + j);
3033         *(uint64_t *)(vd + i) = b;
3034         *(uint64_t *)(vd + j) = f;
3035     }
3036 }
3037 
3038 /*
3039  * TODO: This could use half_shuffle64 and similar bit tricks to
3040  * expand blocks of bits at once.
3041  */
3042 #define DO_PMOV_PV(NAME, ESIZE)                                 \
3043 void HELPER(NAME)(void *vd, void *vs, uint32_t desc)            \
3044 {                                                               \
3045     unsigned vl = simd_oprsz(desc);                             \
3046     unsigned idx = simd_data(desc);                             \
3047     unsigned elements = vl / ESIZE;                             \
3048     ARMPredicateReg *d = vd;                                    \
3049     ARMVectorReg *s = vs;                                       \
3050     memset(d, 0, sizeof(*d));                                   \
3051     for (unsigned e = 0; e < elements; ++e) {                   \
3052         depositn(d->p, e * ESIZE, 1, extractn(s->d, elements * idx + e, 1)); \
3053     }                                                           \
3054 }
3055 
3056 DO_PMOV_PV(pmov_pv_h, 2)
3057 DO_PMOV_PV(pmov_pv_s, 4)
3058 DO_PMOV_PV(pmov_pv_d, 8)
3059 
3060 #undef DO_PMOV_PV
3061 
3062 /*
3063  * TODO: This could use half_unshuffle64 and similar bit tricks to
3064  * compress blocks of bits at once.
3065  */
3066 #define DO_PMOV_VP(NAME, ESIZE)                                 \
3067 void HELPER(NAME)(void *vd, void *vs, uint32_t desc)            \
3068 {                                                               \
3069     unsigned vl = simd_oprsz(desc);                             \
3070     unsigned idx = simd_data(desc);                             \
3071     unsigned elements = vl / ESIZE;                             \
3072     ARMVectorReg *d = vd;                                       \
3073     ARMPredicateReg *s = vs;                                    \
3074     if (idx == 0) {                                             \
3075         memset(d, 0, vl);                                       \
3076     }                                                           \
3077     for (unsigned e = 0; e < elements; ++e) {                   \
3078         depositn(d->d, elements * idx + e, 1, extractn(s->p, e * ESIZE, 1)); \
3079     }                                                           \
3080 }
3081 
3082 DO_PMOV_VP(pmov_vp_h, 2)
3083 DO_PMOV_VP(pmov_vp_s, 4)
3084 DO_PMOV_VP(pmov_vp_d, 8)
3085 
3086 #undef DO_PMOV_VP
3087 
3088 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
3089 
3090 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
3091                            bool is_tbx, tb_impl_fn *fn)
3092 {
3093     ARMVectorReg scratch;
3094     uintptr_t oprsz = simd_oprsz(desc);
3095 
3096     if (unlikely(vd == vn)) {
3097         vn = memcpy(&scratch, vn, oprsz);
3098     }
3099 
3100     fn(vd, vn, NULL, vm, oprsz, is_tbx);
3101 }
3102 
3103 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3104                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3105 {
3106     ARMVectorReg scratch;
3107     uintptr_t oprsz = simd_oprsz(desc);
3108 
3109     if (unlikely(vd == vn0)) {
3110         vn0 = memcpy(&scratch, vn0, oprsz);
3111         if (vd == vn1) {
3112             vn1 = vn0;
3113         }
3114     } else if (unlikely(vd == vn1)) {
3115         vn1 = memcpy(&scratch, vn1, oprsz);
3116     }
3117 
3118     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3119 }
3120 
3121 #define DO_TB(SUFF, TYPE, H)                                            \
3122 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
3123                                 void *vm, uintptr_t oprsz, bool is_tbx) \
3124 {                                                                       \
3125     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
3126     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
3127     for (i = 0; i < nelem; ++i) {                                       \
3128         TYPE index = indexes[H1(i)], val = 0;                           \
3129         if (index < nelem) {                                            \
3130             val = tbl0[H(index)];                                       \
3131         } else {                                                        \
3132             index -= nelem;                                             \
3133             if (tbl1 && index < nelem) {                                \
3134                 val = tbl1[H(index)];                                   \
3135             } else if (is_tbx) {                                        \
3136                 continue;                                               \
3137             }                                                           \
3138         }                                                               \
3139         d[H(i)] = val;                                                  \
3140     }                                                                   \
3141 }                                                                       \
3142 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3143 {                                                                       \
3144     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3145 }                                                                       \
3146 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3147                              void *vm, uint32_t desc)                   \
3148 {                                                                       \
3149     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3150 }                                                                       \
3151 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3152 {                                                                       \
3153     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3154 }
3155 
3156 DO_TB(b, uint8_t, H1)
3157 DO_TB(h, uint16_t, H2)
3158 DO_TB(s, uint32_t, H4)
3159 DO_TB(d, uint64_t, H8)
3160 
3161 #undef DO_TB
3162 
3163 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3164 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3165 {                                                              \
3166     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3167     TYPED *d = vd;                                             \
3168     TYPES *n = vn;                                             \
3169     ARMVectorReg tmp;                                          \
3170     if (unlikely(vn - vd < opr_sz)) {                          \
3171         n = memcpy(&tmp, n, opr_sz / 2);                       \
3172     }                                                          \
3173     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3174         d[HD(i)] = n[HS(i)];                                   \
3175     }                                                          \
3176 }
3177 
3178 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3179 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3180 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3181 
3182 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3183 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3184 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3185 
3186 #undef DO_UNPK
3187 
3188 /* Mask of bits included in the even numbered predicates of width esz.
3189  * We also use this for expand_bits/compress_bits, and so extend the
3190  * same pattern out to 16-bit units.
3191  */
3192 static const uint64_t even_bit_esz_masks[5] = {
3193     0x5555555555555555ull,
3194     0x3333333333333333ull,
3195     0x0f0f0f0f0f0f0f0full,
3196     0x00ff00ff00ff00ffull,
3197     0x0000ffff0000ffffull,
3198 };
3199 
3200 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3201  * For N==0, this corresponds to the operation that in qemu/bitops.h
3202  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3203  * section 7-2 Shuffling Bits.
3204  */
3205 static uint64_t expand_bits(uint64_t x, int n)
3206 {
3207     int i;
3208 
3209     x &= 0xffffffffu;
3210     for (i = 4; i >= n; i--) {
3211         int sh = 1 << i;
3212         x = ((x << sh) | x) & even_bit_esz_masks[i];
3213     }
3214     return x;
3215 }
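/* For example, expand_bits(0x0000000b, 0) == 0x45: source bit i lands at
 * bit 2*i with zeros interleaved; larger N interleave whole 2**N-bit units.
 */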
3216 
3217 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3218  * For N==0, this corresponds to the operation that in qemu/bitops.h
3219  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3220  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3221  */
3222 static uint64_t compress_bits(uint64_t x, int n)
3223 {
3224     int i;
3225 
3226     for (i = n; i <= 4; i++) {
3227         int sh = 1 << i;
3228         x &= even_bit_esz_masks[i];
3229         x = (x >> sh) | x;
3230     }
3231     return x & 0xffffffffu;
3232 }
3233 
3234 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3235 {
3236     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3237     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3238     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3239     int esize = 1 << esz;
3240     uint64_t *d = vd;
3241     intptr_t i;
3242 
3243     if (oprsz <= 8) {
3244         uint64_t nn = *(uint64_t *)vn;
3245         uint64_t mm = *(uint64_t *)vm;
3246         int half = 4 * oprsz;
3247 
3248         nn = extract64(nn, high * half, half);
3249         mm = extract64(mm, high * half, half);
3250         nn = expand_bits(nn, esz);
3251         mm = expand_bits(mm, esz);
3252         d[0] = nn | (mm << esize);
3253     } else {
3254         ARMPredicateReg tmp;
3255 
3256         /* We produce output faster than we consume input.
3257            Therefore we must be mindful of possible overlap.  */
3258         if (vd == vn) {
3259             vn = memcpy(&tmp, vn, oprsz);
3260             if (vd == vm) {
3261                 vm = vn;
3262             }
3263         } else if (vd == vm) {
3264             vm = memcpy(&tmp, vm, oprsz);
3265         }
3266         if (high) {
3267             high = oprsz >> 1;
3268         }
3269 
3270         if ((oprsz & 7) == 0) {
3271             uint32_t *n = vn, *m = vm;
3272             high >>= 2;
3273 
3274             for (i = 0; i < oprsz / 8; i++) {
3275                 uint64_t nn = n[H4(high + i)];
3276                 uint64_t mm = m[H4(high + i)];
3277 
3278                 nn = expand_bits(nn, esz);
3279                 mm = expand_bits(mm, esz);
3280                 d[i] = nn | (mm << esize);
3281             }
3282         } else {
3283             uint8_t *n = vn, *m = vm;
3284             uint16_t *d16 = vd;
3285 
3286             for (i = 0; i < oprsz / 2; i++) {
3287                 uint16_t nn = n[H1(high + i)];
3288                 uint16_t mm = m[H1(high + i)];
3289 
3290                 nn = expand_bits(nn, esz);
3291                 mm = expand_bits(mm, esz);
3292                 d16[H2(i)] = nn | (mm << esize);
3293             }
3294         }
3295     }
3296 }
3297 
3298 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3299 {
3300     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3301     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3302     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3303     uint64_t *d = vd, *n = vn, *m = vm;
3304     uint64_t l, h;
3305     intptr_t i;
3306 
3307     if (oprsz <= 8) {
3308         l = compress_bits(n[0] >> odd, esz);
3309         h = compress_bits(m[0] >> odd, esz);
3310         d[0] = l | (h << (4 * oprsz));
3311     } else {
3312         ARMPredicateReg tmp_m;
3313         intptr_t oprsz_16 = oprsz / 16;
3314 
3315         if ((vm - vd) < (uintptr_t)oprsz) {
3316             m = memcpy(&tmp_m, vm, oprsz);
3317         }
3318 
3319         for (i = 0; i < oprsz_16; i++) {
3320             l = n[2 * i + 0];
3321             h = n[2 * i + 1];
3322             l = compress_bits(l >> odd, esz);
3323             h = compress_bits(h >> odd, esz);
3324             d[i] = l | (h << 32);
3325         }
3326 
3327         /*
3328          * For VL which is not a multiple of 512, the results from M do not
3329          * align nicely with the uint64_t for D.  Put the aligned results
3330          * from M into TMP_M and then copy it into place afterward.
3331          */
3332         if (oprsz & 15) {
3333             int final_shift = (oprsz & 15) * 2;
3334 
3335             l = n[2 * i + 0];
3336             h = n[2 * i + 1];
3337             l = compress_bits(l >> odd, esz);
3338             h = compress_bits(h >> odd, esz);
3339             d[i] = l | (h << final_shift);
3340 
3341             for (i = 0; i < oprsz_16; i++) {
3342                 l = m[2 * i + 0];
3343                 h = m[2 * i + 1];
3344                 l = compress_bits(l >> odd, esz);
3345                 h = compress_bits(h >> odd, esz);
3346                 tmp_m.p[i] = l | (h << 32);
3347             }
3348             l = m[2 * i + 0];
3349             h = m[2 * i + 1];
3350             l = compress_bits(l >> odd, esz);
3351             h = compress_bits(h >> odd, esz);
3352             tmp_m.p[i] = l | (h << final_shift);
3353 
3354             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3355         } else {
3356             for (i = 0; i < oprsz_16; i++) {
3357                 l = m[2 * i + 0];
3358                 h = m[2 * i + 1];
3359                 l = compress_bits(l >> odd, esz);
3360                 h = compress_bits(h >> odd, esz);
3361                 d[oprsz_16 + i] = l | (h << 32);
3362             }
3363         }
3364     }
3365 }
3366 
3367 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3368 {
3369     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3370     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3371     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3372     uint64_t *d = vd, *n = vn, *m = vm;
3373     uint64_t mask;
3374     int shr, shl;
3375     intptr_t i;
3376 
3377     shl = 1 << esz;
3378     shr = 0;
3379     mask = even_bit_esz_masks[esz];
3380     if (odd) {
3381         mask <<= shl;
3382         shr = shl;
3383         shl = 0;
3384     }
3385 
3386     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3387         uint64_t nn = (n[i] & mask) >> shr;
3388         uint64_t mm = (m[i] & mask) << shl;
3389         d[i] = nn + mm;
3390     }
3391 }
3392 
3393 /* Reverse units of 2**N bits.  */
3394 static uint64_t reverse_bits_64(uint64_t x, int n)
3395 {
3396     int i, sh;
3397 
3398     x = bswap64(x);
3399     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3400         uint64_t mask = even_bit_esz_masks[i];
3401         x = ((x & mask) << sh) | ((x >> sh) & mask);
3402     }
3403     return x;
3404 }
3405 
3406 static uint8_t reverse_bits_8(uint8_t x, int n)
3407 {
3408     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3409     int i, sh;
3410 
3411     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3412         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3413     }
3414     return x;
3415 }
3416 
3417 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3418 {
3419     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3420     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3421     intptr_t i, oprsz_2 = oprsz / 2;
3422 
3423     if (oprsz <= 8) {
3424         uint64_t l = *(uint64_t *)vn;
3425         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3426         *(uint64_t *)vd = l;
3427     } else if ((oprsz & 15) == 0) {
3428         for (i = 0; i < oprsz_2; i += 8) {
3429             intptr_t ih = oprsz - 8 - i;
3430             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3431             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3432             *(uint64_t *)(vd + i) = h;
3433             *(uint64_t *)(vd + ih) = l;
3434         }
3435     } else {
3436         for (i = 0; i < oprsz_2; i += 1) {
3437             intptr_t il = H1(i);
3438             intptr_t ih = H1(oprsz - 1 - i);
3439             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3440             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3441             *(uint8_t *)(vd + il) = h;
3442             *(uint8_t *)(vd + ih) = l;
3443         }
3444     }
3445 }
3446 
3447 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3448 {
3449     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3450     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3451     uint64_t *d = vd;
3452     intptr_t i;
3453 
3454     if (oprsz <= 8) {
3455         uint64_t nn = *(uint64_t *)vn;
3456         int half = 4 * oprsz;
3457 
3458         nn = extract64(nn, high * half, half);
3459         nn = expand_bits(nn, 0);
3460         d[0] = nn;
3461     } else {
3462         ARMPredicateReg tmp_n;
3463 
3464         /* We produce output faster than we consume input.
3465            Therefore we must be mindful of possible overlap.  */
3466         if ((vn - vd) < (uintptr_t)oprsz) {
3467             vn = memcpy(&tmp_n, vn, oprsz);
3468         }
3469         if (high) {
3470             high = oprsz >> 1;
3471         }
3472 
3473         if ((oprsz & 7) == 0) {
3474             uint32_t *n = vn;
3475             high >>= 2;
3476 
3477             for (i = 0; i < oprsz / 8; i++) {
3478                 uint64_t nn = n[H4(high + i)];
3479                 d[i] = expand_bits(nn, 0);
3480             }
3481         } else {
3482             uint16_t *d16 = vd;
3483             uint8_t *n = vn;
3484 
3485             for (i = 0; i < oprsz / 2; i++) {
3486                 uint16_t nn = n[H1(high + i)];
3487                 d16[H2(i)] = expand_bits(nn, 0);
3488             }
3489         }
3490     }
3491 }
3492 
3493 #define DO_ZIP(NAME, TYPE, H) \
3494 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3495 {                                                                    \
3496     intptr_t oprsz = simd_oprsz(desc);                               \
3497     intptr_t odd_ofs = simd_data(desc);                              \
3498     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3499     ARMVectorReg tmp_n, tmp_m;                                       \
3500     /* We produce output faster than we consume input.               \
3501        Therefore we must be mindful of possible overlap.  */         \
3502     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3503         vn = memcpy(&tmp_n, vn, oprsz);                              \
3504     }                                                                \
3505     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3506         vm = memcpy(&tmp_m, vm, oprsz);                              \
3507     }                                                                \
3508     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3509         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3510         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3511             *(TYPE *)(vm + odd_ofs + H(i));                          \
3512     }                                                                \
3513     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3514         memset(vd + oprsz - 16, 0, 16);                              \
3515     }                                                                \
3516 }
3517 
3518 DO_ZIP(sve_zip_b, uint8_t, H1)
3519 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3520 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3521 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3522 DO_ZIP(sve2_zip_q, Int128, )
3523 
3524 #define DO_UZP(NAME, TYPE, H) \
3525 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3526 {                                                                      \
3527     intptr_t oprsz = simd_oprsz(desc);                                 \
3528     intptr_t odd_ofs = simd_data(desc);                                \
3529     intptr_t i, p;                                                     \
3530     ARMVectorReg tmp_m;                                                \
3531     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3532         vm = memcpy(&tmp_m, vm, oprsz);                                \
3533     }                                                                  \
3534     i = 0, p = odd_ofs;                                                \
3535     do {                                                               \
3536         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3537         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3538     } while (p < oprsz);                                               \
3539     p -= oprsz;                                                        \
3540     do {                                                               \
3541         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3542         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3543     } while (p < oprsz);                                               \
3544     tcg_debug_assert(i == oprsz);                                      \
3545 }
3546 
3547 DO_UZP(sve_uzp_b, uint8_t, H1)
3548 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3549 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3550 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3551 DO_UZP(sve2_uzp_q, Int128, )
3552 
3553 typedef void perseg_zzz_fn(void *vd, void *vn, void *vm, uint32_t desc);
3554 
3555 static void do_perseg_zzz(void *vd, void *vn, void *vm,
3556                           uint32_t desc, perseg_zzz_fn *fn)
3557 {
3558     intptr_t oprsz = simd_oprsz(desc);
3559 
3560     desc = simd_desc(16, 16, simd_data(desc));
3561     for (intptr_t i = 0; i < oprsz; i += 16) {
3562         fn(vd + i, vn + i, vm + i, desc);
3563     }
3564 }
3565 
3566 #define DO_PERSEG_ZZZ(NAME, FUNC) \
3567     void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3568     { do_perseg_zzz(vd, vn, vm, desc, FUNC); }
3569 
3570 DO_PERSEG_ZZZ(sve2p1_uzpq_b, helper_sve_uzp_b)
3571 DO_PERSEG_ZZZ(sve2p1_uzpq_h, helper_sve_uzp_h)
3572 DO_PERSEG_ZZZ(sve2p1_uzpq_s, helper_sve_uzp_s)
3573 DO_PERSEG_ZZZ(sve2p1_uzpq_d, helper_sve_uzp_d)
3574 
3575 DO_PERSEG_ZZZ(sve2p1_zipq_b, helper_sve_zip_b)
3576 DO_PERSEG_ZZZ(sve2p1_zipq_h, helper_sve_zip_h)
3577 DO_PERSEG_ZZZ(sve2p1_zipq_s, helper_sve_zip_s)
3578 DO_PERSEG_ZZZ(sve2p1_zipq_d, helper_sve_zip_d)
3579 
3580 DO_PERSEG_ZZZ(sve2p1_tblq_b, helper_sve_tbl_b)
3581 DO_PERSEG_ZZZ(sve2p1_tblq_h, helper_sve_tbl_h)
3582 DO_PERSEG_ZZZ(sve2p1_tblq_s, helper_sve_tbl_s)
3583 DO_PERSEG_ZZZ(sve2p1_tblq_d, helper_sve_tbl_d)
3584 
3585 DO_PERSEG_ZZZ(sve2p1_tbxq_b, helper_sve2_tbx_b)
3586 DO_PERSEG_ZZZ(sve2p1_tbxq_h, helper_sve2_tbx_h)
3587 DO_PERSEG_ZZZ(sve2p1_tbxq_s, helper_sve2_tbx_s)
3588 DO_PERSEG_ZZZ(sve2p1_tbxq_d, helper_sve2_tbx_d)
3589 
3590 #undef DO_PERSEG_ZZZ
3591 
3592 #define DO_TRN(NAME, TYPE, H) \
3593 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3594 {                                                                      \
3595     intptr_t oprsz = simd_oprsz(desc);                                 \
3596     intptr_t odd_ofs = simd_data(desc);                                \
3597     intptr_t i;                                                        \
3598     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3599         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3600         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3601         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3602         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3603     }                                                                  \
3604     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3605         memset(vd + oprsz - 16, 0, 16);                                \
3606     }                                                                  \
3607 }
3608 
3609 DO_TRN(sve_trn_b, uint8_t, H1)
3610 DO_TRN(sve_trn_h, uint16_t, H1_2)
3611 DO_TRN(sve_trn_s, uint32_t, H1_4)
3612 DO_TRN(sve_trn_d, uint64_t, H1_8)
3613 DO_TRN(sve2_trn_q, Int128, )
3614 
3615 #undef DO_ZIP
3616 #undef DO_UZP
3617 #undef DO_TRN
3618 
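/* COMPACT: copy the active elements of N to the low-numbered elements
 * of D, then zero the remaining elements.
 */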
3619 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3620 {
3621     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3622     uint32_t *d = vd, *n = vn;
3623     uint8_t *pg = vg;
3624 
3625     for (i = j = 0; i < opr_sz; i++) {
3626         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3627             d[H4(j)] = n[H4(i)];
3628             j++;
3629         }
3630     }
3631     for (; j < opr_sz; j++) {
3632         d[H4(j)] = 0;
3633     }
3634 }
3635 
3636 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3637 {
3638     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3639     uint64_t *d = vd, *n = vn;
3640     uint8_t *pg = vg;
3641 
3642     for (i = j = 0; i < opr_sz; i++) {
3643         if (pg[H1(i)] & 1) {
3644             d[j] = n[i];
3645             j++;
3646         }
3647     }
3648     for (; j < opr_sz; j++) {
3649         d[j] = 0;
3650     }
3651 }
3652 
3653 /* Similar to the ARM LastActiveElement pseudocode function, except the
3654  * result is multiplied by the element size.  This includes the not found
3655  * indication; e.g. not found for esz=3 is -8.
3656  */
3657 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3658 {
3659     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3660     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3661 
3662     return last_active_element(vg, words, esz);
3663 }
3664 
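/* SPLICE: copy the elements of N spanned by the first and last active
 * predicate bits, then fill the remainder of D from the start of M.
 */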
3665 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3666 {
3667     intptr_t opr_sz = simd_oprsz(desc) / 8;
3668     int esz = simd_data(desc);
3669     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3670     intptr_t i, first_i, last_i;
3671     ARMVectorReg tmp;
3672 
3673     first_i = last_i = 0;
3674     first_g = last_g = 0;
3675 
3676     /* Find the extent of the active elements within VG.  */
3677     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3678         pg = *(uint64_t *)(vg + i) & mask;
3679         if (pg) {
3680             if (last_g == 0) {
3681                 last_g = pg;
3682                 last_i = i;
3683             }
3684             first_g = pg;
3685             first_i = i;
3686         }
3687     }
3688 
3689     len = 0;
3690     if (first_g != 0) {
3691         first_i = first_i * 8 + ctz64(first_g);
3692         last_i = last_i * 8 + 63 - clz64(last_g);
3693         len = last_i - first_i + (1 << esz);
3694         if (vd == vm) {
3695             vm = memcpy(&tmp, vm, opr_sz * 8);
3696         }
3697         swap_memmove(vd, vn + first_i, len);
3698     }
3699     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3700 }
3701 
3702 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3703                             void *vg, uint32_t desc)
3704 {
3705     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3706     uint64_t *d = vd, *n = vn, *m = vm;
3707     uint8_t *pg = vg;
3708 
3709     for (i = 0; i < opr_sz; i += 1) {
3710         uint64_t nn = n[i], mm = m[i];
3711         uint64_t pp = expand_pred_b(pg[H1(i)]);
3712         d[i] = (nn & pp) | (mm & ~pp);
3713     }
3714 }
3715 
3716 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3717                             void *vg, uint32_t desc)
3718 {
3719     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3720     uint64_t *d = vd, *n = vn, *m = vm;
3721     uint8_t *pg = vg;
3722 
3723     for (i = 0; i < opr_sz; i += 1) {
3724         uint64_t nn = n[i], mm = m[i];
3725         uint64_t pp = expand_pred_h(pg[H1(i)]);
3726         d[i] = (nn & pp) | (mm & ~pp);
3727     }
3728 }
3729 
3730 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3731                             void *vg, uint32_t desc)
3732 {
3733     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3734     uint64_t *d = vd, *n = vn, *m = vm;
3735     uint8_t *pg = vg;
3736 
3737     for (i = 0; i < opr_sz; i += 1) {
3738         uint64_t nn = n[i], mm = m[i];
3739         uint64_t pp = expand_pred_s(pg[H1(i)]);
3740         d[i] = (nn & pp) | (mm & ~pp);
3741     }
3742 }
3743 
3744 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3745                             void *vg, uint32_t desc)
3746 {
3747     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3748     uint64_t *d = vd, *n = vn, *m = vm;
3749     uint8_t *pg = vg;
3750 
3751     for (i = 0; i < opr_sz; i += 1) {
3752         uint64_t nn = n[i], mm = m[i];
3753         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3754     }
3755 }
3756 
3757 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3758                             void *vg, uint32_t desc)
3759 {
3760     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3761     Int128 *d = vd, *n = vn, *m = vm;
3762     uint16_t *pg = vg;
3763 
3764     for (i = 0; i < opr_sz; i += 1) {
3765         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3766     }
3767 }
3768 
3769 /* Two operand comparison controlled by a predicate.
3770  * ??? It is very tempting to want to be able to expand this inline
3771  * with x86 instructions, e.g.
3772  *
3773  *    vcmpeqw    zm, zn, %ymm0
3774  *    vpmovmskb  %ymm0, %eax
3775  *    and        $0x5555, %eax
3776  *    and        pg, %eax
3777  *
3778  * or even aarch64, e.g.
3779  *
3780  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3781  *    cmeq       v0.8h, zn, zm
3782  *    and        v0.8h, v0.8h, mask
3783  *    addv       h0, v0.8h
3784  *    and        v0.8b, pg
3785  *
3786  * However, coming up with an abstraction that allows vector inputs and
3787  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3788  * scalar outputs, is tricky.
3789  */
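/* The loops below walk backward over 64 predicate bits (one uint64_t of
 * PG) at a time, accumulating the comparison results for that chunk in
 * OUT and folding them into the NZCV flags with iter_predtest_bwd.
 */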
3790 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3791 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3792 {                                                                            \
3793     intptr_t opr_sz = simd_oprsz(desc);                                      \
3794     uint32_t flags = PREDTEST_INIT;                                          \
3795     intptr_t i = opr_sz;                                                     \
3796     do {                                                                     \
3797         uint64_t out = 0, pg;                                                \
3798         do {                                                                 \
3799             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3800             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3801             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3802             out |= nn OP mm;                                                 \
3803         } while (i & 63);                                                    \
3804         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3805         out &= pg;                                                           \
3806         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3807         flags = iter_predtest_bwd(out, pg, flags);                           \
3808     } while (i > 0);                                                         \
3809     return flags;                                                            \
3810 }
3811 
3812 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3813     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3814 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3815     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3816 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3817     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3818 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3819     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3820 
3821 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3822 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3823 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3824 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3825 
3826 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3827 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3828 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3829 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3830 
3831 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3832 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3833 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3834 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3835 
3836 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3837 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3838 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3839 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3840 
3841 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3842 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3843 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3844 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3845 
3846 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3847 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3848 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3849 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3850 
3851 #undef DO_CMP_PPZZ_B
3852 #undef DO_CMP_PPZZ_H
3853 #undef DO_CMP_PPZZ_S
3854 #undef DO_CMP_PPZZ_D
3855 #undef DO_CMP_PPZZ
3856 
3857 /* Similar, but the second source is "wide".  */
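/* Each 64-bit M element is compared against every narrow N element that
 * shares the same 8-byte chunk of the vector.
 */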
3858 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3859 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3860 {                                                                            \
3861     intptr_t opr_sz = simd_oprsz(desc);                                      \
3862     uint32_t flags = PREDTEST_INIT;                                          \
3863     intptr_t i = opr_sz;                                                     \
3864     do {                                                                     \
3865         uint64_t out = 0, pg;                                                \
3866         do {                                                                 \
3867             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3868             do {                                                             \
3869                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3870                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3871                 out |= nn OP mm;                                             \
3872             } while (i & 7);                                                 \
3873         } while (i & 63);                                                    \
3874         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3875         out &= pg;                                                           \
3876         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3877         flags = iter_predtest_bwd(out, pg, flags);                           \
3878     } while (i > 0);                                                         \
3879     return flags;                                                            \
3880 }
3881 
3882 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3883     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3884 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3885     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3886 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3887     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3888 
3889 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3890 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3891 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3892 
3893 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3894 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3895 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3896 
3897 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3898 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3899 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3900 
3901 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3902 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3903 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3904 
3905 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3906 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3907 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3908 
3909 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3910 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3911 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3912 
3913 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3914 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3915 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3916 
3917 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3918 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3919 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3920 
3921 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3922 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3923 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3924 
3925 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3926 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3927 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3928 
3929 #undef DO_CMP_PPZW_B
3930 #undef DO_CMP_PPZW_H
3931 #undef DO_CMP_PPZW_S
3932 #undef DO_CMP_PPZW
3933 
3934 /* Similar, but the second source is immediate.  */
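/* The immediate operand is passed in the simd_data field of DESC. */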
3935 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3936 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3937 {                                                                    \
3938     intptr_t opr_sz = simd_oprsz(desc);                              \
3939     uint32_t flags = PREDTEST_INIT;                                  \
3940     TYPE mm = simd_data(desc);                                       \
3941     intptr_t i = opr_sz;                                             \
3942     do {                                                             \
3943         uint64_t out = 0, pg;                                        \
3944         do {                                                         \
3945             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3946             TYPE nn = *(TYPE *)(vn + H(i));                          \
3947             out |= nn OP mm;                                         \
3948         } while (i & 63);                                            \
3949         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3950         out &= pg;                                                   \
3951         *(uint64_t *)(vd + (i >> 3)) = out;                          \
3952         flags = iter_predtest_bwd(out, pg, flags);                   \
3953     } while (i > 0);                                                 \
3954     return flags;                                                    \
3955 }
3956 
3957 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3958     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3959 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3960     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3961 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3962     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3963 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3964     DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3965 
3966 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3967 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3968 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3969 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3970 
3971 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3972 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3973 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3974 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3975 
3976 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3977 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3978 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3979 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3980 
3981 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3982 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3983 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3984 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3985 
3986 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3987 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3988 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3989 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3990 
3991 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3992 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3993 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3994 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3995 
3996 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3997 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3998 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3999 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
4000 
4001 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
4002 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
4003 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
4004 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
4005 
4006 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
4007 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
4008 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
4009 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
4010 
4011 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
4012 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
4013 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
4014 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
4015 
4016 #undef DO_CMP_PPZI_B
4017 #undef DO_CMP_PPZI_H
4018 #undef DO_CMP_PPZI_S
4019 #undef DO_CMP_PPZI_D
4020 #undef DO_CMP_PPZI
4021 
4022 /* Similar to the ARM LastActive pseudocode function.  */
4023 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
4024 {
4025     intptr_t i;
4026 
4027     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
4028         uint64_t pg = *(uint64_t *)(vg + i);
4029         if (pg) {
4030             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
4031         }
4032     }
4033     return 0;
4034 }
4035 
4036 /* Compute a mask into RETB that is true for all G, up to and including
4037  * (if after) or excluding (if !after) the first G & N.
4038  * Return true if BRK found.
4039  */
4040 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
4041                         bool brk, bool after)
4042 {
4043     uint64_t b;
4044 
4045     if (brk) {
4046         b = 0;
4047     } else if ((g & n) == 0) {
4048         /* For all G, no N are set; break not found.  */
4049         b = g;
4050     } else {
4051         /* Break somewhere in N.  Locate it.  */
4052         b = g & n;            /* guard true, pred true */
4053         b = b & -b;           /* first such */
4054         if (after) {
4055             b = b | (b - 1);  /* break after same */
4056         } else {
4057             b = b - 1;        /* break before same */
4058         }
4059         brk = true;
4060     }
4061 
4062     *retb = b;
4063     return brk;
4064 }
4065 
4066 /* Compute a zeroing BRK.  */
4067 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
4068                           intptr_t oprsz, bool after)
4069 {
4070     bool brk = false;
4071     intptr_t i;
4072 
4073     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4074         uint64_t this_b, this_g = g[i];
4075 
4076         brk = compute_brk(&this_b, n[i], this_g, brk, after);
4077         d[i] = this_b & this_g;
4078     }
4079 }
4080 
4081 /* Likewise, but also compute flags.  */
4082 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
4083                                intptr_t oprsz, bool after)
4084 {
4085     uint32_t flags = PREDTEST_INIT;
4086     bool brk = false;
4087     intptr_t i;
4088 
4089     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4090         uint64_t this_b, this_d, this_g = g[i];
4091 
4092         brk = compute_brk(&this_b, n[i], this_g, brk, after);
4093         d[i] = this_d = this_b & this_g;
4094         flags = iter_predtest_fwd(this_d, this_g, flags);
4095     }
4096     return flags;
4097 }
4098 
4099 /* Compute a merging BRK.  */
4100 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
4101                           intptr_t oprsz, bool after)
4102 {
4103     bool brk = false;
4104     intptr_t i;
4105 
4106     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4107         uint64_t this_b, this_g = g[i];
4108 
4109         brk = compute_brk(&this_b, n[i], this_g, brk, after);
4110         d[i] = (this_b & this_g) | (d[i] & ~this_g);
4111     }
4112 }
4113 
4114 /* Likewise, but also compute flags.  */
4115 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
4116                                intptr_t oprsz, bool after)
4117 {
4118     uint32_t flags = PREDTEST_INIT;
4119     bool brk = false;
4120     intptr_t i;
4121 
4122     for (i = 0; i < oprsz / 8; ++i) {
4123         uint64_t this_b, this_d = d[i], this_g = g[i];
4124 
4125         brk = compute_brk(&this_b, n[i], this_g, brk, after);
4126         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
4127         flags = iter_predtest_fwd(this_d, this_g, flags);
4128     }
4129     return flags;
4130 }
4131 
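/*
 * BRKPA/BRKPB (zeroing): if the last active element of PN is true,
 * compute the break on PM; otherwise the result is all false.
 */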
4132 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4133                        uint32_t pred_desc)
4134 {
4135     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4136     if (last_active_pred(vn, vg, oprsz)) {
4137         compute_brk_z(vd, vm, vg, oprsz, true);
4138     } else {
4139         memset(vd, 0, sizeof(ARMPredicateReg));
4140     }
4141 }
4142 
4143 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4144                             uint32_t pred_desc)
4145 {
4146     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4147     if (last_active_pred(vn, vg, oprsz)) {
4148         return compute_brks_z(vd, vm, vg, oprsz, true);
4149     } else {
4150         memset(vd, 0, sizeof(ARMPredicateReg));
4151         return PREDTEST_INIT;
4152     }
4153 }
4154 
4155 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4156                        uint32_t pred_desc)
4157 {
4158     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4159     if (last_active_pred(vn, vg, oprsz)) {
4160         compute_brk_z(vd, vm, vg, oprsz, false);
4161     } else {
4162         memset(vd, 0, sizeof(ARMPredicateReg));
4163     }
4164 }
4165 
4166 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4167                             uint32_t pred_desc)
4168 {
4169     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4170     if (last_active_pred(vn, vg, oprsz)) {
4171         return compute_brks_z(vd, vm, vg, oprsz, false);
4172     } else {
4173         memset(vd, 0, sizeof(ARMPredicateReg));
4174         return PREDTEST_INIT;
4175     }
4176 }
4177 
4178 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4179 {
4180     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4181     compute_brk_z(vd, vn, vg, oprsz, true);
4182 }
4183 
4184 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4185 {
4186     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4187     return compute_brks_z(vd, vn, vg, oprsz, true);
4188 }
4189 
4190 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4191 {
4192     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4193     compute_brk_z(vd, vn, vg, oprsz, false);
4194 }
4195 
4196 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4197 {
4198     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4199     return compute_brks_z(vd, vn, vg, oprsz, false);
4200 }
4201 
4202 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4203 {
4204     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4205     compute_brk_m(vd, vn, vg, oprsz, true);
4206 }
4207 
4208 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4209 {
4210     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4211     return compute_brks_m(vd, vn, vg, oprsz, true);
4212 }
4213 
4214 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4215 {
4216     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4217     compute_brk_m(vd, vn, vg, oprsz, false);
4218 }
4219 
4220 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4221 {
4222     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4223     return compute_brks_m(vd, vn, vg, oprsz, false);
4224 }
4225 
4226 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4227 {
4228     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4229     if (!last_active_pred(vn, vg, oprsz)) {
4230         memset(vd, 0, sizeof(ARMPredicateReg));
4231     }
4232 }
4233 
4234 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4235 {
4236     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4237     if (last_active_pred(vn, vg, oprsz)) {
4238         ARMPredicateReg *d = vd;
4239         uint32_t flags = PREDTEST_INIT;
4240         intptr_t i;
4241 
4242         /* As if PredTest(Ones(PL), D, MO_8).  */
4243         for (i = 0; i < oprsz / 8; i++) {
4244             flags = iter_predtest_fwd(d->p[i], -1, flags);
4245         }
4246         if (oprsz & 7) {
4247             uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4248             flags = iter_predtest_fwd(d->p[i], mask, flags);
4249         }
4250         return flags;
4251     }
4252     memset(vd, 0, sizeof(ARMPredicateReg));
4253     return PREDTEST_INIT;
4254 }
4255 
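/* CNTP: count the elements that are active in both PN and PG. */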
4256 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4257 {
4258     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4259     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4260     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4261     intptr_t i;
4262 
4263     for (i = 0; i < words; ++i) {
4264         uint64_t t = n[i] & g[i] & mask;
4265         sum += ctpop64(t);
4266     }
4267     return sum;
4268 }
4269 
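/*
 * CNTP (predicate as counter): return the number of active elements
 * described by the counter encoding in PNG, scaled down by its stride.
 */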
4270 uint64_t HELPER(sve2p1_cntp_c)(uint32_t png, uint32_t desc)
4271 {
4272     int pl = FIELD_EX32(desc, PREDDESC, OPRSZ);
4273     int vl = pl * 8;
4274     unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ);
4275     int lg2_width = FIELD_EX32(desc, PREDDESC, DATA) + 1;
4276     DecodeCounter p = decode_counter(png, vl, v_esz);
4277     unsigned maxelem = (vl << lg2_width) >> v_esz;
4278     unsigned count = p.count;
4279 
4280     if (p.invert) {
4281         if (count >= maxelem) {
4282             return 0;
4283         }
4284         count = maxelem - count;
4285     } else {
4286         count = MIN(count, maxelem);
4287     }
4288     return count >> p.lg2_stride;
4289 }
4290 
4291 /* C.f. Arm pseudocode EncodePredCount */
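/*
 * The encoding packs three fields into the low 16 bits: bit 15 is the
 * invert flag, bit ESZ is set as the element-size marker, and the bits
 * above it hold the (possibly inverted) count; all-zeros encodes zero.
 */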
4292 static uint64_t encode_pred_count(uint32_t elements, uint32_t count,
4293                                   uint32_t esz, bool invert)
4294 {
4295     uint32_t pred;
4296 
4297     if (count == 0) {
4298         return 0;
4299     }
4300     if (invert) {
4301         count = elements - count;
4302     } else if (count == elements) {
4303         count = 0;
4304         invert = true;
4305     }
4306 
4307     pred = (count << 1) | 1;
4308     pred <<= esz;
4309     pred |= invert << 15;
4310 
4311     return pred;
4312 }
4313 
4314 /* C.f. Arm pseudocode PredCountTest */
4315 static uint32_t pred_count_test(uint32_t elements, uint32_t count, bool invert)
4316 {
4317     uint32_t flags;
4318 
4319     if (count == 0) {
4320         flags = 1;                              /* !N, Z, C */
4321     } else if (!invert) {
4322         flags = (1u << 31) | 2;                 /* N, !Z */
4323         flags |= count != elements;             /* C */
4324     } else {
4325         flags = 2;                              /* !Z, !C */
4326         flags |= (count == elements) << 31;     /* N */
4327     }
4328     return flags;
4329 }
4330 
4331 /* D must be cleared on entry. */
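/* Set the low COUNT predicate bits (the callers have already scaled
 * COUNT to bits), masked by ESZ_MASK. */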
4332 static void do_whilel(ARMPredicateReg *d, uint64_t esz_mask,
4333                       uint32_t count, uint32_t oprbits)
4334 {
4335     tcg_debug_assert(count <= oprbits);
4336     if (count) {
4337         uint32_t i;
4338 
4339         /* Set all of the requested bits.  */
4340         for (i = 0; i < count / 64; ++i) {
4341             d->p[i] = esz_mask;
4342         }
4343         if (count & 63) {
4344             d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4345         }
4346     }
4347 }
4348 
4349 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4350 {
4351     uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4352     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4353     uint32_t oprbits = oprsz * 8;
4354     uint64_t esz_mask = pred_esz_masks[esz];
4355     ARMPredicateReg *d = vd;
4356 
4357     count <<= esz;
4358     memset(d, 0, sizeof(*d));
4359     do_whilel(d, esz_mask, count, oprbits);
4360     return pred_count_test(oprbits, count, false);
4361 }
4362 
4363 uint32_t HELPER(sve_while2l)(void *vd, uint32_t count, uint32_t pred_desc)
4364 {
4365     uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4366     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4367     uint32_t oprbits = oprsz * 8;
4368     uint64_t esz_mask = pred_esz_masks[esz];
4369     ARMPredicateReg *d = vd;
4370 
4371     count <<= esz;
4372     memset(d, 0, 2 * sizeof(*d));
4373     if (count <= oprbits) {
4374         do_whilel(&d[0], esz_mask, count, oprbits);
4375     } else {
4376         do_whilel(&d[0], esz_mask, oprbits, oprbits);
4377         do_whilel(&d[1], esz_mask, count - oprbits, oprbits);
4378     }
4379 
4380     return pred_count_test(2 * oprbits, count, false);
4381 }
4382 
4383 uint32_t HELPER(sve_whilecl)(void *vd, uint32_t count, uint32_t pred_desc)
4384 {
4385     uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4386     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4387     uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA);
4388     uint32_t vl = pl * 8;
4389     uint32_t elements = (vl >> esz) << scale;
4390     ARMPredicateReg *d = vd;
4391 
4392     *d = (ARMPredicateReg) {
4393         .p[0] = encode_pred_count(elements, count, esz, false)
4394     };
4395     return pred_count_test(elements, count, false);
4396 }
4397 
4398 /* D must be cleared on entry. */
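/* Set the high COUNT predicate bits of the OPRBITS-wide register,
 * masked by ESZ_MASK; COUNT is already scaled to bits. */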
4399 static void do_whileg(ARMPredicateReg *d, uint64_t esz_mask,
4400                       uint32_t count, uint32_t oprbits)
4401 {
4402     tcg_debug_assert(count <= oprbits);
4403     if (count) {
4404         uint32_t i, invcount = oprbits - count;
4405         uint64_t bits = esz_mask & MAKE_64BIT_MASK(invcount & 63, 64);
4406 
4407         for (i = invcount / 64; i < oprbits / 64; ++i) {
4408             d->p[i] = bits;
4409             bits = esz_mask;
4410         }
4411         if (oprbits & 63) {
4412             d->p[i] = bits & MAKE_64BIT_MASK(0, oprbits & 63);
4413         }
4414     }
4415 }
4416 
4417 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4418 {
4419     uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4420     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4421     uint32_t oprbits = oprsz * 8;
4422     uint64_t esz_mask = pred_esz_masks[esz];
4423     ARMPredicateReg *d = vd;
4424 
4425     count <<= esz;
4426     memset(d, 0, sizeof(*d));
4427     do_whileg(d, esz_mask, count, oprbits);
4428     return pred_count_test(oprbits, count, true);
4429 }
4430 
4431 uint32_t HELPER(sve_while2g)(void *vd, uint32_t count, uint32_t pred_desc)
4432 {
4433     uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4434     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4435     uint32_t oprbits = oprsz * 8;
4436     uint64_t esz_mask = pred_esz_masks[esz];
4437     ARMPredicateReg *d = vd;
4438 
4439     count <<= esz;
4440     memset(d, 0, 2 * sizeof(*d));
4441     if (count <= oprbits) {
4442         do_whileg(&d[1], esz_mask, count, oprbits);
4443     } else {
4444         do_whilel(&d[1], esz_mask, oprbits, oprbits);
4445         do_whileg(&d[0], esz_mask, count - oprbits, oprbits);
4446     }
4447 
4448     return pred_count_test(2 * oprbits, count, true);
4449 }
4450 
4451 uint32_t HELPER(sve_whilecg)(void *vd, uint32_t count, uint32_t pred_desc)
4452 {
4453     uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4454     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4455     uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA);
4456     uint32_t vl = pl * 8;
4457     uint32_t elements = (vl >> esz) << scale;
4458     ARMPredicateReg *d = vd;
4459 
4460     *d = (ARMPredicateReg) {
4461         .p[0] = encode_pred_count(elements, count, esz, true)
4462     };
4463     return pred_count_test(elements, count, true);
4464 }
4465 
4466 /* Recursive reduction on a function;
4467  * C.f. the ARM ARM function ReducePredicated.
4468  *
4469  * While it would be possible to write this without the DATA temporary,
4470  * it is much simpler to process the predicate register this way.
4471  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4472  * little to gain with a more complex non-recursive form.
4473  */
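/*
 * Each DO_REDUCE invocation expands to three functions: the recursive
 * FUNC##_reduce worker, the predicated whole-vector reduction helper,
 * and the SVE2p1 per-128-bit-segment reduction helper.
 */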
4474 #define DO_REDUCE(NAME, SUF, TYPE, H, FUNC, IDENT)                      \
4475 static TYPE FUNC##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4476 {                                                                     \
4477     if (n == 1) {                                                     \
4478         return *data;                                                 \
4479     } else {                                                          \
4480         uintptr_t half = n / 2;                                       \
4481         TYPE lo = FUNC##_reduce(data, status, half);                  \
4482         TYPE hi = FUNC##_reduce(data + half, status, half);           \
4483         return FUNC(lo, hi, status);                                  \
4484     }                                                                 \
4485 }                                                                     \
4486 uint64_t helper_sve_##NAME##v_##SUF(void *vn, void *vg,               \
4487                                     float_status *status, uint32_t desc) \
4488 {                                                                     \
4489     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4490     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4491     TYPE ident = IDENT;                                               \
4492     for (i = 0; i < oprsz; ) {                                        \
4493         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4494         do {                                                          \
4495             TYPE nn = *(TYPE *)(vn + H(i));                           \
4496             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : ident);      \
4497             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4498         } while (i & 15);                                             \
4499     }                                                                 \
4500     for (; i < maxsz; i += sizeof(TYPE)) {                            \
4501         *(TYPE *)((void *)data + i) = ident;                          \
4502     }                                                                 \
4503     return FUNC##_reduce(data, status, maxsz / sizeof(TYPE));         \
4504 }                                                                     \
4505 void helper_sve2p1_##NAME##qv_##SUF(void *vd, void *vn, void *vg,     \
4506                                     float_status *status, uint32_t desc) \
4507 {                                                                     \
4508     unsigned oprsz = simd_oprsz(desc), segments = oprsz / 16;         \
4509     TYPE ident = IDENT;                                               \
4510     for (unsigned e = 0; e < 16; e += sizeof(TYPE)) {                 \
4511         TYPE data[ARM_MAX_VQ];                                        \
4512         for (unsigned s = 0; s < segments; s++) {                     \
4513             uint16_t pg = *(uint16_t *)(vg + H1_2(s * 2));            \
4514             TYPE nn = *(TYPE *)(vn + (s * 16 + H(e)));                \
4515             data[s] = (pg >> e) & 1 ? nn : ident;                     \
4516         }                                                             \
4517         *(TYPE *)(vd + H(e)) = FUNC##_reduce(data, status, segments); \
4518     }                                                                 \
4519     clear_tail(vd, 16, simd_maxsz(desc));                             \
4520 }
4521 
4522 DO_REDUCE(fadd,h, float16, H1_2, float16_add, float16_zero)
4523 DO_REDUCE(fadd,s, float32, H1_4, float32_add, float32_zero)
4524 DO_REDUCE(fadd,d, float64, H1_8, float64_add, float64_zero)
4525 
4526 /*
4527  * We can't avoid the function call for the default NaN value, because
4528  * it changes when FPCR.AH is set.
4529  */
4530 DO_REDUCE(fminnm,h, float16, H1_2, float16_minnum, float16_default_nan(status))
4531 DO_REDUCE(fminnm,s, float32, H1_4, float32_minnum, float32_default_nan(status))
4532 DO_REDUCE(fminnm,d, float64, H1_8, float64_minnum, float64_default_nan(status))
4533 
4534 DO_REDUCE(fmaxnm,h, float16, H1_2, float16_maxnum, float16_default_nan(status))
4535 DO_REDUCE(fmaxnm,s, float32, H1_4, float32_maxnum, float32_default_nan(status))
4536 DO_REDUCE(fmaxnm,d, float64, H1_8, float64_maxnum, float64_default_nan(status))
4537 
4538 DO_REDUCE(fmin,h, float16, H1_2, float16_min, float16_infinity)
4539 DO_REDUCE(fmin,s, float32, H1_4, float32_min, float32_infinity)
4540 DO_REDUCE(fmin,d, float64, H1_8, float64_min, float64_infinity)
4541 
4542 DO_REDUCE(fmax,h, float16, H1_2, float16_max, float16_chs(float16_infinity))
4543 DO_REDUCE(fmax,s, float32, H1_4, float32_max, float32_chs(float32_infinity))
4544 DO_REDUCE(fmax,d, float64, H1_8, float64_max, float64_chs(float64_infinity))
4545 
4546 DO_REDUCE(ah_fmin,h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
4547 DO_REDUCE(ah_fmin,s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
4548 DO_REDUCE(ah_fmin,d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
4549 
4550 DO_REDUCE(ah_fmax,h, float16, H1_2, helper_vfp_ah_maxh,
4551           float16_chs(float16_infinity))
4552 DO_REDUCE(ah_fmax,s, float32, H1_4, helper_vfp_ah_maxs,
4553           float32_chs(float32_infinity))
4554 DO_REDUCE(ah_fmax,d, float64, H1_8, helper_vfp_ah_maxd,
4555           float64_chs(float64_infinity))
4556 
4557 #undef DO_REDUCE
4558 
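/* FADDA: strictly ordered floating-point add-accumulate over the active
 * elements. */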
4559 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4560                              float_status *status, uint32_t desc)
4561 {
4562     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4563     float16 result = nn;
4564 
4565     do {
4566         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4567         do {
4568             if (pg & 1) {
4569                 float16 mm = *(float16 *)(vm + H1_2(i));
4570                 result = float16_add(result, mm, status);
4571             }
4572             i += sizeof(float16), pg >>= sizeof(float16);
4573         } while (i & 15);
4574     } while (i < opr_sz);
4575 
4576     return result;
4577 }
4578 
4579 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4580                              float_status *status, uint32_t desc)
4581 {
4582     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4583     float32 result = nn;
4584 
4585     do {
4586         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4587         do {
4588             if (pg & 1) {
4589                 float32 mm = *(float32 *)(vm + H1_2(i));
4590                 result = float32_add(result, mm, status);
4591             }
4592             i += sizeof(float32), pg >>= sizeof(float32);
4593         } while (i & 15);
4594     } while (i < opr_sz);
4595 
4596     return result;
4597 }
4598 
4599 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4600                              float_status *status, uint32_t desc)
4601 {
4602     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4603     uint64_t *m = vm;
4604     uint8_t *pg = vg;
4605 
4606     for (i = 0; i < opr_sz; i++) {
4607         if (pg[H1(i)] & 1) {
4608             nn = float64_add(nn, m[i], status);
4609         }
4610     }
4611 
4612     return nn;
4613 }
4614 
4615 /* Fully general three-operand expander, controlled by a predicate,
4616  * with the extra float_status parameter.
4617  */
4618 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4619 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4620                   float_status *status, uint32_t desc)          \
4621 {                                                               \
4622     intptr_t i = simd_oprsz(desc);                              \
4623     uint64_t *g = vg;                                           \
4624     do {                                                        \
4625         uint64_t pg = g[(i - 1) >> 6];                          \
4626         do {                                                    \
4627             i -= sizeof(TYPE);                                  \
4628             if (likely((pg >> (i & 63)) & 1)) {                 \
4629                 TYPE nn = *(TYPE *)(vn + H(i));                 \
4630                 TYPE mm = *(TYPE *)(vm + H(i));                 \
4631                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4632             }                                                   \
4633         } while (i & 63);                                       \
4634     } while (i != 0);                                           \
4635 }
4636 
4637 DO_ZPZZ_FP(sve_fadd_b16, uint16_t, H1_2, bfloat16_add)
4638 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4639 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4640 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4641 
4642 DO_ZPZZ_FP(sve_fsub_b16, uint16_t, H1_2, bfloat16_sub)
4643 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4644 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4645 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4646 
4647 DO_ZPZZ_FP(sve_fmul_b16, uint16_t, H1_2, bfloat16_mul)
4648 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4649 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4650 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4651 
4652 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4653 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4654 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4655 
4656 DO_ZPZZ_FP(sve_fmin_b16, uint16_t, H1_2, bfloat16_min)
4657 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4658 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4659 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4660 
4661 DO_ZPZZ_FP(sve_fmax_b16, uint16_t, H1_2, bfloat16_max)
4662 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4663 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4664 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4665 
4666 DO_ZPZZ_FP(sve_ah_fmin_b16, uint16_t, H1_2, helper_sme2_ah_fmin_b16)
4667 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh)
4668 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins)
4669 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind)
4670 
4671 DO_ZPZZ_FP(sve_ah_fmax_b16, uint16_t, H1_2, helper_sme2_ah_fmax_b16)
4672 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh)
4673 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs)
4674 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd)
4675 
4676 DO_ZPZZ_FP(sve_fminnum_b16, uint16_t, H1_2, bfloat16_minnum)
4677 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4678 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4679 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4680 
4681 DO_ZPZZ_FP(sve_fmaxnum_b16, uint16_t, H1_2, bfloat16_maxnum)
4682 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4683 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4684 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4685 
4686 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4687 {
4688     return float16_abs(float16_sub(a, b, s));
4689 }
4690 
4691 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4692 {
4693     return float32_abs(float32_sub(a, b, s));
4694 }
4695 
4696 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4697 {
4698     return float64_abs(float64_sub(a, b, s));
4699 }
4700 
4701 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
4702 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat)
4703 {
4704     float16 r = float16_sub(op1, op2, stat);
4705     return float16_is_any_nan(r) ? r : float16_abs(r);
4706 }
4707 
4708 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat)
4709 {
4710     float32 r = float32_sub(op1, op2, stat);
4711     return float32_is_any_nan(r) ? r : float32_abs(r);
4712 }
4713 
4714 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat)
4715 {
4716     float64 r = float64_sub(op1, op2, stat);
4717     return float64_is_any_nan(r) ? r : float64_abs(r);
4718 }
4719 
4720 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4721 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4722 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4723 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h)
4724 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s)
4725 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d)
4726 
4727 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4728 {
4729     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4730     return float64_scalbn(a, b_int, s);
4731 }
4732 
4733 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4734 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4735 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4736 
4737 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4738 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4739 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4740 
4741 #undef DO_ZPZZ_FP
4742 
4743 /* Three-operand expander, with one scalar operand, controlled by
4744  * a predicate, with the extra float_status parameter.
4745  */
4746 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4747 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4748                   float_status *status, uint32_t desc)            \
4749 {                                                                 \
4750     intptr_t i = simd_oprsz(desc);                                \
4751     uint64_t *g = vg;                                             \
4752     TYPE mm = scalar;                                             \
4753     do {                                                          \
4754         uint64_t pg = g[(i - 1) >> 6];                            \
4755         do {                                                      \
4756             i -= sizeof(TYPE);                                    \
4757             if (likely((pg >> (i & 63)) & 1)) {                   \
4758                 TYPE nn = *(TYPE *)(vn + H(i));                   \
4759                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4760             }                                                     \
4761         } while (i & 63);                                         \
4762     } while (i != 0);                                             \
4763 }
4764 
4765 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4766 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4767 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4768 
4769 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4770 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4771 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4772 
4773 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4774 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4775 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4776 
4777 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4778 {
4779     return float16_sub(b, a, s);
4780 }
4781 
4782 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4783 {
4784     return float32_sub(b, a, s);
4785 }
4786 
4787 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4788 {
4789     return float64_sub(b, a, s);
4790 }
4791 
4792 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4793 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4794 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4795 
4796 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4797 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4798 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4799 
4800 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4801 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4802 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4803 
4804 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4805 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4806 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4807 
4808 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4809 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4810 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4811 
4812 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh)
4813 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs)
4814 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd)
4815 
4816 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh)
4817 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins)
4818 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind)
4819 
4820 /* Fully general two-operand expander, controlled by a predicate,
4821  * with the extra float_status parameter.
4822  */
4823 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4824 void HELPER(NAME)(void *vd, void *vn, void *vg,                       \
4825                   float_status *status, uint32_t desc)                \
4826 {                                                                     \
4827     intptr_t i = simd_oprsz(desc);                                    \
4828     uint64_t *g = vg;                                                 \
4829     do {                                                              \
4830         uint64_t pg = g[(i - 1) >> 6];                                \
4831         do {                                                          \
4832             i -= sizeof(TYPE);                                        \
4833             if (likely((pg >> (i & 63)) & 1)) {                       \
4834                 TYPE nn = *(TYPE *)(vn + H(i));                       \
4835                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4836             }                                                         \
4837         } while (i & 63);                                             \
4838     } while (i != 0);                                                 \
4839 }
4840 
4841 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4842  * FZ16.  When converting from fp16, this affects flushing input denormals;
4843  * when converting to fp16, this affects flushing output denormals.
4844  */
4845 float32 sve_f16_to_f32(float16 f, float_status *fpst)
4846 {
4847     bool save = get_flush_inputs_to_zero(fpst);
4848     float32 ret;
4849 
4850     set_flush_inputs_to_zero(false, fpst);
4851     ret = float16_to_float32(f, true, fpst);
4852     set_flush_inputs_to_zero(save, fpst);
4853     return ret;
4854 }
4855 
4856 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4857 {
4858     bool save = get_flush_inputs_to_zero(fpst);
4859     float64 ret;
4860 
4861     set_flush_inputs_to_zero(false, fpst);
4862     ret = float16_to_float64(f, true, fpst);
4863     set_flush_inputs_to_zero(save, fpst);
4864     return ret;
4865 }
4866 
4867 float16 sve_f32_to_f16(float32 f, float_status *fpst)
4868 {
4869     bool save = get_flush_to_zero(fpst);
4870     float16 ret;
4871 
4872     set_flush_to_zero(false, fpst);
4873     ret = float32_to_float16(f, true, fpst);
4874     set_flush_to_zero(save, fpst);
4875     return ret;
4876 }
4877 
4878 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4879 {
4880     bool save = get_flush_to_zero(fpst);
4881     float16 ret;
4882 
4883     set_flush_to_zero(false, fpst);
4884     ret = float64_to_float16(f, true, fpst);
4885     set_flush_to_zero(save, fpst);
4886     return ret;
4887 }
4888 
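/* Round-to-zero conversions that return 0 and raise Invalid for a NaN
 * input, as required by the FCVTZS/FCVTZU pseudocode. */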
4889 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4890 {
4891     if (float16_is_any_nan(f)) {
4892         float_raise(float_flag_invalid, s);
4893         return 0;
4894     }
4895     return float16_to_int16_round_to_zero(f, s);
4896 }
4897 
4898 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4899 {
4900     if (float16_is_any_nan(f)) {
4901         float_raise(float_flag_invalid, s);
4902         return 0;
4903     }
4904     return float16_to_int64_round_to_zero(f, s);
4905 }
4906 
4907 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4908 {
4909     if (float32_is_any_nan(f)) {
4910         float_raise(float_flag_invalid, s);
4911         return 0;
4912     }
4913     return float32_to_int64_round_to_zero(f, s);
4914 }
4915 
4916 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4917 {
4918     if (float64_is_any_nan(f)) {
4919         float_raise(float_flag_invalid, s);
4920         return 0;
4921     }
4922     return float64_to_int64_round_to_zero(f, s);
4923 }
4924 
4925 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4926 {
4927     if (float16_is_any_nan(f)) {
4928         float_raise(float_flag_invalid, s);
4929         return 0;
4930     }
4931     return float16_to_uint16_round_to_zero(f, s);
4932 }
4933 
4934 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4935 {
4936     if (float16_is_any_nan(f)) {
4937         float_raise(float_flag_invalid, s);
4938         return 0;
4939     }
4940     return float16_to_uint64_round_to_zero(f, s);
4941 }
4942 
4943 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4944 {
4945     if (float32_is_any_nan(f)) {
4946         float_raise(float_flag_invalid, s);
4947         return 0;
4948     }
4949     return float32_to_uint64_round_to_zero(f, s);
4950 }
4951 
4952 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4953 {
4954     if (float64_is_any_nan(f)) {
4955         float_raise(float_flag_invalid, s);
4956         return 0;
4957     }
4958     return float64_to_uint64_round_to_zero(f, s);
4959 }
4960 
4961 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4962 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4963 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4964 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4965 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4966 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4967 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4968 
4969 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4970 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4971 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4972 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4973 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4974 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4975 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4976 
4977 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4978 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4979 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4980 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4981 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4982 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4983 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4984 
4985 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4986 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4987 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4988 
4989 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4990 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4991 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4992 
4993 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4994 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4995 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4996 
4997 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4998 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4999 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
5000 
5001 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
5002 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
5003 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
5004 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
5005 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
5006 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
5007 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
5008 
5009 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
5010 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
5011 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
5012 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
5013 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
5014 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
5015 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
5016 
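/*
 * FLOGB: return the signed exponent of the input as an integer,
 * decoding denormals from the raw fraction; infinity yields INT_MAX,
 * while NaN and zero raise Invalid and yield INT_MIN.
 */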
5017 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
5018 {
5019     /* Extract frac to the top of the uint32_t. */
5020     uint32_t frac = (uint32_t)a << (16 + 6);
5021     int16_t exp = extract32(a, 10, 5);
5022 
5023     if (unlikely(exp == 0)) {
5024         if (frac != 0) {
5025             if (!get_flush_inputs_to_zero(s)) {
5026                 /* denormal: bias - fractional_zeros */
5027                 return -15 - clz32(frac);
5028             }
5029             /* flush to zero */
5030             float_raise(float_flag_input_denormal_flushed, s);
5031         }
5032     } else if (unlikely(exp == 0x1f)) {
5033         if (frac == 0) {
5034             return INT16_MAX; /* infinity */
5035         }
5036     } else {
5037         /* normal: exp - bias */
5038         return exp - 15;
5039     }
5040     /* nan or zero */
5041     float_raise(float_flag_invalid, s);
5042     return INT16_MIN;
5043 }
5044 
5045 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
5046 {
5047     /* Extract frac to the top of the uint32_t. */
5048     uint32_t frac = a << 9;
5049     int32_t exp = extract32(a, 23, 8);
5050 
5051     if (unlikely(exp == 0)) {
5052         if (frac != 0) {
5053             if (!get_flush_inputs_to_zero(s)) {
5054                 /* denormal: bias - fractional_zeros */
5055                 return -127 - clz32(frac);
5056             }
5057             /* flush to zero */
5058             float_raise(float_flag_input_denormal_flushed, s);
5059         }
5060     } else if (unlikely(exp == 0xff)) {
5061         if (frac == 0) {
5062             return INT32_MAX; /* infinity */
5063         }
5064     } else {
5065         /* normal: exp - bias */
5066         return exp - 127;
5067     }
5068     /* nan or zero */
5069     float_raise(float_flag_invalid, s);
5070     return INT32_MIN;
5071 }
5072 
5073 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
5074 {
5075     /* Extract frac to the top of the uint64_t. */
5076     uint64_t frac = a << 12;
5077     int64_t exp = extract64(a, 52, 11);
5078 
5079     if (unlikely(exp == 0)) {
5080         if (frac != 0) {
5081             if (!get_flush_inputs_to_zero(s)) {
5082                 /* denormal: bias - fractional_zeros */
5083                 return -1023 - clz64(frac);
5084             }
5085             /* flush to zero */
5086             float_raise(float_flag_input_denormal_flushed, s);
5087         }
5088     } else if (unlikely(exp == 0x7ff)) {
5089         if (frac == 0) {
5090             return INT64_MAX; /* infinity */
5091         }
5092     } else {
5093         /* normal: exp - bias */
5094         return exp - 1023;
5095     }
5096     /* nan or zero */
5097     float_raise(float_flag_invalid, s);
5098     return INT64_MIN;
5099 }
5100 
5101 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
5102 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
5103 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
5104 
5105 #undef DO_ZPZ_FP
5106 
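/*
 * NEG1 and NEG3 are XOR masks applied to the first and third operands
 * (sign-bit flips for the FMLS/FNMLA/FNMLS forms); the FPCR.AH variants
 * pass float_muladd_negate_* FLAGS instead of flipping sign bits.
 */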
5107 static void do_fmla_zpzzz_b16(void *vd, void *vn, void *vm, void *va, void *vg,
5108                               float_status *status, uint32_t desc,
5109                               uint16_t neg1, uint16_t neg3, int flags)
5110 {
5111     intptr_t i = simd_oprsz(desc);
5112     uint64_t *g = vg;
5113 
5114     do {
5115         uint64_t pg = g[(i - 1) >> 6];
5116         do {
5117             i -= 2;
5118             if (likely((pg >> (i & 63)) & 1)) {
5119                 float16 e1, e2, e3, r;
5120 
5121                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
5122                 e2 = *(uint16_t *)(vm + H1_2(i));
5123                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
5124                 r = bfloat16_muladd(e1, e2, e3, flags, status);
5125                 *(uint16_t *)(vd + H1_2(i)) = r;
5126             }
5127         } while (i & 63);
5128     } while (i != 0);
5129 }
5130 
5131 void HELPER(sve_fmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5132                               void *vg, float_status *status, uint32_t desc)
5133 {
5134     do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
5135 }
5136 
5137 void HELPER(sve_fmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5138                               void *vg, float_status *status, uint32_t desc)
5139 {
5140     do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
5141 }
5142 
5143 void HELPER(sve_fnmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5144                                void *vg, float_status *status, uint32_t desc)
5145 {
5146     do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
5147 }
5148 
5149 void HELPER(sve_fnmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5150                                void *vg, float_status *status, uint32_t desc)
5151 {
5152     do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
5153 }
5154 
5155 void HELPER(sve_ah_fmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5156                               void *vg, float_status *status, uint32_t desc)
5157 {
5158     do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0,
5159                       float_muladd_negate_product);
5160 }
5161 
5162 void HELPER(sve_ah_fnmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5163                                void *vg, float_status *status, uint32_t desc)
5164 {
5165     do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0,
5166                       float_muladd_negate_product | float_muladd_negate_c);
5167 }
5168 
5169 void HELPER(sve_ah_fnmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va,
5170                                void *vg, float_status *status, uint32_t desc)
5171 {
5172     do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0,
5173                       float_muladd_negate_c);
5174 }
5175 
5176 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
5177                             float_status *status, uint32_t desc,
5178                             uint16_t neg1, uint16_t neg3, int flags)
5179 {
5180     intptr_t i = simd_oprsz(desc);
5181     uint64_t *g = vg;
5182 
5183     do {
5184         uint64_t pg = g[(i - 1) >> 6];
5185         do {
5186             i -= 2;
5187             if (likely((pg >> (i & 63)) & 1)) {
5188                 float16 e1, e2, e3, r;
5189 
5190                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
5191                 e2 = *(uint16_t *)(vm + H1_2(i));
5192                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
5193                 r = float16_muladd(e1, e2, e3, flags, status);
5194                 *(uint16_t *)(vd + H1_2(i)) = r;
5195             }
5196         } while (i & 63);
5197     } while (i != 0);
5198 }
5199 
5200 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5201                               void *vg, float_status *status, uint32_t desc)
5202 {
5203     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
5204 }
5205 
5206 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5207                               void *vg, float_status *status, uint32_t desc)
5208 {
5209     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
5210 }
5211 
5212 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5213                                void *vg, float_status *status, uint32_t desc)
5214 {
5215     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
5216 }
5217 
5218 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5219                                void *vg, float_status *status, uint32_t desc)
5220 {
5221     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
5222 }
5223 
5224 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5225                               void *vg, float_status *status, uint32_t desc)
5226 {
5227     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
5228                     float_muladd_negate_product);
5229 }
5230 
5231 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5232                                void *vg, float_status *status, uint32_t desc)
5233 {
5234     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
5235                     float_muladd_negate_product | float_muladd_negate_c);
5236 }
5237 
5238 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5239                                void *vg, float_status *status, uint32_t desc)
5240 {
5241     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
5242                     float_muladd_negate_c);
5243 }
5244 
5245 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
5246                             float_status *status, uint32_t desc,
5247                             uint32_t neg1, uint32_t neg3, int flags)
5248 {
5249     intptr_t i = simd_oprsz(desc);
5250     uint64_t *g = vg;
5251 
5252     do {
5253         uint64_t pg = g[(i - 1) >> 6];
5254         do {
5255             i -= 4;
5256             if (likely((pg >> (i & 63)) & 1)) {
5257                 float32 e1, e2, e3, r;
5258 
5259                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
5260                 e2 = *(uint32_t *)(vm + H1_4(i));
5261                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
5262                 r = float32_muladd(e1, e2, e3, flags, status);
5263                 *(uint32_t *)(vd + H1_4(i)) = r;
5264             }
5265         } while (i & 63);
5266     } while (i != 0);
5267 }
5268 
5269 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5270                               void *vg, float_status *status, uint32_t desc)
5271 {
5272     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
5273 }
5274 
5275 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5276                               void *vg, float_status *status, uint32_t desc)
5277 {
5278     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0);
5279 }
5280 
5281 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5282                                void *vg, float_status *status, uint32_t desc)
5283 {
5284     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0);
5285 }
5286 
5287 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5288                                void *vg, float_status *status, uint32_t desc)
5289 {
5290     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0);
5291 }
5292 
5293 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5294                               void *vg, float_status *status, uint32_t desc)
5295 {
5296     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
5297                     float_muladd_negate_product);
5298 }
5299 
5300 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5301                                void *vg, float_status *status, uint32_t desc)
5302 {
5303     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
5304                     float_muladd_negate_product | float_muladd_negate_c);
5305 }
5306 
5307 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5308                                void *vg, float_status *status, uint32_t desc)
5309 {
5310     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
5311                     float_muladd_negate_c);
5312 }
5313 
5314 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
5315                             float_status *status, uint32_t desc,
5316                             uint64_t neg1, uint64_t neg3, int flags)
5317 {
5318     intptr_t i = simd_oprsz(desc);
5319     uint64_t *g = vg;
5320 
5321     do {
5322         uint64_t pg = g[(i - 1) >> 6];
5323         do {
5324             i -= 8;
5325             if (likely((pg >> (i & 63)) & 1)) {
5326                 float64 e1, e2, e3, r;
5327 
5328                 e1 = *(uint64_t *)(vn + i) ^ neg1;
5329                 e2 = *(uint64_t *)(vm + i);
5330                 e3 = *(uint64_t *)(va + i) ^ neg3;
5331                 r = float64_muladd(e1, e2, e3, flags, status);
5332                 *(uint64_t *)(vd + i) = r;
5333             }
5334         } while (i & 63);
5335     } while (i != 0);
5336 }
5337 
5338 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5339                               void *vg, float_status *status, uint32_t desc)
5340 {
5341     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
5342 }
5343 
5344 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5345                               void *vg, float_status *status, uint32_t desc)
5346 {
5347     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0);
5348 }
5349 
5350 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5351                                void *vg, float_status *status, uint32_t desc)
5352 {
5353     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0);
5354 }
5355 
5356 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5357                                void *vg, float_status *status, uint32_t desc)
5358 {
5359     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0);
5360 }
5361 
5362 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5363                               void *vg, float_status *status, uint32_t desc)
5364 {
5365     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5366                     float_muladd_negate_product);
5367 }
5368 
5369 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5370                                void *vg, float_status *status, uint32_t desc)
5371 {
5372     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5373                     float_muladd_negate_product | float_muladd_negate_c);
5374 }
5375 
5376 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5377                                void *vg, float_status *status, uint32_t desc)
5378 {
5379     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5380                     float_muladd_negate_c);
5381 }
5382 
5383 /* Two operand floating-point comparison controlled by a predicate.
5384  * Unlike the integer version, we are not allowed to optimistically
5385  * compare operands, since the comparison may have side effects wrt
5386  * the FPSR.
5387  */
5388 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
5389 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
5390                   float_status *status, uint32_t desc)                  \
5391 {                                                                       \
5392     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
5393     uint64_t *d = vd, *g = vg;                                          \
5394     do {                                                                \
5395         uint64_t out = 0, pg = g[j];                                    \
5396         do {                                                            \
5397             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
5398             if (likely((pg >> (i & 63)) & 1)) {                         \
5399                 TYPE nn = *(TYPE *)(vn + H(i));                         \
5400                 TYPE mm = *(TYPE *)(vm + H(i));                         \
5401                 out |= OP(TYPE, nn, mm, status);                        \
5402             }                                                           \
5403         } while (i & 63);                                               \
5404         d[j--] = out;                                                   \
5405     } while (i > 0);                                                    \
5406 }
5407 
5408 #define DO_FPCMP_PPZZ_H(NAME, OP) \
5409     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
5410 #define DO_FPCMP_PPZZ_S(NAME, OP) \
5411     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
5412 #define DO_FPCMP_PPZZ_D(NAME, OP) \
5413     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
5414 
5415 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
5416     DO_FPCMP_PPZZ_H(NAME, OP)   \
5417     DO_FPCMP_PPZZ_S(NAME, OP)   \
5418     DO_FPCMP_PPZZ_D(NAME, OP)
5419 
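/*
 * The ordered comparisons (GE, GT, LE, LT, ACGE, ACGT) use the signalling
 * compare, so that NaN operands raise Invalid Operation, while EQ, NE and
 * UO use the quiet compare.
 */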
5420 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
5421 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
5422 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
5423 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
5424 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
5425 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
5426 #define DO_FCMUO(TYPE, X, Y, ST)  \
5427     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
5428 #define DO_FACGE(TYPE, X, Y, ST)  \
5429     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
5430 #define DO_FACGT(TYPE, X, Y, ST)  \
5431     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
5432 
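/*
 * As an illustration, DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) expands to the
 * three helpers sve_fcmge_h, sve_fcmge_s and sve_fcmge_d.  Each one walks
 * the vector backwards in 64-bit predicate chunks and deposits one result
 * bit per element, at the byte offset of that element within the chunk.
 */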
5433 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
5434 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
5435 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
5436 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
5437 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
5438 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
5439 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
5440 
5441 #undef DO_FPCMP_PPZZ_ALL
5442 #undef DO_FPCMP_PPZZ_D
5443 #undef DO_FPCMP_PPZZ_S
5444 #undef DO_FPCMP_PPZZ_H
5445 #undef DO_FPCMP_PPZZ
5446 
5447 /* One operand floating-point comparison against zero, controlled
5448  * by a predicate.
5449  */
5450 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
5451 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
5452                   float_status *status, uint32_t desc)     \
5453 {                                                          \
5454     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
5455     uint64_t *d = vd, *g = vg;                             \
5456     do {                                                   \
5457         uint64_t out = 0, pg = g[j];                       \
5458         do {                                               \
5459             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
5460             if ((pg >> (i & 63)) & 1) {                    \
5461                 TYPE nn = *(TYPE *)(vn + H(i));            \
5462                 out |= OP(TYPE, nn, 0, status);            \
5463             }                                              \
5464         } while (i & 63);                                  \
5465         d[j--] = out;                                      \
5466     } while (i > 0);                                       \
5467 }
5468 
5469 #define DO_FPCMP_PPZ0_H(NAME, OP) \
5470     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
5471 #define DO_FPCMP_PPZ0_S(NAME, OP) \
5472     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
5473 #define DO_FPCMP_PPZ0_D(NAME, OP) \
5474     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
5475 
5476 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
5477     DO_FPCMP_PPZ0_H(NAME, OP)   \
5478     DO_FPCMP_PPZ0_S(NAME, OP)   \
5479     DO_FPCMP_PPZ0_D(NAME, OP)
5480 
5481 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5482 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5483 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5484 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5485 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5486 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5487 
5488 /* FP Trig Multiply-Add. */
5489 
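/*
 * The coefficient tables below are indexed by the 3-bit immediate from the
 * descriptor; the first eight entries are selected when the second operand
 * is positive and the last eight (index + 8) when it is negative.  With
 * FPCR.AH == 1 the negation is folded into the multiply-add via
 * float_muladd_negate_product rather than taking the operand's absolute
 * value.
 */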
5490 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm,
5491                          float_status *s, uint32_t desc)
5492 {
5493     static const float16 coeff[16] = {
5494         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5495         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5496     };
5497     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5498     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5499     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5500     float16 *d = vd, *n = vn, *m = vm;
5501 
5502     for (i = 0; i < opr_sz; i++) {
5503         float16 mm = m[i];
5504         intptr_t xx = x;
5505         int flags = 0;
5506 
5507         if (float16_is_neg(mm)) {
5508             if (fpcr_ah) {
5509                 flags = float_muladd_negate_product;
5510             } else {
5511                 mm = float16_abs(mm);
5512             }
5513             xx += 8;
5514         }
5515         d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s);
5516     }
5517 }
5518 
5519 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm,
5520                          float_status *s, uint32_t desc)
5521 {
5522     static const float32 coeff[16] = {
5523         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5524         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5525         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5526         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5527     };
5528     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5529     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5530     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5531     float32 *d = vd, *n = vn, *m = vm;
5532 
5533     for (i = 0; i < opr_sz; i++) {
5534         float32 mm = m[i];
5535         intptr_t xx = x;
5536         int flags = 0;
5537 
5538         if (float32_is_neg(mm)) {
5539             if (fpcr_ah) {
5540                 flags = float_muladd_negate_product;
5541             } else {
5542                 mm = float32_abs(mm);
5543             }
5544             xx += 8;
5545         }
5546         d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s);
5547     }
5548 }
5549 
5550 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm,
5551                          float_status *s, uint32_t desc)
5552 {
5553     static const float64 coeff[16] = {
5554         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5555         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5556         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5557         0x3de5d8408868552full, 0x0000000000000000ull,
5558         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5559         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5560         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5561         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5562     };
5563     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5564     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5565     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5566     float64 *d = vd, *n = vn, *m = vm;
5567 
5568     for (i = 0; i < opr_sz; i++) {
5569         float64 mm = m[i];
5570         intptr_t xx = x;
5571         int flags = 0;
5572 
5573         if (float64_is_neg(mm)) {
5574             if (fpcr_ah) {
5575                 flags = float_muladd_negate_product;
5576             } else {
5577                 mm = float64_abs(mm);
5578             }
5579             xx += 8;
5580         }
5581         d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s);
5582     }
5583 }
5584 
5585 /*
5586  * FP Complex Add
5587  */
5588 
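/*
 * For FCADD, the @rot bit in the descriptor selects the 270-degree rotation
 * (negate the real part of the second operand) instead of the 90-degree
 * rotation (negate its imaginary part); the negation goes through
 * float*_maybe_ah_chs so that the FPCR.AH == 1 behaviour is respected.
 */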
5589 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5590                          float_status *s, uint32_t desc)
5591 {
5592     intptr_t j, i = simd_oprsz(desc);
5593     uint64_t *g = vg;
5594     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5595     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5596 
5597     do {
5598         uint64_t pg = g[(i - 1) >> 6];
5599         do {
5600             float16 e0, e1, e2, e3;
5601 
5602             /* I holds the real index; J holds the imag index.  */
5603             j = i - sizeof(float16);
5604             i -= 2 * sizeof(float16);
5605 
5606             e0 = *(float16 *)(vn + H1_2(i));
5607             e1 = *(float16 *)(vm + H1_2(j));
5608             e2 = *(float16 *)(vn + H1_2(j));
5609             e3 = *(float16 *)(vm + H1_2(i));
5610 
5611             if (rot) {
5612                 e3 = float16_maybe_ah_chs(e3, fpcr_ah);
5613             } else {
5614                 e1 = float16_maybe_ah_chs(e1, fpcr_ah);
5615             }
5616 
5617             if (likely((pg >> (i & 63)) & 1)) {
5618                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s);
5619             }
5620             if (likely((pg >> (j & 63)) & 1)) {
5621                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s);
5622             }
5623         } while (i & 63);
5624     } while (i != 0);
5625 }
5626 
5627 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5628                          float_status *s, uint32_t desc)
5629 {
5630     intptr_t j, i = simd_oprsz(desc);
5631     uint64_t *g = vg;
5632     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5633     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5634 
5635     do {
5636         uint64_t pg = g[(i - 1) >> 6];
5637         do {
5638             float32 e0, e1, e2, e3;
5639 
5640             /* I holds the real index; J holds the imag index.  */
5641             j = i - sizeof(float32);
5642             i -= 2 * sizeof(float32);
5643 
5644             e0 = *(float32 *)(vn + H1_2(i));
5645             e1 = *(float32 *)(vm + H1_2(j));
5646             e2 = *(float32 *)(vn + H1_2(j));
5647             e3 = *(float32 *)(vm + H1_2(i));
5648 
5649             if (rot) {
5650                 e3 = float32_maybe_ah_chs(e3, fpcr_ah);
5651             } else {
5652                 e1 = float32_maybe_ah_chs(e1, fpcr_ah);
5653             }
5654 
5655             if (likely((pg >> (i & 63)) & 1)) {
5656                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s);
5657             }
5658             if (likely((pg >> (j & 63)) & 1)) {
5659                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s);
5660             }
5661         } while (i & 63);
5662     } while (i != 0);
5663 }
5664 
5665 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5666                          float_status *s, uint32_t desc)
5667 {
5668     intptr_t j, i = simd_oprsz(desc);
5669     uint64_t *g = vg;
5670     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5671     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5672 
5673     do {
5674         uint64_t pg = g[(i - 1) >> 6];
5675         do {
5676             float64 e0, e1, e2, e3;
5677 
5678             /* I holds the real index; J holds the imag index.  */
5679             j = i - sizeof(float64);
5680             i -= 2 * sizeof(float64);
5681 
5682             e0 = *(float64 *)(vn + H1_2(i));
5683             e1 = *(float64 *)(vm + H1_2(j));
5684             e2 = *(float64 *)(vn + H1_2(j));
5685             e3 = *(float64 *)(vm + H1_2(i));
5686 
5687             if (rot) {
5688                 e3 = float64_maybe_ah_chs(e3, fpcr_ah);
5689             } else {
5690                 e1 = float64_maybe_ah_chs(e1, fpcr_ah);
5691             }
5692 
5693             if (likely((pg >> (i & 63)) & 1)) {
5694                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s);
5695             }
5696             if (likely((pg >> (j & 63)) & 1)) {
5697                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s);
5698             }
5699         } while (i & 63);
5700     } while (i != 0);
5701 }
5702 
5703 /*
5704  * FP Complex Multiply
5705  */
5706 
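/*
 * For FCMLA, the descriptor encodes the rotation as a "flip" bit (use the
 * imaginary rather than the real part of the first operand, and swap the
 * parts of the second) plus a negate-imaginary-product bit; the real-product
 * negation follows as flip ^ negate_imag.  As with FMLA above, FPCR.AH == 0
 * negates by flipping the operand sign bit (negx_*) while FPCR.AH == 1
 * passes float_muladd_negate_product to the fused operation (negf_*).
 */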
5707 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5708                                void *vg, float_status *status, uint32_t desc)
5709 {
5710     intptr_t j, i = simd_oprsz(desc);
5711     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5712     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5713     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5714     uint32_t negf_real = flip ^ negf_imag;
5715     float16 negx_imag, negx_real;
5716     uint64_t *g = vg;
5717 
5718     /* With AH=0, use negx; with AH=1 use negf. */
5719     negx_real = (negf_real & ~fpcr_ah) << 15;
5720     negx_imag = (negf_imag & ~fpcr_ah) << 15;
5721     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5722     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5723 
5724     do {
5725         uint64_t pg = g[(i - 1) >> 6];
5726         do {
5727             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5728 
5729             /* I holds the real index; J holds the imag index.  */
5730             j = i - sizeof(float16);
5731             i -= 2 * sizeof(float16);
5732 
5733             nr = *(float16 *)(vn + H1_2(i));
5734             ni = *(float16 *)(vn + H1_2(j));
5735             mr = *(float16 *)(vm + H1_2(i));
5736             mi = *(float16 *)(vm + H1_2(j));
5737 
5738             e2 = (flip ? ni : nr);
5739             e1 = (flip ? mi : mr) ^ negx_real;
5740             e4 = e2;
5741             e3 = (flip ? mr : mi) ^ negx_imag;
5742 
5743             if (likely((pg >> (i & 63)) & 1)) {
5744                 d = *(float16 *)(va + H1_2(i));
5745                 d = float16_muladd(e2, e1, d, negf_real, status);
5746                 *(float16 *)(vd + H1_2(i)) = d;
5747             }
5748             if (likely((pg >> (j & 63)) & 1)) {
5749                 d = *(float16 *)(va + H1_2(j));
5750                 d = float16_muladd(e4, e3, d, negf_imag, status);
5751                 *(float16 *)(vd + H1_2(j)) = d;
5752             }
5753         } while (i & 63);
5754     } while (i != 0);
5755 }
5756 
5757 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5758                                void *vg, float_status *status, uint32_t desc)
5759 {
5760     intptr_t j, i = simd_oprsz(desc);
5761     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5762     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5763     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5764     uint32_t negf_real = flip ^ negf_imag;
5765     float32 negx_imag, negx_real;
5766     uint64_t *g = vg;
5767 
5768     /* With AH=0, use negx; with AH=1 use negf. */
5769     negx_real = (negf_real & ~fpcr_ah) << 31;
5770     negx_imag = (negf_imag & ~fpcr_ah) << 31;
5771     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5772     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5773 
5774     do {
5775         uint64_t pg = g[(i - 1) >> 6];
5776         do {
5777             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5778 
5779             /* I holds the real index; J holds the imag index.  */
5780             j = i - sizeof(float32);
5781             i -= 2 * sizeof(float32);
5782 
5783             nr = *(float32 *)(vn + H1_2(i));
5784             ni = *(float32 *)(vn + H1_2(j));
5785             mr = *(float32 *)(vm + H1_2(i));
5786             mi = *(float32 *)(vm + H1_2(j));
5787 
5788             e2 = (flip ? ni : nr);
5789             e1 = (flip ? mi : mr) ^ negx_real;
5790             e4 = e2;
5791             e3 = (flip ? mr : mi) ^ negx_imag;
5792 
5793             if (likely((pg >> (i & 63)) & 1)) {
5794                 d = *(float32 *)(va + H1_2(i));
5795                 d = float32_muladd(e2, e1, d, negf_real, status);
5796                 *(float32 *)(vd + H1_2(i)) = d;
5797             }
5798             if (likely((pg >> (j & 63)) & 1)) {
5799                 d = *(float32 *)(va + H1_2(j));
5800                 d = float32_muladd(e4, e3, d, negf_imag, status);
5801                 *(float32 *)(vd + H1_2(j)) = d;
5802             }
5803         } while (i & 63);
5804     } while (i != 0);
5805 }
5806 
5807 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5808                                void *vg, float_status *status, uint32_t desc)
5809 {
5810     intptr_t j, i = simd_oprsz(desc);
5811     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5812     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5813     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5814     uint32_t negf_real = flip ^ negf_imag;
5815     float64 negx_imag, negx_real;
5816     uint64_t *g = vg;
5817 
5818     /* With AH=0, use negx; with AH=1 use negf. */
5819     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
5820     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
5821     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5822     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5823 
5824     do {
5825         uint64_t pg = g[(i - 1) >> 6];
5826         do {
5827             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5828 
5829             /* I holds the real index; J holds the imag index.  */
5830             j = i - sizeof(float64);
5831             i -= 2 * sizeof(float64);
5832 
5833             nr = *(float64 *)(vn + H1_2(i));
5834             ni = *(float64 *)(vn + H1_2(j));
5835             mr = *(float64 *)(vm + H1_2(i));
5836             mi = *(float64 *)(vm + H1_2(j));
5837 
5838             e2 = (flip ? ni : nr);
5839             e1 = (flip ? mi : mr) ^ negx_real;
5840             e4 = e2;
5841             e3 = (flip ? mr : mi) ^ negx_imag;
5842 
5843             if (likely((pg >> (i & 63)) & 1)) {
5844                 d = *(float64 *)(va + H1_2(i));
5845                 d = float64_muladd(e2, e1, d, negf_real, status);
5846                 *(float64 *)(vd + H1_2(i)) = d;
5847             }
5848             if (likely((pg >> (j & 63)) & 1)) {
5849                 d = *(float64 *)(va + H1_2(j));
5850                 d = float64_muladd(e4, e3, d, negf_imag, status);
5851                 *(float64 *)(vd + H1_2(j)) = d;
5852             }
5853         } while (i & 63);
5854     } while (i != 0);
5855 }
5856 
5857 /*
5858  * Load contiguous data, protected by a governing predicate.
5859  */
5860 
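/*
 * The contiguous load/store path below proceeds in stages:
 * sve_cont_ldst_elements() locates the active elements and any page split,
 * sve_cont_ldst_pages() probes the host page(s), then watchpoint and MTE
 * checks are applied before the data is finally transferred via the host
 * or tlb access functions.
 */
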
5861 /*
5862  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5863  * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
5864  * element >= @reg_off, or @reg_max if there were no active elements at all.
5865  */
5866 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5867                                  intptr_t reg_max, int esz)
5868 {
5869     uint64_t pg_mask = pred_esz_masks[esz];
5870     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5871 
5872     /* In normal usage, the first element is active.  */
5873     if (likely(pg & 1)) {
5874         return reg_off;
5875     }
5876 
5877     if (pg == 0) {
5878         reg_off &= -64;
5879         do {
5880             reg_off += 64;
5881             if (unlikely(reg_off >= reg_max)) {
5882                 /* The entire predicate was false.  */
5883                 return reg_max;
5884             }
5885             pg = vg[reg_off >> 6] & pg_mask;
5886         } while (pg == 0);
5887     }
5888     reg_off += ctz64(pg);
5889 
5890     /* We should never see an out of range predicate bit set.  */
5891     tcg_debug_assert(reg_off < reg_max);
5892     return reg_off;
5893 }
5894 
5895 /*
5896  * Resolve the guest virtual address to info->host and info->flags.
5897  * If @nofault, return false if the page is invalid, otherwise
5898  * exit via page fault exception.
5899  */
5900 
5901 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5902                     target_ulong addr, int mem_off, MMUAccessType access_type,
5903                     int mmu_idx, uintptr_t retaddr)
5904 {
5905     int flags;
5906 
5907     addr += mem_off;
5908 
5909     /*
5910      * User-only currently always issues with TBI.  See the comment
5911      * above useronly_clean_ptr.  Usually we clean this top byte away
5912      * during translation, but we can't do that for e.g. vector + imm
5913      * addressing modes.
5914      *
5915      * We currently always enable TBI for user-only, and do not provide
5916      * a way to turn it off.  So clean the pointer unconditionally here,
5917      * rather than look it up here, or pass it down from above.
5918      */
5919     addr = useronly_clean_ptr(addr);
5920 
5921 #ifdef CONFIG_USER_ONLY
5922     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5923                                &info->host, retaddr);
5924 #else
5925     CPUTLBEntryFull *full;
5926     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5927                               &info->host, &full, retaddr);
5928 #endif
5929     info->flags = flags;
5930 
5931     if (flags & TLB_INVALID_MASK) {
5932         g_assert(nofault);
5933         return false;
5934     }
5935 
5936 #ifdef CONFIG_USER_ONLY
5937     memset(&info->attrs, 0, sizeof(info->attrs));
5938     /* Require both ANON and MTE; see allocation_tag_mem(). */
5939     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5940 #else
5941     info->attrs = full->attrs;
5942     info->tagged = full->extra.arm.pte_attrs == 0xf0;
5943 #endif
5944 
5945     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5946     info->host -= mem_off;
5947     return true;
5948 }
5949 
5950 /*
5951  * Find first active element on each page, and a loose bound for the
5952  * final element on each page.  Identify any single element that spans
5953  * the page boundary.  Return true if there are any active elements.
5954  */
5955 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5956                             intptr_t reg_max, int esz, int msize)
5957 {
5958     const int esize = 1 << esz;
5959     const uint64_t pg_mask = pred_esz_masks[esz];
5960     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5961     intptr_t mem_off_last, mem_off_split;
5962     intptr_t page_split, elt_split;
5963     intptr_t i;
5964 
5965     /* Set all of the element indices to -1, and the TLB data to 0. */
5966     memset(info, -1, offsetof(SVEContLdSt, page));
5967     memset(info->page, 0, sizeof(info->page));
5968 
5969     /* Gross scan over the entire predicate to find bounds. */
5970     i = 0;
5971     do {
5972         uint64_t pg = vg[i] & pg_mask;
5973         if (pg) {
5974             reg_off_last = i * 64 + 63 - clz64(pg);
5975             if (reg_off_first < 0) {
5976                 reg_off_first = i * 64 + ctz64(pg);
5977             }
5978         }
5979     } while (++i * 64 < reg_max);
5980 
5981     if (unlikely(reg_off_first < 0)) {
5982         /* No active elements, no pages touched. */
5983         return false;
5984     }
5985     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5986 
5987     info->reg_off_first[0] = reg_off_first;
5988     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5989     mem_off_last = (reg_off_last >> esz) * msize;
5990 
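    /* Number of bytes from addr to the end of the page containing it. */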
5991     page_split = -(addr | TARGET_PAGE_MASK);
5992     if (likely(mem_off_last + msize <= page_split)) {
5993         /* The entire operation fits within a single page. */
5994         info->reg_off_last[0] = reg_off_last;
5995         return true;
5996     }
5997 
5998     info->page_split = page_split;
5999     elt_split = page_split / msize;
6000     reg_off_split = elt_split << esz;
6001     mem_off_split = elt_split * msize;
6002 
6003     /*
6004      * This is the last full element on the first page, but it is not
6005      * necessarily active.  If there is no full element, i.e. the first
6006      * active element is the one that's split, this value remains -1.
6007      * It is useful as iteration bounds.
6008      * It is useful as an iteration bound.
6009     if (elt_split != 0) {
6010         info->reg_off_last[0] = reg_off_split - esize;
6011     }
6012 
6013     /* Determine if an unaligned element spans the pages.  */
6014     if (page_split % msize != 0) {
6015         /* It is helpful to know if the split element is active. */
6016         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
6017             info->reg_off_split = reg_off_split;
6018             info->mem_off_split = mem_off_split;
6019 
6020             if (reg_off_split == reg_off_last) {
6021                 /* The page crossing element is last. */
6022                 return true;
6023             }
6024         }
6025         reg_off_split += esize;
6026         mem_off_split += msize;
6027     }
6028 
6029     /*
6030      * We do want the first active element on the second page, because
6031      * this may affect the address reported in an exception.
6032      */
6033     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
6034     tcg_debug_assert(reg_off_split <= reg_off_last);
6035     info->reg_off_first[1] = reg_off_split;
6036     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
6037     info->reg_off_last[1] = reg_off_last;
6038     return true;
6039 }
6040 
6041 /*
6042  * Resolve the guest virtual addresses to info->page[].
6043  * Control the generation of page faults with @fault.  Return false if
6044  * there is no work to do, which can only happen with @fault == FAULT_NO.
6045  */
6046 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
6047                          CPUARMState *env, target_ulong addr,
6048                          MMUAccessType access_type, uintptr_t retaddr)
6049 {
6050     int mmu_idx = arm_env_mmu_index(env);
6051     int mem_off = info->mem_off_first[0];
6052     bool nofault = fault == FAULT_NO;
6053     bool have_work = true;
6054 
6055     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
6056                         access_type, mmu_idx, retaddr)) {
6057         /* No work to be done. */
6058         return false;
6059     }
6060 
6061     if (likely(info->page_split < 0)) {
6062         /* The entire operation was on the one page. */
6063         return true;
6064     }
6065 
6066     /*
6067      * If the second page is invalid, then we want the fault address to be
6068      * the first byte on that page which is accessed.
6069      */
6070     if (info->mem_off_split >= 0) {
6071         /*
6072          * There is an element split across the pages.  The fault address
6073          * should be the first byte of the second page.
6074          */
6075         mem_off = info->page_split;
6076         /*
6077          * If the split element is also the first active element
6078          * of the vector, then:  For first-fault we should continue
6079          * to generate faults for the second page.  For no-fault,
6080          * we have work only if the second page is valid.
6081          */
6082         if (info->mem_off_first[0] < info->mem_off_split) {
6083             nofault = FAULT_FIRST;
6084             have_work = false;
6085         }
6086     } else {
6087         /*
6088          * There is no element split across the pages.  The fault address
6089          * should be the first active element on the second page.
6090          */
6091         mem_off = info->mem_off_first[1];
6092         /*
6093          * There must have been one active element on the first page,
6094          * so we're out of first-fault territory.
6095          */
6096         nofault = fault != FAULT_ALL;
6097     }
6098 
6099     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
6100                                 access_type, mmu_idx, retaddr);
6101     return have_work;
6102 }
6103 
6104 #ifndef CONFIG_USER_ONLY
6105 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
6106                                uint64_t *vg, target_ulong addr,
6107                                int esize, int msize, int wp_access,
6108                                uintptr_t retaddr)
6109 {
6110     intptr_t mem_off, reg_off, reg_last;
6111     int flags0 = info->page[0].flags;
6112     int flags1 = info->page[1].flags;
6113 
6114     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
6115         return;
6116     }
6117 
6118     /* Indicate that watchpoints are handled. */
6119     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
6120     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
6121 
6122     if (flags0 & TLB_WATCHPOINT) {
6123         mem_off = info->mem_off_first[0];
6124         reg_off = info->reg_off_first[0];
6125         reg_last = info->reg_off_last[0];
6126 
6127         while (reg_off <= reg_last) {
6128             uint64_t pg = vg[reg_off >> 6];
6129             do {
6130                 if ((pg >> (reg_off & 63)) & 1) {
6131                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
6132                                          msize, info->page[0].attrs,
6133                                          wp_access, retaddr);
6134                 }
6135                 reg_off += esize;
6136                 mem_off += msize;
6137             } while (reg_off <= reg_last && (reg_off & 63));
6138         }
6139     }
6140 
6141     mem_off = info->mem_off_split;
6142     if (mem_off >= 0) {
6143         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
6144                              info->page[0].attrs, wp_access, retaddr);
6145     }
6146 
6147     mem_off = info->mem_off_first[1];
6148     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
6149         reg_off = info->reg_off_first[1];
6150         reg_last = info->reg_off_last[1];
6151 
6152         do {
6153             uint64_t pg = vg[reg_off >> 6];
6154             do {
6155                 if ((pg >> (reg_off & 63)) & 1) {
6156                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
6157                                          msize, info->page[1].attrs,
6158                                          wp_access, retaddr);
6159                 }
6160                 reg_off += esize;
6161                 mem_off += msize;
6162             } while (reg_off & 63);
6163         } while (reg_off <= reg_last);
6164     }
6165 }
6166 #endif
6167 
6168 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
6169                              uint64_t *vg, target_ulong addr, int esize,
6170                              int msize, uint32_t mtedesc, uintptr_t ra)
6171 {
6172     intptr_t mem_off, reg_off, reg_last;
6173 
6174     /* Process the page only if MemAttr == Tagged. */
6175     if (info->page[0].tagged) {
6176         mem_off = info->mem_off_first[0];
6177         reg_off = info->reg_off_first[0];
6178         reg_last = info->reg_off_split;
6179         if (reg_last < 0) {
6180             reg_last = info->reg_off_last[0];
6181         }
6182 
6183         do {
6184             uint64_t pg = vg[reg_off >> 6];
6185             do {
6186                 if ((pg >> (reg_off & 63)) & 1) {
6187                     mte_check(env, mtedesc, addr + mem_off, ra);
6188                 }
6189                 reg_off += esize;
6190                 mem_off += msize;
6191             } while (reg_off <= reg_last && (reg_off & 63));
6192         } while (reg_off <= reg_last);
6193     }
6194 
6195     mem_off = info->mem_off_first[1];
6196     if (mem_off >= 0 && info->page[1].tagged) {
6197         reg_off = info->reg_off_first[1];
6198         reg_last = info->reg_off_last[1];
6199 
6200         do {
6201             uint64_t pg = vg[reg_off >> 6];
6202             do {
6203                 if ((pg >> (reg_off & 63)) & 1) {
6204                     mte_check(env, mtedesc, addr + mem_off, ra);
6205                 }
6206                 reg_off += esize;
6207                 mem_off += msize;
6208             } while (reg_off & 63);
6209         } while (reg_off <= reg_last);
6210     }
6211 }
6212 
6213 /*
6214  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6215  */
6216 static inline QEMU_ALWAYS_INLINE
6217 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
6218                uint32_t desc, const uintptr_t retaddr,
6219                const int esz, const int msz, const int N, uint32_t mtedesc,
6220                sve_ldst1_host_fn *host_fn,
6221                sve_ldst1_tlb_fn *tlb_fn)
6222 {
6223     const unsigned rd = simd_data(desc);
6224     const intptr_t reg_max = simd_oprsz(desc);
6225     intptr_t reg_off, reg_last, mem_off;
6226     SVEContLdSt info;
6227     void *host;
6228     int flags, i;
6229 
6230     /* Find the active elements.  */
6231     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6232         /* The entire predicate was false; no load occurs.  */
6233         for (i = 0; i < N; ++i) {
6234             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
6235         }
6236         return;
6237     }
6238 
6239     /* Probe the page(s).  Exit with exception for any invalid page. */
6240     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
6241 
6242     /* Handle watchpoints for all active elements. */
6243     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6244                               BP_MEM_READ, retaddr);
6245 
6246     /*
6247      * Handle mte checks for all active elements.
6248      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6249      */
6250     if (mtedesc) {
6251         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6252                                 mtedesc, retaddr);
6253     }
6254 
6255     flags = info.page[0].flags | info.page[1].flags;
6256     if (unlikely(flags != 0)) {
6257         /*
6258          * At least one page includes MMIO.
6259          * Any bus operation can fail with cpu_transaction_failed,
6260          * which for ARM will raise SyncExternal.  Perform the load
6261          * into scratch memory to preserve register state until the end.
6262          */
6263         ARMVectorReg scratch[4] = { };
6264 
6265         mem_off = info.mem_off_first[0];
6266         reg_off = info.reg_off_first[0];
6267         reg_last = info.reg_off_last[1];
6268         if (reg_last < 0) {
6269             reg_last = info.reg_off_split;
6270             if (reg_last < 0) {
6271                 reg_last = info.reg_off_last[0];
6272             }
6273         }
6274 
6275         do {
6276             uint64_t pg = vg[reg_off >> 6];
6277             do {
6278                 if ((pg >> (reg_off & 63)) & 1) {
6279                     for (i = 0; i < N; ++i) {
6280                         tlb_fn(env, &scratch[i], reg_off,
6281                                addr + mem_off + (i << msz), retaddr);
6282                     }
6283                 }
6284                 reg_off += 1 << esz;
6285                 mem_off += N << msz;
6286             } while (reg_off & 63);
6287         } while (reg_off <= reg_last);
6288 
6289         for (i = 0; i < N; ++i) {
6290             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
6291         }
6292         return;
6293     }
6294 
6295     /* The entire operation is in RAM, on valid pages. */
6296 
6297     for (i = 0; i < N; ++i) {
6298         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
6299     }
6300 
6301     mem_off = info.mem_off_first[0];
6302     reg_off = info.reg_off_first[0];
6303     reg_last = info.reg_off_last[0];
6304     host = info.page[0].host;
6305 
6306     set_helper_retaddr(retaddr);
6307 
6308     while (reg_off <= reg_last) {
6309         uint64_t pg = vg[reg_off >> 6];
6310         do {
6311             if ((pg >> (reg_off & 63)) & 1) {
6312                 for (i = 0; i < N; ++i) {
6313                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6314                             host + mem_off + (i << msz));
6315                 }
6316             }
6317             reg_off += 1 << esz;
6318             mem_off += N << msz;
6319         } while (reg_off <= reg_last && (reg_off & 63));
6320     }
6321 
6322     clear_helper_retaddr();
6323 
6324     /*
6325      * Use the slow path to manage the cross-page misalignment.
6326      * But we know this is RAM and cannot trap.
6327      */
6328     mem_off = info.mem_off_split;
6329     if (unlikely(mem_off >= 0)) {
6330         reg_off = info.reg_off_split;
6331         for (i = 0; i < N; ++i) {
6332             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6333                    addr + mem_off + (i << msz), retaddr);
6334         }
6335     }
6336 
6337     mem_off = info.mem_off_first[1];
6338     if (unlikely(mem_off >= 0)) {
6339         reg_off = info.reg_off_first[1];
6340         reg_last = info.reg_off_last[1];
6341         host = info.page[1].host;
6342 
6343         set_helper_retaddr(retaddr);
6344 
6345         do {
6346             uint64_t pg = vg[reg_off >> 6];
6347             do {
6348                 if ((pg >> (reg_off & 63)) & 1) {
6349                     for (i = 0; i < N; ++i) {
6350                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6351                                 host + mem_off + (i << msz));
6352                     }
6353                 }
6354                 reg_off += 1 << esz;
6355                 mem_off += N << msz;
6356             } while (reg_off & 63);
6357         } while (reg_off <= reg_last);
6358 
6359         clear_helper_retaddr();
6360     }
6361 }
6362 
6363 static inline QEMU_ALWAYS_INLINE
6364 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6365                    uint64_t desc, const uintptr_t ra,
6366                    const int esz, const int msz, const int N,
6367                    sve_ldst1_host_fn *host_fn,
6368                    sve_ldst1_tlb_fn *tlb_fn)
6369 {
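    /* The MTE descriptor is packed into the high 32 bits of the 64-bit desc. */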
6370     uint32_t mtedesc = desc >> 32;
6371     int bit55 = extract64(addr, 55, 1);
6372 
6373     /* Perform gross MTE suppression early. */
6374     if (!tbi_check(mtedesc, bit55) ||
6375         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6376         mtedesc = 0;
6377     }
6378 
6379     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6380 }
6381 
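/*
 * The helper names encode the memory element size and the register element
 * size plus signedness, e.g. ld1bhu is a byte load zero-extended to halfword
 * elements and ld1bhs the sign-extended form.  DO_LD1_2 also instantiates
 * little- and big-endian variants, each with and without MTE.
 */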
6382 #define DO_LD1_1(NAME, ESZ)                                             \
6383 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
6384                             target_ulong addr, uint64_t desc)           \
6385 {                                                                       \
6386     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
6387               sve_##NAME##_host, sve_##NAME##_tlb);                     \
6388 }                                                                       \
6389 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
6390                                 target_ulong addr, uint64_t desc)       \
6391 {                                                                       \
6392     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
6393                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
6394 }
6395 
6396 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
6397 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
6398                                target_ulong addr, uint64_t desc)        \
6399 {                                                                       \
6400     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6401               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
6402 }                                                                       \
6403 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
6404                                target_ulong addr, uint64_t desc)        \
6405 {                                                                       \
6406     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6407               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
6408 }                                                                       \
6409 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
6410                                    target_ulong addr, uint64_t desc)    \
6411 {                                                                       \
6412     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6413                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
6414 }                                                                       \
6415 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
6416                                    target_ulong addr, uint64_t desc)    \
6417 {                                                                       \
6418     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6419                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
6420 }
6421 
6422 DO_LD1_1(ld1bb,  MO_8)
6423 DO_LD1_1(ld1bhu, MO_16)
6424 DO_LD1_1(ld1bhs, MO_16)
6425 DO_LD1_1(ld1bsu, MO_32)
6426 DO_LD1_1(ld1bss, MO_32)
6427 DO_LD1_1(ld1bdu, MO_64)
6428 DO_LD1_1(ld1bds, MO_64)
6429 
6430 DO_LD1_2(ld1hh,  MO_16, MO_16)
6431 DO_LD1_2(ld1hsu, MO_32, MO_16)
6432 DO_LD1_2(ld1hss, MO_32, MO_16)
6433 DO_LD1_2(ld1hdu, MO_64, MO_16)
6434 DO_LD1_2(ld1hds, MO_64, MO_16)
6435 
6436 DO_LD1_2(ld1ss,  MO_32, MO_32)
6437 DO_LD1_2(ld1sdu, MO_64, MO_32)
6438 DO_LD1_2(ld1sds, MO_64, MO_32)
6439 
6440 DO_LD1_2(ld1dd,  MO_64, MO_64)
6441 
6442 DO_LD1_2(ld1squ, MO_128, MO_32)
6443 DO_LD1_2(ld1dqu, MO_128, MO_64)
6444 
6445 #undef DO_LD1_1
6446 #undef DO_LD1_2
6447 
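/*
 * The LD2/LD3/LD4 helpers reuse the single-element load functions, with
 * sve_ldN_r interleaving N consecutive destination registers under the
 * same governing predicate.
 */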
6448 #define DO_LDN_1(N)                                                     \
6449 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
6450                              target_ulong addr, uint64_t desc)          \
6451 {                                                                       \
6452     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
6453               sve_ld1bb_host, sve_ld1bb_tlb);                           \
6454 }                                                                       \
6455 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
6456                                  target_ulong addr, uint64_t desc)      \
6457 {                                                                       \
6458     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
6459                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
6460 }
6461 
6462 #define DO_LDN_2(N, SUFF, ESZ)                                          \
6463 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
6464                                     target_ulong addr, uint64_t desc)   \
6465 {                                                                       \
6466     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6467               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
6468 }                                                                       \
6469 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
6470                                     target_ulong addr, uint64_t desc)   \
6471 {                                                                       \
6472     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6473               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
6474 }                                                                       \
6475 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
6476                                         target_ulong addr, uint64_t desc) \
6477 {                                                                       \
6478     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6479                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
6480 }                                                                       \
6481 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
6482                                         target_ulong addr, uint64_t desc) \
6483 {                                                                       \
6484     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6485                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
6486 }
6487 
6488 DO_LDN_1(2)
6489 DO_LDN_1(3)
6490 DO_LDN_1(4)
6491 
6492 DO_LDN_2(2, hh, MO_16)
6493 DO_LDN_2(3, hh, MO_16)
6494 DO_LDN_2(4, hh, MO_16)
6495 
6496 DO_LDN_2(2, ss, MO_32)
6497 DO_LDN_2(3, ss, MO_32)
6498 DO_LDN_2(4, ss, MO_32)
6499 
6500 DO_LDN_2(2, dd, MO_64)
6501 DO_LDN_2(3, dd, MO_64)
6502 DO_LDN_2(4, dd, MO_64)
6503 
6504 DO_LDN_2(2, qq, MO_128)
6505 DO_LDN_2(3, qq, MO_128)
6506 DO_LDN_2(4, qq, MO_128)
6507 
6508 #undef DO_LDN_1
6509 #undef DO_LDN_2
6510 
6511 /*
6512  * Load contiguous data, first-fault and no-fault.
6513  *
6514  * For user-only, we control the race between page_check_range and
6515  * another thread's munmap by using set/clear_helper_retaddr.  Any
6516  * SEGV that occurs between those markers is assumed to be because
6517  * the guest page vanished.  Keep that block as small as possible
6518  * so that unrelated QEMU bugs are not blamed on the guest.
6519  */
6520 
6521 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
6522  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6523  * option, which leaves subsequent data unchanged.
6524  */
6525 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6526 {
6527     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6528 
6529     if (i & 63) {
6530         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6531         i = ROUND_UP(i, 64);
6532     }
6533     for (; i < oprsz; i += 64) {
6534         ffr[i / 64] = 0;
6535     }
6536 }
6537 
6538 /*
6539  * Common helper for all contiguous no-fault and first-fault loads.
6540  */
6541 static inline QEMU_ALWAYS_INLINE
6542 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6543                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6544                    const int esz, const int msz, const SVEContFault fault,
6545                    sve_ldst1_host_fn *host_fn,
6546                    sve_ldst1_tlb_fn *tlb_fn)
6547 {
6548     const unsigned rd = simd_data(desc);
6549     void *vd = &env->vfp.zregs[rd];
6550     const intptr_t reg_max = simd_oprsz(desc);
6551     intptr_t reg_off, mem_off, reg_last;
6552     SVEContLdSt info;
6553     int flags;
6554     void *host;
6555 
6556     /* Find the active elements.  */
6557     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6558         /* The entire predicate was false; no load occurs.  */
6559         memset(vd, 0, reg_max);
6560         return;
6561     }
6562     reg_off = info.reg_off_first[0];
6563 
6564     /* Probe the page(s). */
6565     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6566         /* Fault on first element. */
6567         tcg_debug_assert(fault == FAULT_NO);
6568         memset(vd, 0, reg_max);
6569         goto do_fault;
6570     }
6571 
6572     mem_off = info.mem_off_first[0];
6573     flags = info.page[0].flags;
6574 
6575     /*
6576      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6577      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6578      */
6579     if (!info.page[0].tagged) {
6580         mtedesc = 0;
6581     }
6582 
6583     if (fault == FAULT_FIRST) {
6584         /* Trapping mte check for the first-fault element.  */
6585         if (mtedesc) {
6586             mte_check(env, mtedesc, addr + mem_off, retaddr);
6587         }
6588 
6589         /*
6590          * Special handling of the first active element,
6591          * if it crosses a page boundary or is MMIO.
6592          */
6593         bool is_split = mem_off == info.mem_off_split;
6594         if (unlikely(flags != 0) || unlikely(is_split)) {
6595             /*
6596              * Use the slow path for cross-page handling.
6597              * Might trap for MMIO or watchpoints.
6598              */
6599             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6600 
6601             /* After any fault, zero the other elements. */
6602             swap_memzero(vd, reg_off);
6603             reg_off += 1 << esz;
6604             mem_off += 1 << msz;
6605             swap_memzero(vd + reg_off, reg_max - reg_off);
6606 
6607             if (is_split) {
6608                 goto second_page;
6609             }
6610         } else {
6611             memset(vd, 0, reg_max);
6612         }
6613     } else {
6614         memset(vd, 0, reg_max);
6615         if (unlikely(mem_off == info.mem_off_split)) {
6616             /* The first active element crosses a page boundary. */
6617             flags |= info.page[1].flags;
6618             if (unlikely(flags & TLB_MMIO)) {
6619                 /* Some page is MMIO, see below. */
6620                 goto do_fault;
6621             }
6622             if (unlikely(flags & TLB_WATCHPOINT) &&
6623                 (cpu_watchpoint_address_matches
6624                  (env_cpu(env), addr + mem_off, 1 << msz)
6625                  & BP_MEM_READ)) {
6626                 /* Watchpoint hit, see below. */
6627                 goto do_fault;
6628             }
6629             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6630                 goto do_fault;
6631             }
6632             /*
6633              * Use the slow path for cross-page handling.
6634              * This is RAM, without a watchpoint, and will not trap.
6635              */
6636             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6637             goto second_page;
6638         }
6639     }
6640 
6641     /*
6642      * From this point on, all memory operations are MemSingleNF.
6643      *
6644      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6645      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6646      *
6647      * Unfortunately we do not have access to the memory attributes from the
6648      * PTE to tell Device memory from Normal memory.  So we make a mostly
6649      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6650      * This gives the right answer for the common cases of "Normal memory,
6651      * backed by host RAM" and "Device memory, backed by MMIO".
6652      * The architecture allows us to suppress an NF load and return
6653      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6654      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6655      * get wrong is "Device memory, backed by host RAM", for which we
6656      * should return (UNKNOWN, FAULT) but do not.
6657      *
6658      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6659      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6660      * architectural breakpoints the same.
6661      */
6662     if (unlikely(flags & TLB_MMIO)) {
6663         goto do_fault;
6664     }
6665 
6666     reg_last = info.reg_off_last[0];
6667     host = info.page[0].host;
6668 
6669     set_helper_retaddr(retaddr);
6670 
6671     do {
6672         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6673         do {
6674             if ((pg >> (reg_off & 63)) & 1) {
6675                 if (unlikely(flags & TLB_WATCHPOINT) &&
6676                     (cpu_watchpoint_address_matches
6677                      (env_cpu(env), addr + mem_off, 1 << msz)
6678                      & BP_MEM_READ)) {
6679                     clear_helper_retaddr();
6680                     goto do_fault;
6681                 }
6682                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6683                     clear_helper_retaddr();
6684                     goto do_fault;
6685                 }
6686                 host_fn(vd, reg_off, host + mem_off);
6687             }
6688             reg_off += 1 << esz;
6689             mem_off += 1 << msz;
6690         } while (reg_off <= reg_last && (reg_off & 63));
6691     } while (reg_off <= reg_last);
6692 
6693     clear_helper_retaddr();
6694 
6695     /*
6696      * MemSingleNF is allowed to fail for any reason.  We have special
6697      * code above to handle the first element crossing a page boundary.
6698      * As an implementation choice, decline to handle a cross-page element
6699      * in any other position.
6700      */
6701     reg_off = info.reg_off_split;
6702     if (reg_off >= 0) {
6703         goto do_fault;
6704     }
6705 
6706  second_page:
6707     reg_off = info.reg_off_first[1];
6708     if (likely(reg_off < 0)) {
6709         /* No active elements on the second page.  All done. */
6710         return;
6711     }
6712 
6713     /*
6714      * MemSingleNF is allowed to fail for any reason.  As an implementation
6715      * choice, decline to handle elements on the second page.  This should
6716      * be low frequency as the guest walks through memory -- the next
6717      * iteration of the guest's loop should be aligned on the page boundary,
6718      * and then all following iterations will stay aligned.
6719      */
6720 
6721  do_fault:
6722     record_fault(env, reg_off, reg_max);
6723 }
6724 
6725 static inline QEMU_ALWAYS_INLINE
6726 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6727                        uint64_t desc, const uintptr_t retaddr,
6728                        const int esz, const int msz, const SVEContFault fault,
6729                        sve_ldst1_host_fn *host_fn,
6730                        sve_ldst1_tlb_fn *tlb_fn)
6731 {
6732     uint32_t mtedesc = desc >> 32;
6733     int bit55 = extract64(addr, 55, 1);
6734 
6735     /* Perform gross MTE suppression early. */
6736     if (!tbi_check(mtedesc, bit55) ||
6737         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6738         mtedesc = 0;
6739     }
6740 
6741     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6742                   esz, msz, fault, host_fn, tlb_fn);
6743 }
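/*
 * Editor's note on the "gross MTE suppression" above: mtedesc is zeroed
 * either when TBI is disabled for the half of the address space selected by
 * bit 55 (so there is no tag byte to check), or when TCMA applies to the
 * logical tag extracted from the address, making the access unchecked.
 * This is only a coarse early filter; the per-page "tagged" test still
 * happens inside sve_ldnfff1_r().
 */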
6744 
6745 #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6746 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6747                                  target_ulong addr, uint64_t desc)      \
6748 {                                                                       \
6749     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6750                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6751 }                                                                       \
6752 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6753                                  target_ulong addr, uint64_t desc)      \
6754 {                                                                       \
6755     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6756                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6757 }                                                                       \
6758 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6759                                      target_ulong addr, uint64_t desc)  \
6760 {                                                                       \
6761     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6762                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6763 }                                                                       \
6764 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6765                                      target_ulong addr, uint64_t desc)  \
6766 {                                                                       \
6767     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6768                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6769 }
6770 
6771 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6772 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6773                                     target_ulong addr, uint64_t desc)   \
6774 {                                                                       \
6775     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6776                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6777 }                                                                       \
6778 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6779                                     target_ulong addr, uint64_t desc)   \
6780 {                                                                       \
6781     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6782                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6783 }                                                                       \
6784 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6785                                     target_ulong addr, uint64_t desc)   \
6786 {                                                                       \
6787     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6788                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6789 }                                                                       \
6790 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6791                                     target_ulong addr, uint64_t desc)   \
6792 {                                                                       \
6793     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6794                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6795 }                                                                       \
6796 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6797                                         target_ulong addr, uint64_t desc) \
6798 {                                                                       \
6799     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6800                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6801 }                                                                       \
6802 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6803                                         target_ulong addr, uint64_t desc) \
6804 {                                                                       \
6805     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6806                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6807 }                                                                       \
6808 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6809                                         target_ulong addr, uint64_t desc) \
6810 {                                                                       \
6811     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6812                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6813 }                                                                       \
6814 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6815                                         target_ulong addr, uint64_t desc) \
6816 {                                                                       \
6817     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6818                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6819 }
6820 
6821 DO_LDFF1_LDNF1_1(bb,  MO_8)
6822 DO_LDFF1_LDNF1_1(bhu, MO_16)
6823 DO_LDFF1_LDNF1_1(bhs, MO_16)
6824 DO_LDFF1_LDNF1_1(bsu, MO_32)
6825 DO_LDFF1_LDNF1_1(bss, MO_32)
6826 DO_LDFF1_LDNF1_1(bdu, MO_64)
6827 DO_LDFF1_LDNF1_1(bds, MO_64)
6828 
6829 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6830 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6831 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6832 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6833 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6834 
6835 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6836 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6837 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6838 
6839 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6840 
6841 #undef DO_LDFF1_LDNF1_1
6842 #undef DO_LDFF1_LDNF1_2
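/*
 * For reference (editor's note): each DO_LDFF1_LDNF1_2 instantiation above
 * expands to eight helpers -- {ldff1, ldnf1} x {_le, _be} x {plain, _mte} --
 * e.g. DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) yields sve_ldff1hsu_le_r,
 * sve_ldnf1hsu_be_r_mte, and so on, all reaching sve_ldnfff1_r() either
 * directly or via sve_ldnfff1_r_mte().
 */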
6843 
6844 /*
6845  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6846  */
6847 
6848 static inline QEMU_ALWAYS_INLINE
6849 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6850                uint32_t desc, const uintptr_t retaddr,
6851                const int esz, const int msz, const int N, uint32_t mtedesc,
6852                sve_ldst1_host_fn *host_fn,
6853                sve_ldst1_tlb_fn *tlb_fn)
6854 {
6855     const unsigned rd = simd_data(desc);
6856     const intptr_t reg_max = simd_oprsz(desc);
6857     intptr_t reg_off, reg_last, mem_off;
6858     SVEContLdSt info;
6859     void *host;
6860     int i, flags;
6861 
6862     /* Find the active elements.  */
6863     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6864         /* The entire predicate was false; no store occurs.  */
6865         return;
6866     }
6867 
6868     /* Probe the page(s).  Exit with exception for any invalid page. */
6869     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6870 
6871     /* Handle watchpoints for all active elements. */
6872     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6873                               BP_MEM_WRITE, retaddr);
6874 
6875     /*
6876      * Handle MTE checks for all active elements.
6877      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6878      */
6879     if (mtedesc) {
6880         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6881                                 mtedesc, retaddr);
6882     }
6883 
6884     flags = info.page[0].flags | info.page[1].flags;
6885     if (unlikely(flags != 0)) {
6886         /*
6887          * At least one page includes MMIO.
6888          * Any bus operation can fail with cpu_transaction_failed,
6889          * which for ARM will raise SyncExternal.  We cannot avoid
6890          * this fault and will leave with the store incomplete.
6891          */
6892         mem_off = info.mem_off_first[0];
6893         reg_off = info.reg_off_first[0];
6894         reg_last = info.reg_off_last[1];
6895         if (reg_last < 0) {
6896             reg_last = info.reg_off_split;
6897             if (reg_last < 0) {
6898                 reg_last = info.reg_off_last[0];
6899             }
6900         }
6901 
6902         do {
6903             uint64_t pg = vg[reg_off >> 6];
6904             do {
6905                 if ((pg >> (reg_off & 63)) & 1) {
6906                     for (i = 0; i < N; ++i) {
6907                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6908                                addr + mem_off + (i << msz), retaddr);
6909                     }
6910                 }
6911                 reg_off += 1 << esz;
6912                 mem_off += N << msz;
6913             } while (reg_off & 63);
6914         } while (reg_off <= reg_last);
6915         return;
6916     }
6917 
6918     mem_off = info.mem_off_first[0];
6919     reg_off = info.reg_off_first[0];
6920     reg_last = info.reg_off_last[0];
6921     host = info.page[0].host;
6922 
6923     set_helper_retaddr(retaddr);
6924 
6925     while (reg_off <= reg_last) {
6926         uint64_t pg = vg[reg_off >> 6];
6927         do {
6928             if ((pg >> (reg_off & 63)) & 1) {
6929                 for (i = 0; i < N; ++i) {
6930                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6931                             host + mem_off + (i << msz));
6932                 }
6933             }
6934             reg_off += 1 << esz;
6935             mem_off += N << msz;
6936         } while (reg_off <= reg_last && (reg_off & 63));
6937     }
6938 
6939     clear_helper_retaddr();
6940 
6941     /*
6942      * Use the slow path to manage the cross-page misalignment.
6943      * But we know this is RAM and cannot trap.
6944      */
6945     mem_off = info.mem_off_split;
6946     if (unlikely(mem_off >= 0)) {
6947         reg_off = info.reg_off_split;
6948         for (i = 0; i < N; ++i) {
6949             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6950                    addr + mem_off + (i << msz), retaddr);
6951         }
6952     }
6953 
6954     mem_off = info.mem_off_first[1];
6955     if (unlikely(mem_off >= 0)) {
6956         reg_off = info.reg_off_first[1];
6957         reg_last = info.reg_off_last[1];
6958         host = info.page[1].host;
6959 
6960         set_helper_retaddr(retaddr);
6961 
6962         do {
6963             uint64_t pg = vg[reg_off >> 6];
6964             do {
6965                 if ((pg >> (reg_off & 63)) & 1) {
6966                     for (i = 0; i < N; ++i) {
6967                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6968                                 host + mem_off + (i << msz));
6969                     }
6970                 }
6971                 reg_off += 1 << esz;
6972                 mem_off += N << msz;
6973             } while (reg_off & 63);
6974         } while (reg_off <= reg_last);
6975 
6976         clear_helper_retaddr();
6977     }
6978 }
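/*
 * Editor's note: unlike the first-fault loads above, sve_stN_r() needs no
 * scratch register -- every recognizable fault (invalid page, watchpoint,
 * MTE failure) is raised by the probe phase before any byte is written.
 * The only store that can still fail afterwards is an MMIO transaction,
 * which, as the comment in the function says, cannot be avoided.
 */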
6979 
6980 static inline QEMU_ALWAYS_INLINE
6981 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6982                    uint64_t desc, const uintptr_t ra,
6983                    const int esz, const int msz, const int N,
6984                    sve_ldst1_host_fn *host_fn,
6985                    sve_ldst1_tlb_fn *tlb_fn)
6986 {
6987     uint32_t mtedesc = desc >> 32;
6988     int bit55 = extract64(addr, 55, 1);
6989 
6990     /* Perform gross MTE suppression early. */
6991     if (!tbi_check(mtedesc, bit55) ||
6992         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6993         mtedesc = 0;
6994     }
6995 
6996     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6997 }
6998 
6999 #define DO_STN_1(N, NAME, ESZ)                                          \
7000 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
7001                                  target_ulong addr, uint64_t desc)      \
7002 {                                                                       \
7003     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
7004               sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
7005 }                                                                       \
7006 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
7007                                      target_ulong addr, uint64_t desc)  \
7008 {                                                                       \
7009     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
7010                   sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
7011 }
7012 
7013 #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
7014 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
7015                                     target_ulong addr, uint64_t desc)   \
7016 {                                                                       \
7017     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
7018               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
7019 }                                                                       \
7020 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
7021                                     target_ulong addr, uint64_t desc)   \
7022 {                                                                       \
7023     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
7024               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
7025 }                                                                       \
7026 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
7027                                         target_ulong addr, uint64_t desc) \
7028 {                                                                       \
7029     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
7030                   sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
7031 }                                                                       \
7032 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
7033                                         target_ulong addr, uint64_t desc) \
7034 {                                                                       \
7035     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
7036                   sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
7037 }
7038 
7039 DO_STN_1(1, bb, MO_8)
7040 DO_STN_1(1, bh, MO_16)
7041 DO_STN_1(1, bs, MO_32)
7042 DO_STN_1(1, bd, MO_64)
7043 DO_STN_1(2, bb, MO_8)
7044 DO_STN_1(3, bb, MO_8)
7045 DO_STN_1(4, bb, MO_8)
7046 
7047 DO_STN_2(1, hh, MO_16, MO_16)
7048 DO_STN_2(1, hs, MO_32, MO_16)
7049 DO_STN_2(1, hd, MO_64, MO_16)
7050 DO_STN_2(2, hh, MO_16, MO_16)
7051 DO_STN_2(3, hh, MO_16, MO_16)
7052 DO_STN_2(4, hh, MO_16, MO_16)
7053 
7054 DO_STN_2(1, ss, MO_32, MO_32)
7055 DO_STN_2(1, sd, MO_64, MO_32)
7056 DO_STN_2(2, ss, MO_32, MO_32)
7057 DO_STN_2(3, ss, MO_32, MO_32)
7058 DO_STN_2(4, ss, MO_32, MO_32)
7059 
7060 DO_STN_2(1, dd, MO_64, MO_64)
7061 DO_STN_2(2, dd, MO_64, MO_64)
7062 DO_STN_2(3, dd, MO_64, MO_64)
7063 DO_STN_2(4, dd, MO_64, MO_64)
7064 
7065 DO_STN_2(1, sq, MO_128, MO_32)
7066 DO_STN_2(1, dq, MO_128, MO_64)
7067 
7068 DO_STN_2(2, qq, MO_128, MO_128)
7069 DO_STN_2(3, qq, MO_128, MO_128)
7070 DO_STN_2(4, qq, MO_128, MO_128)
7071 
7072 #undef DO_STN_1
7073 #undef DO_STN_2
7074 
7075 /*
7076  * Loads with a vector index.
7077  */
7078 
7079 /*
7080  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
7081  */
7082 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
7083 
7084 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
7085 {
7086     return *(uint32_t *)(reg + H1_4(reg_ofs));
7087 }
7088 
7089 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
7090 {
7091     return *(int32_t *)(reg + H1_4(reg_ofs));
7092 }
7093 
7094 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
7095 {
7096     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
7097 }
7098 
7099 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
7100 {
7101     return (int32_t)*(uint64_t *)(reg + reg_ofs);
7102 }
7103 
7104 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
7105 {
7106     return *(uint64_t *)(reg + reg_ofs);
7107 }
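/*
 * Example (editor's note): for a gather load with 32-bit unsigned offsets
 * scaled by the element size (e.g. LD1W with a UXTW #2 index), the scale
 * (here 2) arrives via simd_data(desc) and the loops below compute
 *     addr = base + (off_zsu_s(vm, reg_off) << scale);
 * The _s/_d suffix selects 32-bit or 64-bit vector elements, and zsu/zss
 * selects zero- versus sign-extension of a 32-bit offset.
 */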
7108 
7109 static inline QEMU_ALWAYS_INLINE
7110 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7111                target_ulong base, uint32_t desc, uintptr_t retaddr,
7112                uint32_t mtedesc, int esize, int msize,
7113                zreg_off_fn *off_fn,
7114                sve_ldst1_host_fn *host_fn,
7115                sve_ldst1_tlb_fn *tlb_fn)
7116 {
7117     const int mmu_idx = arm_env_mmu_index(env);
7118     const intptr_t reg_max = simd_oprsz(desc);
7119     const int scale = simd_data(desc);
7120     ARMVectorReg scratch;
7121     intptr_t reg_off;
7122     SVEHostPage info, info2;
7123 
7124     memset(&scratch, 0, reg_max);
7125     reg_off = 0;
7126     do {
7127         uint64_t pg = vg[reg_off >> 6];
7128         do {
7129             if (likely(pg & 1)) {
7130                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7131                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7132 
7133                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
7134                                mmu_idx, retaddr);
7135 
7136                 if (likely(in_page >= msize)) {
7137                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
7138                         cpu_check_watchpoint(env_cpu(env), addr, msize,
7139                                              info.attrs, BP_MEM_READ, retaddr);
7140                     }
7141                     if (mtedesc && info.tagged) {
7142                         mte_check(env, mtedesc, addr, retaddr);
7143                     }
7144                     if (unlikely(info.flags & TLB_MMIO)) {
7145                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
7146                     } else {
7147                         set_helper_retaddr(retaddr);
7148                         host_fn(&scratch, reg_off, info.host);
7149                         clear_helper_retaddr();
7150                     }
7151                 } else {
7152                     /* Element crosses the page boundary. */
7153                     sve_probe_page(&info2, false, env, addr + in_page, 0,
7154                                    MMU_DATA_LOAD, mmu_idx, retaddr);
7155                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
7156                         cpu_check_watchpoint(env_cpu(env), addr,
7157                                              msize, info.attrs,
7158                                              BP_MEM_READ, retaddr);
7159                     }
7160                     if (mtedesc && info.tagged) {
7161                         mte_check(env, mtedesc, addr, retaddr);
7162                     }
7163                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
7164                 }
7165             }
7166             reg_off += esize;
7167             pg >>= esize;
7168         } while (reg_off & 63);
7169     } while (reg_off < reg_max);
7170 
7171     /* Wait until all exceptions have been raised to write back.  */
7172     memcpy(vd, &scratch, reg_max);
7173 }
7174 
7175 static inline QEMU_ALWAYS_INLINE
7176 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7177                    target_ulong base, uint64_t desc, uintptr_t retaddr,
7178                    int esize, int msize, zreg_off_fn *off_fn,
7179                    sve_ldst1_host_fn *host_fn,
7180                    sve_ldst1_tlb_fn *tlb_fn)
7181 {
7182     uint32_t mtedesc = desc >> 32;
7183 
7184     /*
7185      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7186      * offset base entirely over the address space hole to change the
7187      * pointer tag, or change the bit55 selector.  So we could examine
7188      * TBI + TCMA here, as we do for sve_ldN_r_mte().
7189      */
7190     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7191               esize, msize, off_fn, host_fn, tlb_fn);
7192 }
7193 
7194 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
7195 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
7196                                  void *vm, target_ulong base, uint64_t desc) \
7197 {                                                                            \
7198     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
7199               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
7200 }                                                                            \
7201 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7202      void *vm, target_ulong base, uint64_t desc)                             \
7203 {                                                                            \
7204     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
7205                   off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
7206 }
7207 
7208 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
7209 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
7210                                  void *vm, target_ulong base, uint64_t desc) \
7211 {                                                                            \
7212     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
7213               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
7214 }                                                                            \
7215 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7216     void *vm, target_ulong base, uint64_t desc)                              \
7217 {                                                                            \
7218     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
7219                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
7220 }
7221 
7222 #define DO_LD1_ZPZ_Q(MEM, OFS, MSZ)                                          \
7223 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
7224                                  void *vm, target_ulong base, uint64_t desc) \
7225 {                                                                            \
7226     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 16, 1 << MSZ,         \
7227               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
7228 }                                                                            \
7229 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7230     void *vm, target_ulong base, uint64_t desc)                              \
7231 {                                                                            \
7232     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 16, 1 << MSZ,        \
7233                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
7234 }
7235 
7236 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
7237 DO_LD1_ZPZ_S(bsu, zss, MO_8)
7238 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
7239 DO_LD1_ZPZ_D(bdu, zss, MO_8)
7240 DO_LD1_ZPZ_D(bdu, zd, MO_8)
7241 
7242 DO_LD1_ZPZ_S(bss, zsu, MO_8)
7243 DO_LD1_ZPZ_S(bss, zss, MO_8)
7244 DO_LD1_ZPZ_D(bds, zsu, MO_8)
7245 DO_LD1_ZPZ_D(bds, zss, MO_8)
7246 DO_LD1_ZPZ_D(bds, zd, MO_8)
7247 
7248 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
7249 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
7250 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
7251 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
7252 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
7253 
7254 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
7255 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
7256 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
7257 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
7258 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
7259 
7260 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
7261 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
7262 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
7263 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
7264 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
7265 
7266 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
7267 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
7268 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
7269 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
7270 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
7271 
7272 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
7273 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
7274 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
7275 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
7276 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
7277 
7278 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
7279 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
7280 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
7281 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
7282 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
7283 
7284 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
7285 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
7286 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
7287 
7288 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
7289 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
7290 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
7291 
7292 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
7293 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
7294 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
7295 
7296 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
7297 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
7298 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
7299 
7300 DO_LD1_ZPZ_Q(qq_le, zd, MO_128)
7301 DO_LD1_ZPZ_Q(qq_be, zd, MO_128)
7302 
7303 #undef DO_LD1_ZPZ_S
7304 #undef DO_LD1_ZPZ_D
7305 
7306 /* First fault loads with a vector index.  */
7307 
7308 /*
7309  * Common helpers for all gather first-faulting loads.
7310  */
7311 
7312 static inline QEMU_ALWAYS_INLINE
7313 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7314                  target_ulong base, uint32_t desc, uintptr_t retaddr,
7315                  uint32_t mtedesc, const int esz, const int msz,
7316                  zreg_off_fn *off_fn,
7317                  sve_ldst1_host_fn *host_fn,
7318                  sve_ldst1_tlb_fn *tlb_fn)
7319 {
7320     const int mmu_idx = arm_env_mmu_index(env);
7321     const intptr_t reg_max = simd_oprsz(desc);
7322     const int scale = simd_data(desc);
7323     const int esize = 1 << esz;
7324     const int msize = 1 << msz;
7325     intptr_t reg_off;
7326     SVEHostPage info;
7327     target_ulong addr, in_page;
7328     ARMVectorReg scratch;
7329 
7330     /* Skip to the first true predicate.  */
7331     reg_off = find_next_active(vg, 0, reg_max, esz);
7332     if (unlikely(reg_off >= reg_max)) {
7333         /* The entire predicate was false; no load occurs.  */
7334         memset(vd, 0, reg_max);
7335         return;
7336     }
7337 
7338     /* Protect against overlap between vd and vm. */
7339     if (unlikely(vd == vm)) {
7340         vm = memcpy(&scratch, vm, reg_max);
7341     }
7342 
7343     /*
7344      * Probe the first element, allowing faults.
7345      */
7346     addr = base + (off_fn(vm, reg_off) << scale);
7347     if (mtedesc) {
7348         mte_check(env, mtedesc, addr, retaddr);
7349     }
7350     tlb_fn(env, vd, reg_off, addr, retaddr);
7351 
7352     /* After any fault, zero the other elements. */
7353     swap_memzero(vd, reg_off);
7354     reg_off += esize;
7355     swap_memzero(vd + reg_off, reg_max - reg_off);
7356 
7357     /*
7358      * Probe the remaining elements, not allowing faults.
7359      */
7360     while (reg_off < reg_max) {
7361         uint64_t pg = vg[reg_off >> 6];
7362         do {
7363             if (likely((pg >> (reg_off & 63)) & 1)) {
7364                 addr = base + (off_fn(vm, reg_off) << scale);
7365                 in_page = -(addr | TARGET_PAGE_MASK);
7366 
7367                 if (unlikely(in_page < msize)) {
7368                     /* Stop if the element crosses a page boundary. */
7369                     goto fault;
7370                 }
7371 
7372                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
7373                                mmu_idx, retaddr);
7374                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
7375                     goto fault;
7376                 }
7377                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
7378                     (cpu_watchpoint_address_matches
7379                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
7380                     goto fault;
7381                 }
7382                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
7383                     goto fault;
7384                 }
7385 
7386                 set_helper_retaddr(retaddr);
7387                 host_fn(vd, reg_off, info.host);
7388                 clear_helper_retaddr();
7389             }
7390             reg_off += esize;
7391         } while (reg_off & 63);
7392     }
7393     return;
7394 
7395  fault:
7396     record_fault(env, reg_off, reg_max);
7397 }
7398 
7399 static inline QEMU_ALWAYS_INLINE
7400 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7401                      target_ulong base, uint64_t desc, uintptr_t retaddr,
7402                      const int esz, const int msz,
7403                      zreg_off_fn *off_fn,
7404                      sve_ldst1_host_fn *host_fn,
7405                      sve_ldst1_tlb_fn *tlb_fn)
7406 {
7407     uint32_t mtedesc = desc >> 32;
7408 
7409     /*
7410      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7411      * offset base entirely over the address space hole to change the
7412      * pointer tag, or change the bit55 selector.  So we could examine
7413      * TBI + TCMA here, as we do for sve_ldN_r_mte().
7414      */
7415     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7416                 esz, msz, off_fn, host_fn, tlb_fn);
7417 }
7418 
7419 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
7420 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7421     (CPUARMState *env, void *vd, void *vg,                              \
7422      void *vm, target_ulong base, uint64_t desc)                        \
7423 {                                                                       \
7424     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
7425                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7426 }                                                                       \
7427 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7428     (CPUARMState *env, void *vd, void *vg,                              \
7429      void *vm, target_ulong base, uint64_t desc)                        \
7430 {                                                                       \
7431     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
7432                     off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7433 }
7434 
7435 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
7436 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7437     (CPUARMState *env, void *vd, void *vg,                              \
7438      void *vm, target_ulong base, uint64_t desc)                        \
7439 {                                                                       \
7440     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
7441                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7442 }                                                                       \
7443 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7444     (CPUARMState *env, void *vd, void *vg,                              \
7445      void *vm, target_ulong base, uint64_t desc)                        \
7446 {                                                                       \
7447     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
7448                     off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7449 }
7450 
7451 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7452 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7453 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7454 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7455 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7456 
7457 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7458 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7459 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7460 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7461 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7462 
7463 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7464 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7465 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7466 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7467 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7468 
7469 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7470 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7471 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7472 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7473 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7474 
7475 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7476 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7477 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7478 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7479 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7480 
7481 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7482 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7483 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7484 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7485 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7486 
7487 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
7488 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
7489 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7490 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7491 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7492 
7493 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
7494 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
7495 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7496 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7497 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7498 
7499 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7500 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7501 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7502 
7503 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7504 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7505 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7506 
7507 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7508 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7509 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7510 
7511 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7512 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7513 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7514 
7515 /* Stores with a vector index.  */
7516 
7517 static inline QEMU_ALWAYS_INLINE
7518 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7519                target_ulong base, uint32_t desc, uintptr_t retaddr,
7520                uint32_t mtedesc, int esize, int msize,
7521                zreg_off_fn *off_fn,
7522                sve_ldst1_host_fn *host_fn,
7523                sve_ldst1_tlb_fn *tlb_fn)
7524 {
7525     const int mmu_idx = arm_env_mmu_index(env);
7526     const intptr_t reg_max = simd_oprsz(desc);
7527     const int scale = simd_data(desc);
7528     void *host[ARM_MAX_VQ * 4];
7529     intptr_t reg_off, i;
7530     SVEHostPage info, info2;
7531 
7532     /*
7533      * Probe all of the elements for host addresses and flags.
7534      */
7535     i = reg_off = 0;
7536     do {
7537         uint64_t pg = vg[reg_off >> 6];
7538         do {
7539             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7540             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7541 
7542             host[i] = NULL;
7543             if (likely((pg >> (reg_off & 63)) & 1)) {
7544                 if (likely(in_page >= msize)) {
7545                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7546                                    mmu_idx, retaddr);
7547                     if (!(info.flags & TLB_MMIO)) {
7548                         host[i] = info.host;
7549                     }
7550                 } else {
7551                     /*
7552                      * Element crosses the page boundary.
7553                      * Probe both pages, but do not record the host address,
7554                      * so that we use the slow path.
7555                      */
7556                     sve_probe_page(&info, false, env, addr, 0,
7557                                    MMU_DATA_STORE, mmu_idx, retaddr);
7558                     sve_probe_page(&info2, false, env, addr + in_page, 0,
7559                                    MMU_DATA_STORE, mmu_idx, retaddr);
7560                     info.flags |= info2.flags;
7561                 }
7562 
7563                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7564                     cpu_check_watchpoint(env_cpu(env), addr, msize,
7565                                          info.attrs, BP_MEM_WRITE, retaddr);
7566                 }
7567 
7568                 if (mtedesc && info.tagged) {
7569                     mte_check(env, mtedesc, addr, retaddr);
7570                 }
7571             }
7572             i += 1;
7573             reg_off += esize;
7574         } while (reg_off & 63);
7575     } while (reg_off < reg_max);
7576 
7577     /*
7578      * Now that we have recognized all exceptions except SyncExternal
7579      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7580      *
7581      * Note for the common case of an element in RAM, not crossing a page
7582      * boundary, we have stored the host address in host[].  This doubles
7583      * as a first-level check against the predicate, since only enabled
7584      * elements have non-null host addresses.
7585      */
7586     i = reg_off = 0;
7587     do {
7588         void *h = host[i];
7589         if (likely(h != NULL)) {
7590             set_helper_retaddr(retaddr);
7591             host_fn(vd, reg_off, h);
7592             clear_helper_retaddr();
7593         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7594             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7595             tlb_fn(env, vd, reg_off, addr, retaddr);
7596         }
7597         i += 1;
7598         reg_off += esize;
7599     } while (reg_off < reg_max);
7600 }
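/*
 * Editor's note on the scatter store above: it is written as two passes.
 * Pass one probes every active element, raising any page, watchpoint or
 * MTE fault, and records a host pointer for the simple in-page RAM case.
 * Pass two performs the stores; a NULL entry in host[] doubles as "element
 * inactive or needs the slow (cross-page / MMIO) path".
 */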
7601 
7602 static inline QEMU_ALWAYS_INLINE
7603 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7604                    target_ulong base, uint64_t desc, uintptr_t retaddr,
7605                    int esize, int msize, zreg_off_fn *off_fn,
7606                    sve_ldst1_host_fn *host_fn,
7607                    sve_ldst1_tlb_fn *tlb_fn)
7608 {
7609     uint32_t mtedesc = desc >> 32;
7610 
7611     /*
7612      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7613      * offset base entirely over the address space hole to change the
7614      * pointer tag, or change the bit55 selector.  So we could examine
7615      * TBI + TCMA here, as we do for sve_ldN_r_mte().
7616      */
7617     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7618               esize, msize, off_fn, host_fn, tlb_fn);
7619 }
7620 
7621 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7622 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7623                                  void *vm, target_ulong base, uint64_t desc) \
7624 {                                                                       \
7625     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7626               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7627 }                                                                       \
7628 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7629     void *vm, target_ulong base, uint64_t desc)                         \
7630 {                                                                       \
7631     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7632                   off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7633 }
7634 
7635 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7636 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7637                                  void *vm, target_ulong base, uint64_t desc) \
7638 {                                                                       \
7639     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7640               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7641 }                                                                       \
7642 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7643     void *vm, target_ulong base, uint64_t desc)                         \
7644 {                                                                       \
7645     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7646                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7647 }
7648 
7649 #define DO_ST1_ZPZ_Q(MEM, OFS, MSZ)                                     \
7650 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7651                                  void *vm, target_ulong base, uint64_t desc) \
7652 {                                                                       \
7653     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 16, 1 << MSZ,    \
7654               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7655 }                                                                       \
7656 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7657     void *vm, target_ulong base, uint64_t desc)                         \
7658 {                                                                       \
7659     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 16, 1 << MSZ,   \
7660                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7661 }
7662 
7663 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7664 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7665 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7666 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7667 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7668 
7669 DO_ST1_ZPZ_S(bs, zss, MO_8)
7670 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7671 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7672 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7673 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7674 
7675 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7676 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7677 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7678 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7679 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7680 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7681 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7682 
7683 DO_ST1_ZPZ_D(bd, zss, MO_8)
7684 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7685 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7686 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7687 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7688 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7689 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7690 
7691 DO_ST1_ZPZ_D(bd, zd, MO_8)
7692 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7693 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7694 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7695 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7696 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7697 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7698 
7699 DO_ST1_ZPZ_Q(qq_le, zd, MO_128)
7700 DO_ST1_ZPZ_Q(qq_be, zd, MO_128)
7701 
7702 #undef DO_ST1_ZPZ_S
7703 #undef DO_ST1_ZPZ_D
7704 
7705 /*
7706  * SVE2.1 consecutive register load/store
7707  */
7708 
7709 static unsigned sve2p1_cont_ldst_elements(SVEContLdSt *info, vaddr addr,
7710                                           uint32_t png, intptr_t reg_max,
7711                                           int N, int v_esz)
7712 {
7713     const int esize = 1 << v_esz;
7714     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
7715     DecodeCounter p = decode_counter(png, reg_max, v_esz);
7716     unsigned b_count = p.count << v_esz;
7717     unsigned b_stride = 1 << (v_esz + p.lg2_stride);
7718     intptr_t page_split;
7719 
7720     /* Set all of the element indices to -1, and the TLB data to 0. */
7721     memset(info, -1, offsetof(SVEContLdSt, page));
7722     memset(info->page, 0, sizeof(info->page));
7723 
7724     if (p.invert) {
7725         if (b_count >= reg_max * N) {
7726             return 0;
7727         }
7728         reg_off_first = b_count;
7729         reg_off_last = reg_max * N - b_stride;
7730     } else {
7731         if (b_count == 0) {
7732             return 0;
7733         }
7734         reg_off_first = 0;
7735         reg_off_last = MIN(b_count - esize, reg_max * N - b_stride);
7736     }
7737 
7738     info->reg_off_first[0] = reg_off_first;
7739     info->mem_off_first[0] = reg_off_first;
7740 
7741     page_split = -(addr | TARGET_PAGE_MASK);
7742     if (reg_off_last + esize <= page_split || reg_off_first >= page_split) {
7743         /* The entire operation fits within a single page. */
7744         info->reg_off_last[0] = reg_off_last;
7745         return b_stride;
7746     }
7747 
7748     info->page_split = page_split;
7749     reg_off_split = ROUND_DOWN(page_split, esize);
7750 
7751     /*
7752      * This is the last full element on the first page, but it is not
7753      * necessarily active.  If there is no full element, i.e. the first
7754      * active element is the one that's split, this value remains -1.
7755      * It is useful as iteration bounds.
7756      */
7757     if (reg_off_split != 0) {
7758         info->reg_off_last[0] = ROUND_DOWN(reg_off_split - esize, b_stride);
7759     }
7760 
7761     /* Determine if an unaligned element spans the pages.  */
7762     if (page_split & (esize - 1)) {
7763         /* It is helpful to know if the split element is active. */
7764         if ((reg_off_split & (b_stride - 1)) == 0) {
7765             info->reg_off_split = reg_off_split;
7766             info->mem_off_split = reg_off_split;
7767         }
7768         reg_off_split += esize;
7769     }
7770 
7771     /*
7772      * We do want the first active element on the second page, because
7773      * this may affect the address reported in an exception.
7774      */
7775     reg_off_split = ROUND_UP(reg_off_split, b_stride);
7776     if (reg_off_split <= reg_off_last) {
7777         info->reg_off_first[1] = reg_off_split;
7778         info->mem_off_first[1] = reg_off_split;
7779         info->reg_off_last[1] = reg_off_last;
7780     }
7781     return b_stride;
7782 }
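/*
 * Editor's note (assumption about the predicate-as-counter encoding):
 * decode_counter() is taken to describe either "the first count elements
 * are active" or, with the invert flag, "the first count elements are
 * inactive and the rest active".  The byte offsets computed above are the
 * direct translation of those two shapes, clamped to reg_max * N bytes.
 */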
7783 
7784 static void sve2p1_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
7785                                          target_ulong addr, unsigned estride,
7786                                          int esize, int wp_access, uintptr_t ra)
7787 {
7788 #ifndef CONFIG_USER_ONLY
7789     intptr_t count_off, count_last;
7790     int flags0 = info->page[0].flags;
7791     int flags1 = info->page[1].flags;
7792 
7793     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
7794         return;
7795     }
7796 
7797     /* Indicate that watchpoints are handled. */
7798     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
7799     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
7800 
7801     if (flags0 & TLB_WATCHPOINT) {
7802         count_off = info->reg_off_first[0];
7803         count_last = info->reg_off_split;
7804         if (count_last < 0) {
7805             count_last = info->reg_off_last[0];
7806         }
7807         do {
7808             cpu_check_watchpoint(env_cpu(env), addr + count_off,
7809                                  esize, info->page[0].attrs, wp_access, ra);
7810             count_off += estride;
7811         } while (count_off <= count_last);
7812     }
7813 
7814     count_off = info->reg_off_first[1];
7815     if ((flags1 & TLB_WATCHPOINT) && count_off >= 0) {
7816         count_last = info->reg_off_last[1];
7817         do {
7818             cpu_check_watchpoint(env_cpu(env), addr + count_off,
7819                                  esize, info->page[1].attrs,
7820                                  wp_access, ra);
7821             count_off += estride;
7822         } while (count_off <= count_last);
7823     }
7824 #endif
7825 }
7826 
7827 static void sve2p1_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
7828                                        target_ulong addr, unsigned estride,
7829                                        int esize, uint32_t mtedesc,
7830                                        uintptr_t ra)
7831 {
7832     intptr_t count_off, count_last;
7833 
7834     /*
7835      * TODO: estride is always a small power of two, <= 8.
7836      * Manipulate the stride within the loops such that
7837      *   - first iteration hits addr + off, as required,
7838      *   - second iteration hits ALIGN_UP(addr, 16),
7839      *   - other iterations advance addr by 16.
7840      * This will minimize the probing to once per MTE granule.
7841      */
7842 
7843     /* Process the page only if MemAttr == Tagged. */
7844     if (info->page[0].tagged) {
7845         count_off = info->reg_off_first[0];
7846         count_last = info->reg_off_split;
7847         if (count_last < 0) {
7848             count_last = info->reg_off_last[0];
7849         }
7850 
7851         do {
7852             mte_check(env, mtedesc, addr + count_off, ra);
7853             count_off += estride;
7854         } while (count_off <= count_last);
7855     }
7856 
7857     count_off = info->reg_off_first[1];
7858     if (count_off >= 0 && info->page[1].tagged) {
7859         count_last = info->reg_off_last[1];
7860         do {
7861             mte_check(env, mtedesc, addr + count_off, ra);
7862             count_off += estride;
7863         } while (count_off <= count_last);
7864     }
7865 }
7866 
7867 static inline QEMU_ALWAYS_INLINE
7868 void sve2p1_ld1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr,
7869                   uint32_t png, uint64_t desc64,
7870                   const uintptr_t ra, const MemOp esz,
7871                   sve_ldst1_host_fn *host_fn,
7872                   sve_ldst1_tlb_fn *tlb_fn)
7873 {
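         /*
          * desc64 packs the MTE descriptor in its high 32 bits and the usual
          * simd descriptor in its low 32 bits.  Within the simd data, bit 0
          * selects a group of 2 vs 4 registers and the next two bits give
          * log2 of the spacing between consecutive destination registers,
          * as decoded below.
          */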
7874     uint32_t mtedesc = desc64 >> 32;
7875     uint32_t desc = desc64;
7876     const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2;
7877     const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4);
7878     const intptr_t reg_max = simd_oprsz(desc);
7879     const unsigned esize = 1 << esz;
7880     intptr_t count_off, count_last;
7881     intptr_t reg_off, reg_last, reg_n;
7882     SVEContLdSt info;
7883     unsigned estride, flags;
7884     void *host;
7885 
7886     estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz);
7887     if (estride == 0) {
7888         /* The entire predicate was false; no load occurs.  */
7889         for (unsigned n = 0; n < N; n++) {
7890             memset(zd + n * rstride, 0, reg_max);
7891         }
7892         return;
7893     }
7894 
7895     /* Probe the page(s).  Exit with exception for any invalid page. */
7896     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
7897 
7898     /* Handle watchpoints for all active elements. */
7899     sve2p1_cont_ldst_watchpoints(&info, env, addr, estride,
7900                                  esize, BP_MEM_READ, ra);
7901 
7902     /*
7903      * Handle mte checks for all active elements.
7904      * Since TBI must be set for MTE, !mtedesc => !mte_active.
7905      */
7906     if (mtedesc) {
7907         sve2p1_cont_ldst_mte_check(&info, env, addr, estride,
7908                                    esize, mtedesc, ra);
7909     }
7910 
7911     flags = info.page[0].flags | info.page[1].flags;
7912     if (unlikely(flags != 0)) {
7913         /*
7914          * At least one page includes MMIO.
7915          * Any bus operation can fail with cpu_transaction_failed,
7916          * which for ARM will raise SyncExternal.  Perform the load
7917          * into scratch memory to preserve register state until the end.
7918          */
7919         ARMVectorReg scratch[4] = { };
7920 
7921         count_off = info.reg_off_first[0];
7922         count_last = info.reg_off_last[1];
7923         if (count_last < 0) {
7924             count_last = info.reg_off_split;
7925             if (count_last < 0) {
7926                 count_last = info.reg_off_last[0];
7927             }
7928         }
7929         reg_off = count_off % reg_max;
7930         reg_n = count_off / reg_max;
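             /*
              * count_off walks the flattened byte stream of all N registers;
              * reg_n selects the destination register within the group and
              * reg_off the byte offset within that register.
              */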
7931 
7932         do {
7933             reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
7934             do {
7935                 tlb_fn(env, &scratch[reg_n], reg_off, addr + count_off, ra);
7936                 reg_off += estride;
7937                 count_off += estride;
7938             } while (reg_off <= reg_last);
7939             reg_off = 0;
7940             reg_n++;
7941         } while (count_off <= count_last);
7942 
7943         for (unsigned n = 0; n < N; ++n) {
7944             memcpy(&zd[n * rstride], &scratch[n], reg_max);
7945         }
7946         return;
7947     }
7948 
7949     /* The entire operation is in RAM, on valid pages. */
7950 
7951     for (unsigned n = 0; n < N; ++n) {
7952         memset(&zd[n * rstride], 0, reg_max);
7953     }
7954 
7955     count_off = info.reg_off_first[0];
7956     count_last = info.reg_off_last[0];
7957     reg_off = count_off % reg_max;
7958     reg_n = count_off / reg_max;
7959     host = info.page[0].host;
7960 
7961     set_helper_retaddr(ra);
7962 
7963     do {
7964         reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
7965         do {
7966             host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
7967             reg_off += estride;
7968             count_off += estride;
7969         } while (reg_off <= reg_last);
7970         reg_off = 0;
7971         reg_n++;
7972     } while (count_off <= count_last);
7973 
7974     clear_helper_retaddr();
7975 
7976     /*
7977      * Use the slow path to manage the cross-page misalignment.
7978      * But we know this is RAM and cannot trap.
7979      */
7980     count_off = info.reg_off_split;
7981     if (unlikely(count_off >= 0)) {
7982         reg_off = count_off % reg_max;
7983         reg_n = count_off / reg_max;
7984         tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
7985     }
7986 
7987     count_off = info.reg_off_first[1];
7988     if (unlikely(count_off >= 0)) {
7989         count_last = info.reg_off_last[1];
7990         reg_off = count_off % reg_max;
7991         reg_n = count_off / reg_max;
7992         host = info.page[1].host;
7993 
7994         set_helper_retaddr(ra);
7995 
7996         do {
7997             reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
7998             do {
7999                 host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
8000                 reg_off += estride;
8001                 count_off += estride;
8002             } while (reg_off <= reg_last);
8003             reg_off = 0;
8004             reg_n++;
8005         } while (count_off <= count_last);
8006 
8007         clear_helper_retaddr();
8008     }
8009 }
8010 
8011 void HELPER(sve2p1_ld1bb_c)(CPUARMState *env, void *vd, target_ulong addr,
8012                             uint32_t png, uint64_t desc)
8013 {
8014     sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), MO_8,
8015                  sve_ld1bb_host, sve_ld1bb_tlb);
8016 }
8017 
8018 #define DO_LD1_2(NAME, ESZ)                                             \
8019 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd,           \
8020                                   target_ulong addr, uint32_t png,      \
8021                                   uint64_t desc)                        \
8022 {                                                                       \
8023     sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ,                \
8024                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);            \
8025 }                                                                       \
8026 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd,           \
8027                                   target_ulong addr, uint32_t png,      \
8028                                   uint64_t desc)                        \
8029 {                                                                       \
8030     sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ,                \
8031                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);            \
8032 }
8033 
8034 DO_LD1_2(ld1hh, MO_16)
8035 DO_LD1_2(ld1ss, MO_32)
8036 DO_LD1_2(ld1dd, MO_64)
8037 
8038 #undef DO_LD1_2
8039 
8040 static inline QEMU_ALWAYS_INLINE
8041 void sve2p1_st1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr,
8042                   uint32_t png, uint64_t desc64,
8043                   const uintptr_t ra, const int esz,
8044                   sve_ldst1_host_fn *host_fn,
8045                   sve_ldst1_tlb_fn *tlb_fn)
8046 {
8047     uint32_t mtedesc = desc64 >> 32;
8048     uint32_t desc = desc64;
8049     const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2;
8050     const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4);
8051     const intptr_t reg_max = simd_oprsz(desc);
8052     const unsigned esize = 1 << esz;
8053     intptr_t count_off, count_last;
8054     intptr_t reg_off, reg_last, reg_n;
8055     SVEContLdSt info;
8056     unsigned estride, flags;
8057     void *host;
8058 
8059     estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz);
8060     if (estride == 0) {
8061         /* The entire predicate was false; no store occurs.  */
8062         return;
8063     }
8064 
8065     /* Probe the page(s).  Exit with exception for any invalid page. */
8066     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
8067 
8068     /* Handle watchpoints for all active elements. */
8069     sve2p1_cont_ldst_watchpoints(&info, env, addr, estride,
8070                                  esize, BP_MEM_WRITE, ra);
8071 
8072     /*
8073      * Handle mte checks for all active elements.
8074      * Since TBI must be set for MTE, !mtedesc => !mte_active.
8075      */
8076     if (mtedesc) {
8077         sve2p1_cont_ldst_mte_check(&info, env, addr, estride,
8078                                    esize, mtedesc, ra);
8079     }
8080 
8081     flags = info.page[0].flags | info.page[1].flags;
8082     if (unlikely(flags != 0)) {
8083         /*
8084          * At least one page includes MMIO.
8085          * Any bus operation can fail with cpu_transaction_failed,
8086          * which for ARM will raise SyncExternal.  We cannot avoid
8087          * this fault and will leave with the store incomplete.
8088          */
8089         count_off = info.reg_off_first[0];
8090         count_last = info.reg_off_last[1];
8091         if (count_last < 0) {
8092             count_last = info.reg_off_split;
8093             if (count_last < 0) {
8094                 count_last = info.reg_off_last[0];
8095             }
8096         }
8097         reg_off = count_off % reg_max;
8098         reg_n = count_off / reg_max;
8099 
8100         do {
8101             reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
8102             do {
8103                 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
8104                 reg_off += estride;
8105                 count_off += estride;
8106             } while (reg_off <= reg_last);
8107             reg_off = 0;
8108             reg_n++;
8109         } while (count_off <= count_last);
8110         return;
8111     }
8112 
8113     /* The entire operation is in RAM, on valid pages. */
8114 
8115     count_off = info.reg_off_first[0];
8116     count_last = info.reg_off_last[0];
8117     reg_off = count_off % reg_max;
8118     reg_n = count_off / reg_max;
8119     host = info.page[0].host;
8120 
8121     set_helper_retaddr(ra);
8122 
8123     do {
8124         reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
8125         do {
8126             host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
8127             reg_off += estride;
8128             count_off += estride;
8129         } while (reg_off <= reg_last);
8130         reg_off = 0;
8131         reg_n++;
8132     } while (count_off <= count_last);
8133 
8134     clear_helper_retaddr();
8135 
8136     /*
8137      * Use the slow path to manage the cross-page misalignment.
8138      * But we know this is RAM and cannot trap.
8139      */
8140     count_off = info.reg_off_split;
8141     if (unlikely(count_off >= 0)) {
8142         reg_off = count_off % reg_max;
8143         reg_n = count_off / reg_max;
8144         tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
8145     }
8146 
8147     count_off = info.reg_off_first[1];
8148     if (unlikely(count_off >= 0)) {
8149         count_last = info.reg_off_last[1];
8150         reg_off = count_off % reg_max;
8151         reg_n = count_off / reg_max;
8152         host = info.page[1].host;
8153 
8154         set_helper_retaddr(ra);
8155 
8156         do {
8157             reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
8158             do {
8159                 host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
8160                 reg_off += estride;
8161                 count_off += estride;
8162             } while (reg_off <= reg_last);
8163             reg_off = 0;
8164             reg_n++;
8165         } while (count_off <= count_last);
8166 
8167         clear_helper_retaddr();
8168     }
8169 }
8170 
8171 void HELPER(sve2p1_st1bb_c)(CPUARMState *env, void *vd, target_ulong addr,
8172                            uint32_t png, uint64_t desc)
8173 {
8174     sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), MO_8,
8175                  sve_st1bb_host, sve_st1bb_tlb);
8176 }
8177 
8178 #define DO_ST1_2(NAME, ESZ)                                             \
8179 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd,           \
8180                                   target_ulong addr, uint32_t png,      \
8181                                   uint64_t desc)                        \
8182 {                                                                       \
8183     sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ,                \
8184                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);            \
8185 }                                                                       \
8186 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd,           \
8187                                   target_ulong addr, uint32_t png,      \
8188                                   uint64_t desc)                        \
8189 {                                                                       \
8190     sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ,                \
8191                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);            \
8192 }
8193 
8194 DO_ST1_2(st1hh, MO_16)
8195 DO_ST1_2(st1ss, MO_32)
8196 DO_ST1_2(st1dd, MO_64)
8197 
8198 #undef DO_ST1_2
8199 
8200 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8201 {
8202     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8203     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8204 
8205     for (i = 0; i < opr_sz; ++i) {
8206         d[i] = n[i] ^ m[i] ^ k[i];
8207     }
8208 }
8209 
8210 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8211 {
8212     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8213     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8214 
8215     for (i = 0; i < opr_sz; ++i) {
8216         d[i] = n[i] ^ (m[i] & ~k[i]);
8217     }
8218 }
8219 
8220 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8221 {
8222     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8223     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8224 
8225     for (i = 0; i < opr_sz; ++i) {
8226         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
8227     }
8228 }
8229 
8230 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8231 {
8232     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8233     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8234 
8235     for (i = 0; i < opr_sz; ++i) {
8236         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
8237     }
8238 }
8239 
8240 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8241 {
8242     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8243     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8244 
8245     for (i = 0; i < opr_sz; ++i) {
8246         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
8247     }
8248 }
8249 
8250 /*
8251  * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
8252  * See hasless(v,1) from
8253  *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
8254  */
8255 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
8256 {
8257     int bits = 8 << esz;
8258     uint64_t ones = dup_const(esz, 1);
8259     uint64_t signs = ones << (bits - 1);
8260     uint64_t cmp0, cmp1;
8261 
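         /*
          * After xor-ing with the replicated value of n, a lane compares
          * equal iff it is now zero.  (x - ones) & ~x & signs is nonzero
          * iff some lane of x is zero, so the result below is a pure
          * true/false test rather than a per-lane mask.
          */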
8262     cmp1 = dup_const(esz, n);
8263     cmp0 = cmp1 ^ m0;
8264     cmp1 = cmp1 ^ m1;
8265     cmp0 = (cmp0 - ones) & ~cmp0;
8266     cmp1 = (cmp1 - ones) & ~cmp1;
8267     return (cmp0 | cmp1) & signs;
8268 }
8269 
8270 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
8271                                 uint32_t desc, int esz, bool nmatch)
8272 {
8273     uint16_t esz_mask = pred_esz_masks[esz];
8274     intptr_t opr_sz = simd_oprsz(desc);
8275     uint32_t flags = PREDTEST_INIT;
8276     intptr_t i, j, k;
8277 
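         /*
          * Process the vectors in 16-byte segments, driven by the 16-bit
          * predicate chunk that governs them: each active element of vn is
          * searched for within the corresponding 16-byte segment of vm.
          */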
8278     for (i = 0; i < opr_sz; i += 16) {
8279         uint64_t m0 = *(uint64_t *)(vm + i);
8280         uint64_t m1 = *(uint64_t *)(vm + i + 8);
8281         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
8282         uint16_t out = 0;
8283 
8284         for (j = 0; j < 16; j += 8) {
8285             uint64_t n = *(uint64_t *)(vn + i + j);
8286 
8287             for (k = 0; k < 8; k += 1 << esz) {
8288                 if (pg & (1 << (j + k))) {
8289                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
8290                     out |= (o ^ nmatch) << (j + k);
8291                 }
8292             }
8293         }
8294         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
8295         flags = iter_predtest_fwd(out, pg, flags);
8296     }
8297     return flags;
8298 }
8299 
8300 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
8301 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
8302 {                                                                             \
8303     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
8304 }
8305 
8306 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
8307 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
8308 
8309 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
8310 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
8311 
8312 #undef DO_PPZZ_MATCH
8313 
8314 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
8315                             uint32_t desc)
8316 {
8317     ARMVectorReg scratch;
8318     intptr_t i, j;
8319     intptr_t opr_sz = simd_oprsz(desc);
8320     uint32_t *d = vd, *n = vn, *m = vm;
8321     uint8_t *pg = vg;
8322 
8323     if (d == n) {
8324         n = memcpy(&scratch, n, opr_sz);
8325         if (d == m) {
8326             m = n;
8327         }
8328     } else if (d == m) {
8329         m = memcpy(&scratch, m, opr_sz);
8330     }
8331 
8332     for (i = 0; i < opr_sz; i += 4) {
8333         uint64_t count = 0;
8334         uint8_t pred;
8335 
8336         pred = pg[H1(i >> 3)] >> (i & 7);
8337         if (pred & 1) {
8338             uint32_t nn = n[H4(i >> 2)];
8339 
8340             for (j = 0; j <= i; j += 4) {
8341                 pred = pg[H1(j >> 3)] >> (j & 7);
8342                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
8343                     ++count;
8344                 }
8345             }
8346         }
8347         d[H4(i >> 2)] = count;
8348     }
8349 }
8350 
8351 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
8352                             uint32_t desc)
8353 {
8354     ARMVectorReg scratch;
8355     intptr_t i, j;
8356     intptr_t opr_sz = simd_oprsz(desc);
8357     uint64_t *d = vd, *n = vn, *m = vm;
8358     uint8_t *pg = vg;
8359 
8360     if (d == n) {
8361         n = memcpy(&scratch, n, opr_sz);
8362         if (d == m) {
8363             m = n;
8364         }
8365     } else if (d == m) {
8366         m = memcpy(&scratch, m, opr_sz);
8367     }
8368 
8369     for (i = 0; i < opr_sz / 8; ++i) {
8370         uint64_t count = 0;
8371         if (pg[H1(i)] & 1) {
8372             uint64_t nn = n[i];
8373             for (j = 0; j <= i; ++j) {
8374                 if ((pg[H1(j)] & 1) && nn == m[j]) {
8375                     ++count;
8376                 }
8377             }
8378         }
8379         d[i] = count;
8380     }
8381 }
8382 
8383 /*
8384  * Returns the number of bytes in m0 and m1 that match n.
8385  * Unlike do_match2 we don't just need true/false, we need an exact count.
8386  * This requires two extra logical operations.
8387  */
8388 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
8389 {
8390     const uint64_t mask = dup_const(MO_8, 0x7f);
8391     uint64_t cmp0, cmp1;
8392 
8393     cmp1 = dup_const(MO_8, n);
8394     cmp0 = cmp1 ^ m0;
8395     cmp1 = cmp1 ^ m1;
8396 
8397     /*
8398      * 1: clear msb of each byte to avoid carry to next byte (& mask)
8399      * 2: carry in to msb if byte != 0 (+ mask)
8400      * 3: set msb if cmp has msb set (| cmp)
8401      * 4: set ~msb to ignore them (| mask)
8402      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
8403      * 5: invert, resulting in 0x80 if and only if byte == 0.
8404      */
8405     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
8406     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
8407 
8408     /*
8409      * Combine the two compares in a way that the bits do
8410      * not overlap, and so preserves the count of set bits.
8411      * If the host has an efficient instruction for ctpop,
8412      * then ctpop(x) + ctpop(y) has the same number of
8413      * operations as ctpop(x | (y >> 1)).  If the host does
8414      * not have an efficient ctpop, then we only want to
8415      * use it once.
8416      */
8417     return ctpop64(cmp0 | (cmp1 >> 1));
8418 }
8419 
8420 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
8421 {
8422     intptr_t i, j;
8423     intptr_t opr_sz = simd_oprsz(desc);
8424 
8425     for (i = 0; i < opr_sz; i += 16) {
8426         uint64_t n0 = *(uint64_t *)(vn + i);
8427         uint64_t m0 = *(uint64_t *)(vm + i);
8428         uint64_t n1 = *(uint64_t *)(vn + i + 8);
8429         uint64_t m1 = *(uint64_t *)(vm + i + 8);
8430         uint64_t out0 = 0;
8431         uint64_t out1 = 0;
8432 
8433         for (j = 0; j < 64; j += 8) {
8434             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
8435             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
8436             out0 |= cnt0 << j;
8437             out1 |= cnt1 << j;
8438         }
8439 
8440         *(uint64_t *)(vd + i) = out0;
8441         *(uint64_t *)(vd + i + 8) = out1;
8442     }
8443 }
8444 
8445 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
8446 {
8447     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8448     int shr = simd_data(desc);
8449     int shl = 8 - shr;
8450     uint64_t mask = dup_const(MO_8, 0xff >> shr);
8451     uint64_t *d = vd, *n = vn, *m = vm;
8452 
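         /*
          * Rotate each byte of n ^ m right by shr, processing all lanes of
          * a 64-bit word at once: mask keeps the bits that legitimately
          * come from the right shift, ~mask those from the compensating
          * left shift, so no bits cross a lane boundary.
          */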
8453     for (i = 0; i < opr_sz; ++i) {
8454         uint64_t t = n[i] ^ m[i];
8455         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
8456     }
8457 }
8458 
8459 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
8460 {
8461     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8462     int shr = simd_data(desc);
8463     int shl = 16 - shr;
8464     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
8465     uint64_t *d = vd, *n = vn, *m = vm;
8466 
8467     for (i = 0; i < opr_sz; ++i) {
8468         uint64_t t = n[i] ^ m[i];
8469         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
8470     }
8471 }
8472 
8473 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
8474 {
8475     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
8476     int shr = simd_data(desc);
8477     uint32_t *d = vd, *n = vn, *m = vm;
8478 
8479     for (i = 0; i < opr_sz; ++i) {
8480         d[i] = ror32(n[i] ^ m[i], shr);
8481     }
8482 }
8483 
8484 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
8485                      float_status *status, uint32_t desc)
8486 {
8487     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
8488 
8489     for (s = 0; s < opr_sz; ++s) {
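         /*
          * Each 128-bit segment holds a 2x2 matrix in row-major order.
          * The result is D = A + N * M^T, computed element by element with
          * the two products summed before the accumulation, as below.
          */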
8490         float32 *n = vn + s * sizeof(float32) * 4;
8491         float32 *m = vm + s * sizeof(float32) * 4;
8492         float32 *a = va + s * sizeof(float32) * 4;
8493         float32 *d = vd + s * sizeof(float32) * 4;
8494         float32 n00 = n[H4(0)], n01 = n[H4(1)];
8495         float32 n10 = n[H4(2)], n11 = n[H4(3)];
8496         float32 m00 = m[H4(0)], m01 = m[H4(1)];
8497         float32 m10 = m[H4(2)], m11 = m[H4(3)];
8498         float32 p0, p1;
8499 
8500         /* i = 0, j = 0 */
8501         p0 = float32_mul(n00, m00, status);
8502         p1 = float32_mul(n01, m01, status);
8503         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
8504 
8505         /* i = 0, j = 1 */
8506         p0 = float32_mul(n00, m10, status);
8507         p1 = float32_mul(n01, m11, status);
8508         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
8509 
8510         /* i = 1, j = 0 */
8511         p0 = float32_mul(n10, m00, status);
8512         p1 = float32_mul(n11, m01, status);
8513         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
8514 
8515         /* i = 1, j = 1 */
8516         p0 = float32_mul(n10, m10, status);
8517         p1 = float32_mul(n11, m11, status);
8518         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
8519     }
8520 }
8521 
8522 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
8523                      float_status *status, uint32_t desc)
8524 {
8525     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
8526 
8527     for (s = 0; s < opr_sz; ++s) {
8528         float64 *n = vn + s * sizeof(float64) * 4;
8529         float64 *m = vm + s * sizeof(float64) * 4;
8530         float64 *a = va + s * sizeof(float64) * 4;
8531         float64 *d = vd + s * sizeof(float64) * 4;
8532         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
8533         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
8534         float64 p0, p1;
8535 
8536         /* i = 0, j = 0 */
8537         p0 = float64_mul(n00, m00, status);
8538         p1 = float64_mul(n01, m01, status);
8539         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
8540 
8541         /* i = 0, j = 1 */
8542         p0 = float64_mul(n00, m10, status);
8543         p1 = float64_mul(n01, m11, status);
8544         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
8545 
8546         /* i = 1, j = 0 */
8547         p0 = float64_mul(n10, m00, status);
8548         p1 = float64_mul(n11, m01, status);
8549         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
8550 
8551         /* i = 1, j = 1 */
8552         p0 = float64_mul(n10, m10, status);
8553         p1 = float64_mul(n11, m11, status);
8554         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
8555     }
8556 }
8557 
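     /*
      * Narrowing conversions, "top" form: each active wide element is
      * converted and written to the high half of the corresponding
      * destination element; the low half of the destination is untouched.
      */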
8558 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
8559 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
8560                   float_status *status, uint32_t desc)                        \
8561 {                                                                             \
8562     intptr_t i = simd_oprsz(desc);                                            \
8563     uint64_t *g = vg;                                                         \
8564     do {                                                                      \
8565         uint64_t pg = g[(i - 1) >> 6];                                        \
8566         do {                                                                  \
8567             i -= sizeof(TYPEW);                                               \
8568             if (likely((pg >> (i & 63)) & 1)) {                               \
8569                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
8570                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
8571             }                                                                 \
8572         } while (i & 63);                                                     \
8573     } while (i != 0);                                                         \
8574 }
8575 
8576 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
8577 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
8578 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
8579 
8580 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
8581 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
8582                   float_status *status, uint32_t desc)                        \
8583 {                                                                             \
8584     intptr_t i = simd_oprsz(desc);                                            \
8585     uint64_t *g = vg;                                                         \
8586     do {                                                                      \
8587         uint64_t pg = g[(i - 1) >> 6];                                        \
8588         do {                                                                  \
8589             i -= sizeof(TYPEW);                                               \
8590             if (likely((pg >> (i & 63)) & 1)) {                               \
8591                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
8592                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
8593             }                                                                 \
8594         } while (i & 63);                                                     \
8595     } while (i != 0);                                                         \
8596 }
8597 
8598 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
8599 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
8600 
8601 #undef DO_FCVTLT
8602 #undef DO_FCVTNT
8603 
8604 void HELPER(pext)(void *vd, uint32_t png, uint32_t desc)
8605 {
8606     int pl = FIELD_EX32(desc, PREDDESC, OPRSZ);
8607     int vl = pl * 8;
8608     unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ);
8609     int part = FIELD_EX32(desc, PREDDESC, DATA);
8610     DecodeCounter p = decode_counter(png, vl, v_esz);
8611     uint64_t mask = pred_esz_masks[v_esz + p.lg2_stride];
8612     ARMPredicateReg *d = vd;
8613 
8614     /*
8615      * Convert from element count to byte count and adjust
8616      * for the portion of the 4*VL counter to be extracted.
8617      */
8618     int b_count = (p.count << v_esz) - vl * part;
8619 
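         /*
          * A positive b_count selects that many leading bytes; with the
          * inverted encoding it is the number of leading bytes that are
          * inactive, so the active portion is the tail of the register.
          */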
8620     memset(d, 0, sizeof(*d));
8621     if (p.invert) {
8622         if (b_count <= 0) {
8623             do_whilel(vd, mask, vl, vl);
8624         } else if (b_count < vl) {
8625             do_whileg(vd, mask, vl - b_count, vl);
8626         }
8627     } else if (b_count > 0) {
8628         do_whilel(vd, mask, MIN(b_count, vl), vl);
8629     }
8630 }
8631