1 /*
2  * ARM SVE Operations
3  *
4  * Copyright (c) 2018 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg.h"
28 #include "vec_internal.h"
29 #include "sve_ldst_internal.h"
30 #include "hw/core/tcg-cpu-ops.h"
31 
32 
33 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
34  *
35  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
36  * and bit 0 set if C is set.  Compare the definitions of these variables
37  * within CPUARMState.
38  */
39 
40 /* For no G bits set, NZCV = C.  */
41 #define PREDTEST_INIT  1
42 
43 /* This is an iterative function, called for each Pd and Pg word
44  * moving forward.
45  */
46 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
47 {
48     if (likely(g)) {
49         /* Compute N from first D & G.
50            Use bit 2 to signal first G bit seen.  */
51         if (!(flags & 4)) {
52             flags |= ((d & (g & -g)) != 0) << 31;
53             flags |= 4;
54         }
55 
56         /* Accumulate Z from each D & G.  */
57         flags |= ((d & g) != 0) << 1;
58 
59         /* Compute C from last !(D & G).  Replace previous.  */
60         flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
61     }
62     return flags;
63 }
64 
65 /* This is an iterative function, called for each Pd and Pg word
66  * moving backward.
67  */
68 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
69 {
70     if (likely(g)) {
71         /* Compute C from first (i.e. last) !(D & G).
72            Use bit 2 to signal first G bit seen.  */
73         if (!(flags & 4)) {
74             flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
75             flags |= (d & pow2floor(g)) == 0;
76         }
77 
78         /* Accumulate Z from each D & G.  */
79         flags |= ((d & g) != 0) << 1;
80 
81         /* Compute N from last (i.e. first) D & G.  Replace previous.  */
82         flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
83     }
84     return flags;
85 }
86 
87 /* The same for a single word predicate.  */
88 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
89 {
90     return iter_predtest_fwd(d, g, PREDTEST_INIT);
91 }
92 
93 /* The same for a multi-word predicate.  */
94 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
95 {
96     uint32_t flags = PREDTEST_INIT;
97     uint64_t *d = vd, *g = vg;
98     uintptr_t i = 0;
99 
100     do {
101         flags = iter_predtest_fwd(d[i], g[i], flags);
102     } while (++i < words);
103 
104     return flags;
105 }
106 
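/*
 * Illustrative usage sketch (not part of the original file and not
 * referenced by the translator): run a PredTest over one predicate
 * word and unpack the condition bits from the packed return value
 * described above.
 */
static inline void example_predtest1_decode(uint64_t d, uint64_t g,
                                            bool *n, bool *z, bool *c)
{
    uint32_t flags = iter_predtest_fwd(d, g, PREDTEST_INIT);

    *n = extract32(flags, 31, 1);   /* first active element was true */
    *z = !extract32(flags, 1, 1);   /* bit 1 set means Z is clear */
    *c = extract32(flags, 0, 1);    /* last active element was false */
}
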
107 /* Similarly for single word elements.  */
108 static inline uint64_t expand_pred_s(uint8_t byte)
109 {
110     static const uint64_t word[] = {
111         [0x01] = 0x00000000ffffffffull,
112         [0x10] = 0xffffffff00000000ull,
113         [0x11] = 0xffffffffffffffffull,
114     };
115     return word[byte & 0x11];
116 }
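
/*
 * For example (explanatory note): a predicate byte of 0x11 marks both
 * 32-bit elements of the corresponding 8-byte group as active, so the
 * table above expands it to all-ones, while 0x01 selects only the low
 * word and 0x10 only the high word.
 */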
117 
118 #define LOGICAL_PPPP(NAME, FUNC) \
119 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
120 {                                                                         \
121     uintptr_t opr_sz = simd_oprsz(desc);                                  \
122     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
123     uintptr_t i;                                                          \
124     for (i = 0; i < opr_sz / 8; ++i) {                                    \
125         d[i] = FUNC(n[i], m[i], g[i]);                                    \
126     }                                                                     \
127 }
128 
129 #define DO_AND(N, M, G)  (((N) & (M)) & (G))
130 #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
131 #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
132 #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
133 #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
134 #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
135 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
136 #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
137 
138 LOGICAL_PPPP(sve_and_pppp, DO_AND)
139 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
140 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
141 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
142 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
143 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
144 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
145 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
146 
147 #undef DO_AND
148 #undef DO_BIC
149 #undef DO_EOR
150 #undef DO_ORR
151 #undef DO_ORN
152 #undef DO_NOR
153 #undef DO_NAND
154 #undef DO_SEL
155 #undef LOGICAL_PPPP
156 
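/*
 * For reference, an expansion sketch (explanatory, not original text):
 * LOGICAL_PPPP(sve_and_pppp, DO_AND) above becomes, roughly,
 *
 *   void helper_sve_and_pppp(void *vd, void *vn, void *vm, void *vg,
 *                            uint32_t desc)
 *   {
 *       uint64_t *d = vd, *n = vn, *m = vm, *g = vg;
 *       for (uintptr_t i = 0; i < simd_oprsz(desc) / 8; ++i) {
 *           d[i] = (n[i] & m[i]) & g[i];
 *       }
 *   }
 *
 * Predicate registers are processed 64 bits at a time, with the
 * governing predicate G simply masking the boolean result.
 */
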
157 /* Fully general three-operand expander, controlled by a predicate.
158  * This is complicated by the host-endian storage of the register file.
159  */
160 /* ??? I don't expect the compiler could ever vectorize this itself.
161  * With some tables we can convert bit masks to byte masks, and with
162  * extra care wrt byte/word ordering we could use gcc generic vectors
163  * and do 16 bytes at a time.
164  */
165 #define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
166 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
167 {                                                                       \
168     intptr_t i, opr_sz = simd_oprsz(desc);                              \
169     for (i = 0; i < opr_sz; ) {                                         \
170         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
171         do {                                                            \
172             if (pg & 1) {                                               \
173                 TYPE nn = *(TYPE *)(vn + H(i));                         \
174                 TYPE mm = *(TYPE *)(vm + H(i));                         \
175                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
176             }                                                           \
177             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
178         } while (i & 15);                                               \
179     }                                                                   \
180 }
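
/*
 * Explanatory note: the governing predicate is read 16 bits at a time,
 * covering 16 bytes of vector data per outer iteration.  Only the
 * lowest bit of each sizeof(TYPE)-bit group of the predicate is
 * significant, which is why pg is shifted down by sizeof(TYPE) after
 * each element.
 */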
181 
182 /* Similarly, specialized for 64-bit operands.  */
183 #define DO_ZPZZ_D(NAME, TYPE, OP)                                \
184 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
185 {                                                               \
186     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
187     TYPE *d = vd, *n = vn, *m = vm;                             \
188     uint8_t *pg = vg;                                           \
189     for (i = 0; i < opr_sz; i += 1) {                           \
190         if (pg[H1(i)] & 1) {                                    \
191             TYPE nn = n[i], mm = m[i];                          \
192             d[i] = OP(nn, mm);                                  \
193         }                                                       \
194     }                                                           \
195 }
196 
197 #define DO_AND(N, M)  (N & M)
198 #define DO_EOR(N, M)  (N ^ M)
199 #define DO_ORR(N, M)  (N | M)
200 #define DO_BIC(N, M)  (N & ~M)
201 #define DO_ADD(N, M)  (N + M)
202 #define DO_SUB(N, M)  (N - M)
203 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
204 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
205 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
206 #define DO_MUL(N, M)  (N * M)
207 
208 
209 /*
210  * We must avoid the C undefined behaviour cases: division by
211  * zero and signed division of INT_MIN by -1. Both of these
212  * have architecturally defined required results for Arm.
213  * We special case all signed divisions by -1 to avoid having
214  * to deduce the minimum integer for the type involved.
215  */
216 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
217 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
218 
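/*
 * Concrete cases (explanatory note): division by zero yields 0, and
 * signed division of the most negative value by -1 yields that same
 * value back (the negation wraps on store), matching the architected
 * SDIV and UDIV results while avoiding C undefined behaviour.
 */
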
219 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
220 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
221 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
222 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
223 
224 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
225 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
226 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
227 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
228 
229 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
230 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
231 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
232 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
233 
234 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
235 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
236 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
237 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
238 
239 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
240 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
241 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
242 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
243 
244 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
245 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
246 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
247 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
248 
249 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
250 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
251 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
252 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
253 
254 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
255 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
256 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
257 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
258 
259 DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
260 DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
261 DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
262 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
263 
264 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
265 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
266 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
267 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
268 
269 DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
270 DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
271 DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
272 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
273 
274 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
275 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
276 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
277 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
278 
279 /* Because the computation type is at least twice as large as required,
280    these work for both signed and unsigned source types.  */
281 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
282 {
283     return (n * m) >> 8;
284 }
285 
286 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
287 {
288     return (n * m) >> 16;
289 }
290 
291 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
292 {
293     return (n * m) >> 32;
294 }
295 
296 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
297 {
298     uint64_t lo, hi;
299     muls64(&lo, &hi, n, m);
300     return hi;
301 }
302 
303 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
304 {
305     uint64_t lo, hi;
306     mulu64(&lo, &hi, n, m);
307     return hi;
308 }
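
/*
 * Worked example (explanatory note): in do_mulh_b above, signed inputs
 * n = m = -1 give a 32-bit product of 1, whose high byte is 0, while
 * unsigned inputs n = m = 0xff give 0xfe01, whose high byte is 0xfe.
 * The promoted arithmetic is exact in both cases, which is why one
 * helper serves both the signed and unsigned instantiations below.
 */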
309 
310 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
311 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
312 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
313 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
314 
315 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
316 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
317 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
318 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
319 
320 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
321 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
322 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
323 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
324 
325 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
326 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
327 
328 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
329 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
330 
331 /* Note that all bits of the shift count are significant
332    and are not taken modulo the element size.  */
333 #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
334 #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
335 #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
336 
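/*
 * For example (explanatory note): with byte elements, DO_LSR by a
 * count of 8 or more produces 0, and DO_ASR clamps the count to 7, so
 * large counts replicate the sign bit (0 or -1) rather than wrapping
 * the count modulo 8.
 */
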
337 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
338 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
339 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
340 
341 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
342 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
343 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
344 
345 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
346 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
347 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
347 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
348 
349 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
350 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
351 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
352 
353 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
354 {
355     int8_t n1 = n, n2 = n >> 8;
356     return m + n1 + n2;
357 }
358 
359 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
360 {
361     int16_t n1 = n, n2 = n >> 16;
362     return m + n1 + n2;
363 }
364 
365 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
366 {
367     int32_t n1 = n, n2 = n >> 32;
368     return m + n1 + n2;
369 }
370 
371 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
372 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
373 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
374 
375 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
376 {
377     uint8_t n1 = n, n2 = n >> 8;
378     return m + n1 + n2;
379 }
380 
381 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
382 {
383     uint16_t n1 = n, n2 = n >> 16;
384     return m + n1 + n2;
385 }
386 
387 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
388 {
389     uint32_t n1 = n, n2 = n >> 32;
390     return m + n1 + n2;
391 }
392 
393 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
394 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
395 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
396 
397 #define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
398 #define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
399 #define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
400 #define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)
401 
402 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
403 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
404 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
405 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
406 
407 #define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
408 #define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
409 #define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
410 #define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)
411 
412 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
413 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
414 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
415 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
416 
417 /*
418  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
419  * We pass in a pointer to a dummy saturation field to trigger
420  * the saturating arithmetic but discard the information about
421  * whether it has occurred.
422  */
423 #define do_sqshl_b(n, m) \
424    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
425 #define do_sqshl_h(n, m) \
426    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
427 #define do_sqshl_s(n, m) \
428    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
429 #define do_sqshl_d(n, m) \
430    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
431 
432 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1, do_sqshl_b)
433 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
434 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
435 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
436 
437 #define do_uqshl_b(n, m) \
438    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
439 #define do_uqshl_h(n, m) \
440    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
441 #define do_uqshl_s(n, m) \
442    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
443 #define do_uqshl_d(n, m) \
444    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
445 
446 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1, do_uqshl_b)
447 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
448 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
449 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
450 
451 #define do_sqrshl_b(n, m) \
452    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
453 #define do_sqrshl_h(n, m) \
454    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
455 #define do_sqrshl_s(n, m) \
456    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
457 #define do_sqrshl_d(n, m) \
458    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
459 
460 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1, do_sqrshl_b)
461 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
462 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
463 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
464 
465 #undef do_sqrshl_d
466 
467 #define do_uqrshl_b(n, m) \
468    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
469 #define do_uqrshl_h(n, m) \
470    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
471 #define do_uqrshl_s(n, m) \
472    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
473 #define do_uqrshl_d(n, m) \
474    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
475 
476 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1, do_uqrshl_b)
477 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
478 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
479 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
480 
481 #undef do_uqrshl_d
482 
483 #define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
484 #define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
485 
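/*
 * Explanatory note: the 64-bit halving forms cannot widen, so they use
 * the identity (n + m) >> 1 == (n >> 1) + (m >> 1) + (n & m & 1),
 * which never overflows; the rounding-add and halving-subtract forms
 * below use the matching identities.
 */
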
486 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
487 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
488 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
489 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
490 
491 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
492 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
493 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
494 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
495 
496 #define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
497 #define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))
498 
499 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
500 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
501 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
502 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
503 
504 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
505 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
506 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
507 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
508 
509 #define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
510 #define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
511 
512 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
513 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
514 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
515 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
516 
517 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
518 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
519 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
520 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
521 
522 static inline int64_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
523 {
524     return val >= max ? max : val <= min ? min : val;
525 }
526 
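/*
 * Usage sketch (explanatory): do_sat_bhs(300, INT8_MIN, INT8_MAX) is
 * 127 and do_sat_bhs(-300, INT8_MIN, INT8_MAX) is -128; the byte,
 * half and word saturating helpers below are all built on this clamp.
 */
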
527 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
528 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
529 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
530 
531 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
532 {
533     int64_t r = n + m;
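    /*
     * Explanatory comment: signed overflow occurred iff n and m have
     * the same sign but r differs; ~(n ^ m) selects same-sign operand
     * pairs and (r ^ n) detects the sign change.
     */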
534     if (((r ^ n) & ~(n ^ m)) < 0) {
535         /* Signed overflow.  */
536         return r < 0 ? INT64_MAX : INT64_MIN;
537     }
538     return r;
539 }
540 
541 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
542 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
543 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
544 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
545 
546 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
547 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
548 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
549 
550 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
551 {
552     uint64_t r = n + m;
553     return r < n ? UINT64_MAX : r;
554 }
555 
556 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
557 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
558 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
559 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
560 
561 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
562 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
563 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
564 
565 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
566 {
567     int64_t r = n - m;
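    /*
     * Explanatory comment: for subtraction, overflow is only possible
     * when n and m have opposite signs, and is detected by the
     * result's sign differing from n.
     */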
568     if (((r ^ n) & (n ^ m)) < 0) {
569         /* Signed overflow.  */
570         return r < 0 ? INT64_MAX : INT64_MIN;
571     }
572     return r;
573 }
574 
575 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
576 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
577 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
578 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
579 
580 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
581 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
582 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
583 
584 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
585 {
586     return n > m ? n - m : 0;
587 }
588 
589 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
590 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
591 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
592 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
593 
594 #define DO_SUQADD_B(n, m) \
595     do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
596 #define DO_SUQADD_H(n, m) \
597     do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
598 #define DO_SUQADD_S(n, m) \
599     do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
600 
601 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
602 {
603     uint64_t r = n + m;
604 
605     if (n < 0) {
606         /* Note that m - abs(n) cannot underflow. */
607         if (r > INT64_MAX) {
608             /* Result is either very large positive or negative. */
609             if (m > -n) {
610                 /* m > abs(n), so r is a very large positive. */
611                 return INT64_MAX;
612             }
613             /* Result is negative. */
614         }
615     } else {
616         /* Both inputs are positive: check for overflow.  */
617         if (r < m || r > INT64_MAX) {
618             return INT64_MAX;
619         }
620     }
621     return r;
622 }
623 
624 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
625 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
626 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
627 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
628 
629 #define DO_USQADD_B(n, m) \
630     do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
631 #define DO_USQADD_H(n, m) \
632     do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
633 #define DO_USQADD_S(n, m) \
634     do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
635 
636 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
637 {
638     uint64_t r = n + m;
639 
640     if (m < 0) {
641         return n < -m ? 0 : r;
642     }
643     return r < n ? UINT64_MAX : r;
644 }
645 
646 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
647 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
648 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
649 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
650 
651 #undef DO_ZPZZ
652 #undef DO_ZPZZ_D
653 
654 /*
655  * Three operand expander, operating on element pairs.
656  * If the slot I is even, the elements come from VN {I, I+1}.
657  * If the slot I is odd, the elements come from VM {I-1, I}.
658  * Load all of the input elements in each pair before overwriting output.
659  */
660 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
661 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
662 {                                                               \
663     intptr_t i, opr_sz = simd_oprsz(desc);                      \
664     for (i = 0; i < opr_sz; ) {                                 \
665         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
666         do {                                                    \
667             TYPE n0 = *(TYPE *)(vn + H(i));                     \
668             TYPE m0 = *(TYPE *)(vm + H(i));                     \
669             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
670             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
671             if (pg & 1) {                                       \
672                 *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
673             }                                                   \
674             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
675             if (pg & 1) {                                       \
676                 *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
677             }                                                   \
678             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
679         } while (i & 15);                                       \
680     }                                                           \
681 }
682 
683 /* Similarly, specialized for 64-bit operands.  */
684 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
685 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
686 {                                                               \
687     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
688     TYPE *d = vd, *n = vn, *m = vm;                             \
689     uint8_t *pg = vg;                                           \
690     for (i = 0; i < opr_sz; i += 2) {                           \
691         TYPE n0 = n[i], n1 = n[i + 1];                          \
692         TYPE m0 = m[i], m1 = m[i + 1];                          \
693         if (pg[H1(i)] & 1) {                                    \
694             d[i] = OP(n0, n1);                                  \
695         }                                                       \
696         if (pg[H1(i + 1)] & 1) {                                \
697             d[i + 1] = OP(m0, m1);                              \
698         }                                                       \
699     }                                                           \
700 }
701 
702 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
703 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
704 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
705 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
706 
707 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
708 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
709 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
710 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
711 
712 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
713 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
714 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
715 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
716 
717 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
718 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
719 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
720 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
721 
722 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
723 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
724 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
725 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
726 
727 #undef DO_ZPZZ_PAIR
728 #undef DO_ZPZZ_PAIR_D
729 
730 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
731 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
732                   void *status, uint32_t desc)                          \
733 {                                                                       \
734     intptr_t i, opr_sz = simd_oprsz(desc);                              \
735     for (i = 0; i < opr_sz; ) {                                         \
736         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
737         do {                                                            \
738             TYPE n0 = *(TYPE *)(vn + H(i));                             \
739             TYPE m0 = *(TYPE *)(vm + H(i));                             \
740             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
741             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
742             if (pg & 1) {                                               \
743                 *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
744             }                                                           \
745             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
746             if (pg & 1) {                                               \
747                 *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
748             }                                                           \
749             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
750         } while (i & 15);                                               \
751     }                                                                   \
752 }
753 
754 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
755 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
756 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
757 
758 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
760 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
761 
762 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
764 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
765 
766 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
768 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
769 
770 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
772 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
773 
774 #undef DO_ZPZZ_PAIR_FP
775 
776 /* Three-operand expander, controlled by a predicate, in which the
777  * third operand is "wide".  That is, for D = N op M, the same 64-bit
778  * value of M is used with all of the narrower values of N.
779  */
780 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
781 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
782 {                                                                       \
783     intptr_t i, opr_sz = simd_oprsz(desc);                              \
784     for (i = 0; i < opr_sz; ) {                                         \
785         uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
786         TYPEW mm = *(TYPEW *)(vm + i);                                  \
787         do {                                                            \
788             if (pg & 1) {                                               \
789                 TYPE nn = *(TYPE *)(vn + H(i));                         \
790                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
791             }                                                           \
792             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
793         } while (i & 7);                                                \
794     }                                                                   \
795 }
796 
797 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
798 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
799 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
800 
801 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
802 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
803 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
804 
805 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
806 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
807 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
808 
809 #undef DO_ZPZW
810 
811 /* Fully general two-operand expander, controlled by a predicate.
812  */
813 #define DO_ZPZ(NAME, TYPE, H, OP)                               \
814 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
815 {                                                               \
816     intptr_t i, opr_sz = simd_oprsz(desc);                      \
817     for (i = 0; i < opr_sz; ) {                                 \
818         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
819         do {                                                    \
820             if (pg & 1) {                                       \
821                 TYPE nn = *(TYPE *)(vn + H(i));                 \
822                 *(TYPE *)(vd + H(i)) = OP(nn);                  \
823             }                                                   \
824             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
825         } while (i & 15);                                       \
826     }                                                           \
827 }
828 
829 /* Similarly, specialized for 64-bit operands.  */
830 #define DO_ZPZ_D(NAME, TYPE, OP)                                \
831 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
832 {                                                               \
833     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
834     TYPE *d = vd, *n = vn;                                      \
835     uint8_t *pg = vg;                                           \
836     for (i = 0; i < opr_sz; i += 1) {                           \
837         if (pg[H1(i)] & 1) {                                    \
838             TYPE nn = n[i];                                     \
839             d[i] = OP(nn);                                      \
840         }                                                       \
841     }                                                           \
842 }
843 
844 #define DO_CLS_B(N)   (clrsb32(N) - 24)
845 #define DO_CLS_H(N)   (clrsb32(N) - 16)
846 
847 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
848 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
849 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
850 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
851 
852 #define DO_CLZ_B(N)   (clz32(N) - 24)
853 #define DO_CLZ_H(N)   (clz32(N) - 16)
854 
855 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
856 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
857 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
858 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
859 
860 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
861 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
862 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
863 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
864 
865 #define DO_CNOT(N)    (N == 0)
866 
867 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
868 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
869 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
870 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
871 
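/* Explanatory note: ((__typeof(N))-1 >> 1) is an all-ones value of the
   element type with its top (sign) bit clear, so DO_FABS clears the
   IEEE sign bit and DO_FNEG below toggles it.  */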
872 #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
873 
874 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
875 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
876 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
877 
878 #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
879 
880 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
881 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
882 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
883 
884 #define DO_NOT(N)    (~N)
885 
886 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
887 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
888 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
889 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
890 
891 #define DO_SXTB(N)    ((int8_t)N)
892 #define DO_SXTH(N)    ((int16_t)N)
893 #define DO_SXTS(N)    ((int32_t)N)
894 #define DO_UXTB(N)    ((uint8_t)N)
895 #define DO_UXTH(N)    ((uint16_t)N)
896 #define DO_UXTS(N)    ((uint32_t)N)
897 
898 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
899 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
900 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
901 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
902 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
903 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
904 
905 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
906 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
907 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
908 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
909 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
910 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
911 
912 #define DO_ABS(N)    (N < 0 ? -N : N)
913 
914 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
915 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
916 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
917 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
918 
919 #define DO_NEG(N)    (-N)
920 
921 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
922 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
923 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
924 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
925 
926 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
927 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
928 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
929 
930 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
931 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
932 
933 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
934 
935 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
936 {
937     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
938     uint64_t *d = vd, *n = vn;
939     uint8_t *pg = vg;
940 
941     for (i = 0; i < opr_sz; i += 2) {
942         if (pg[H1(i)] & 1) {
943             uint64_t n0 = n[i + 0];
944             uint64_t n1 = n[i + 1];
945             d[i + 0] = n1;
946             d[i + 1] = n0;
947         }
948     }
949 }
950 
951 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
952 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
953 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
954 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
955 
956 #define DO_SQABS(X) \
957     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
958        x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
959 
960 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
961 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
962 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
963 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
964 
965 #define DO_SQNEG(X) \
966     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
967        x_ == min_ ? -min_ - 1 : -x_; })
968 
969 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
970 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
971 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
972 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
973 
974 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
975 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
976 
977 /* Three-operand expander, unpredicated, in which the third operand is "wide".
978  */
979 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
980 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
981 {                                                              \
982     intptr_t i, opr_sz = simd_oprsz(desc);                     \
983     for (i = 0; i < opr_sz; ) {                                \
984         TYPEW mm = *(TYPEW *)(vm + i);                         \
985         do {                                                   \
986             TYPE nn = *(TYPE *)(vn + H(i));                    \
987             *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
988             i += sizeof(TYPE);                                 \
989         } while (i & 7);                                       \
990     }                                                          \
991 }
992 
993 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
994 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
995 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
996 
997 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
998 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
999 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1000 
1001 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1002 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1003 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1004 
1005 #undef DO_ZZW
1006 
1007 #undef DO_CLS_B
1008 #undef DO_CLS_H
1009 #undef DO_CLZ_B
1010 #undef DO_CLZ_H
1011 #undef DO_CNOT
1012 #undef DO_FABS
1013 #undef DO_FNEG
1014 #undef DO_ABS
1015 #undef DO_NEG
1016 #undef DO_ZPZ
1017 #undef DO_ZPZ_D
1018 
1019 /*
1020  * Three-operand expander, unpredicated, in which the two inputs are
1021  * selected from the top or bottom half of the wide column.
1022  */
1023 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1024 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1025 {                                                                       \
1026     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1027     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1028     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1029     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1030         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1031         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1032         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
1033     }                                                                   \
1034 }
1035 
1036 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1037 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1038 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1039 
1040 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1041 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1042 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1043 
1044 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1045 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1046 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1047 
1048 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1049 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1050 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1051 
1052 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1053 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1054 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1055 
1056 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1057 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1058 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1059 
1060 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1061 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1062 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1063 
1064 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1065 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1066 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1067 
1068 /* Note that the multiply cannot overflow, but the doubling can. */
1069 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1070 {
1071     int16_t val = n * m;
1072     return DO_SQADD_H(val, val);
1073 }
1074 
1075 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1076 {
1077     int32_t val = n * m;
1078     return DO_SQADD_S(val, val);
1079 }
1080 
1081 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1082 {
1083     int64_t val = n * m;
1084     return do_sqadd_d(val, val);
1085 }
1086 
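/*
 * Worked example (explanatory note): in the byte-input form below,
 * n = m = -128 widens to int16_t and multiplies to 16384; doubling
 * that overflows int16_t and saturates to INT16_MAX via DO_SQADD_H.
 * This is the overflow-on-doubling case the comment above refers to.
 */
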
1087 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1088 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1089 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1090 
1091 #undef DO_ZZZ_TB
1092 
1093 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1094 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1095 {                                                              \
1096     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1097     int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1098     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1099         TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
1100         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
1101         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
1102     }                                                          \
1103 }
1104 
1105 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1106 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1107 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1108 
1109 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1110 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1111 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1112 
1113 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1114 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1115 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1116 
1117 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1118 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1119 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1120 
1121 #undef DO_ZZZ_WTB
1122 
1123 #define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
1124 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1125 {                                                                       \
1126     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1127     intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1128     intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1129     for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
1130         TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
1131         TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
1132         *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
1133     }                                                                   \
1134 }
1135 
1136 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1137 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1138 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1139 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1140 
1141 #undef DO_ZZZ_NTB
1142 
1143 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1144 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1145 {                                                               \
1146     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1147     intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
1148     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
1149         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
1150         TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
1151         TYPEW aa = *(TYPEW *)(va + HW(i));                      \
1152         *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
1153     }                                                           \
1154 }
1155 
1156 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1157 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1158 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1159 
1160 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1161 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1162 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1163 
1164 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1165 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1166 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1167 
1168 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1169 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1170 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1171 
1172 #define DO_NMUL(N, M)  -(N * M)
1173 
1174 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1175 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1176 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1177 
1178 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1179 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1180 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1181 
1182 #undef DO_ZZZW_ACC
1183 
1184 #define DO_XTNB(NAME, TYPE, OP) \
1185 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
1186 {                                                            \
1187     intptr_t i, opr_sz = simd_oprsz(desc);                   \
1188     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
1189         TYPE nn = *(TYPE *)(vn + i);                         \
1190         nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
1191         *(TYPE *)(vd + i) = nn;                              \
1192     }                                                        \
1193 }
1194 
1195 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
1196 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
1197 {                                                                       \
1198     intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
1199     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
1200         TYPE nn = *(TYPE *)(vn + i);                                    \
1201         *(TYPEN *)(vd + i + odd) = OP(nn);                              \
1202     }                                                                   \
1203 }
1204 
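/*
 * Explanatory note: the "B" (bottom) narrowing forms store the
 * saturated result into the even half of each wide element and zero
 * the odd half, while the "T" (top) forms store only into the odd
 * half and leave the even half of the destination unchanged.
 */
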
1205 #define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
1206 #define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
1207 #define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)
1208 
1209 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1210 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1211 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1212 
1213 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1214 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1215 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1216 
1217 #define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
1218 #define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
1219 #define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)
1220 
1221 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1222 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1223 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1224 
1225 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1226 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1227 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1228 
1229 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1230 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1231 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1232 
1233 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1234 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1235 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1236 
1237 #undef DO_XTNB
1238 #undef DO_XTNT
1239 
1240 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1241 {
1242     intptr_t i, opr_sz = simd_oprsz(desc);
1243     int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1244     uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1245     uint32_t *a = va, *n = vn;
1246     uint64_t *d = vd, *m = vm;
1247 
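    /*
     * Explanatory comment: each 64-bit lane of the middle operand
     * carries the incoming carry in bit 32; the 33-bit sum written to
     * d leaves the outgoing carry in the same position for a
     * following ADCLB/ADCLT to consume.
     */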
1248     for (i = 0; i < opr_sz / 8; ++i) {
1249         uint32_t e1 = a[2 * i + H4(0)];
1250         uint32_t e2 = n[2 * i + sel] ^ inv;
1251         uint64_t c = extract64(m[i], 32, 1);
1252         /* Compute and store the entire 33-bit result at once. */
1253         d[i] = c + e1 + e2;
1254     }
1255 }
1256 
1257 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1258 {
1259     intptr_t i, opr_sz = simd_oprsz(desc);
1260     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1261     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1262     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1263 
1264     for (i = 0; i < opr_sz / 8; i += 2) {
1265         Int128 e1 = int128_make64(a[i]);
1266         Int128 e2 = int128_make64(n[i + sel] ^ inv);
1267         Int128 c = int128_make64(m[i + 1] & 1);
1268         Int128 r = int128_add(int128_add(e1, e2), c);
1269         d[i + 0] = int128_getlo(r);
1270         d[i + 1] = int128_gethi(r);
1271     }
1272 }
1273 
1274 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1275 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1276 {                                                                       \
1277     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1278     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1279     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1280     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1281         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1282         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1283         TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1284         *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1285     }                                                                   \
1286 }
1287 
1288 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1289            do_sqdmull_h, DO_SQADD_H)
1290 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1291            do_sqdmull_s, DO_SQADD_S)
1292 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1293            do_sqdmull_d, do_sqadd_d)
1294 
1295 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1296            do_sqdmull_h, DO_SQSUB_H)
1297 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1298            do_sqdmull_s, DO_SQSUB_S)
1299 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1300            do_sqdmull_d, do_sqsub_d)
1301 
1302 #undef DO_SQDMLAL
1303 
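/* Complex integer multiply-add expander (CMLA, and SQRDCMLAH below).
 * Elements are processed as real/imaginary pairs; "rot" (0, 90, 180 or
 * 270 degrees) selects which element of each Zn pair multiplies the two
 * Zm elements and whether each product is added to or subtracted from
 * its accumulator half.
 */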
1304 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1305 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1306 {                                                               \
1307     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
1308     int rot = simd_data(desc);                                  \
1309     int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
1310     bool sub_r = rot == 1 || rot == 2;                          \
1311     bool sub_i = rot >= 2;                                      \
1312     TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
1313     for (i = 0; i < opr_sz; i += 2) {                           \
1314         TYPE elt1_a = n[H(i + sel_a)];                          \
1315         TYPE elt2_a = m[H(i + sel_a)];                          \
1316         TYPE elt2_b = m[H(i + sel_b)];                          \
1317         d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
1318         d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
1319     }                                                           \
1320 }
1321 
1322 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1323 
1324 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1325 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1326 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1327 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1328 
1329 #define DO_SQRDMLAH_B(N, M, A, S) \
1330     do_sqrdmlah_b(N, M, A, S, true)
1331 #define DO_SQRDMLAH_H(N, M, A, S) \
1332     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1333 #define DO_SQRDMLAH_S(N, M, A, S) \
1334     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1335 #define DO_SQRDMLAH_D(N, M, A, S) \
1336     do_sqrdmlah_d(N, M, A, S, true)
1337 
1338 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1339 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1340 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1341 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1342 
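/* As above, but the complex element of Zm is selected by an immediate
 * index and shared by all pairs within each 128-bit segment.
 */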
1343 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1344 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1345 {                                                                           \
1346     intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1347     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1348     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1349     int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1350     bool sub_r = rot == 1 || rot == 2;                                      \
1351     bool sub_i = rot >= 2;                                                  \
1352     TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1353     for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1354         TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1355         TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1356         for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1357             TYPE elt1_a = n[H(i + j + sel_a)];                              \
1358             d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);          \
1359             d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);  \
1360         }                                                                   \
1361     }                                                                       \
1362 }
1363 
1364 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1365 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1366 
1367 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1368 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1369 
1370 #undef DO_CMLA
1371 #undef DO_CMLA_FUNC
1372 #undef DO_CMLA_IDX_FUNC
1373 #undef DO_SQRDMLAH_B
1374 #undef DO_SQRDMLAH_H
1375 #undef DO_SQRDMLAH_S
1376 #undef DO_SQRDMLAH_D
1377 
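/* One step of the complex integer dot product (CDOT): the real and
 * imaginary parts of each pair in N are multiplied by the components of
 * M selected by the rotation, with the sign of the second product also
 * determined by the rotation, and the results are summed into A.
 */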
1378 /* Note N and M are 4 elements bundled into one unit. */
1379 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1380                          int sel_a, int sel_b, int sub_i)
1381 {
1382     for (int i = 0; i <= 1; i++) {
1383         int32_t elt1_r = (int8_t)(n >> (16 * i));
1384         int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1385         int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1386         int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1387 
1388         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1389     }
1390     return a;
1391 }
1392 
1393 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1394                          int sel_a, int sel_b, int sub_i)
1395 {
1396     for (int i = 0; i <= 1; i++) {
1397         int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1398         int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1399         int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1400         int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1401 
1402         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1403     }
1404     return a;
1405 }
1406 
1407 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1408                               void *va, uint32_t desc)
1409 {
1410     int opr_sz = simd_oprsz(desc);
1411     int rot = simd_data(desc);
1412     int sel_a = rot & 1;
1413     int sel_b = sel_a ^ 1;
1414     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1415     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1416 
1417     for (int e = 0; e < opr_sz / 4; e++) {
1418         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1419     }
1420 }
1421 
1422 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1423                               void *va, uint32_t desc)
1424 {
1425     int opr_sz = simd_oprsz(desc);
1426     int rot = simd_data(desc);
1427     int sel_a = rot & 1;
1428     int sel_b = sel_a ^ 1;
1429     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1430     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1431 
1432     for (int e = 0; e < opr_sz / 8; e++) {
1433         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1434     }
1435 }
1436 
1437 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1438                              void *va, uint32_t desc)
1439 {
1440     int opr_sz = simd_oprsz(desc);
1441     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1442     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1443     int sel_a = rot & 1;
1444     int sel_b = sel_a ^ 1;
1445     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1446     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1447 
1448     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1449         uint32_t seg_m = m[seg + idx];
1450         for (int e = 0; e < 4; e++) {
1451             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1452                                    sel_a, sel_b, sub_i);
1453         }
1454     }
1455 }
1456 
1457 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1458                              void *va, uint32_t desc)
1459 {
1460     int seg, opr_sz = simd_oprsz(desc);
1461     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1462     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1463     int sel_a = rot & 1;
1464     int sel_b = sel_a ^ 1;
1465     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1466     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1467 
1468     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1469         uint64_t seg_m = m[seg + idx];
1470         for (int e = 0; e < 2; e++) {
1471             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1472                                    sel_a, sel_b, sub_i);
1473         }
1474     }
1475 }
1476 
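/* Indexed multiply-accumulate expander (SQRDMLAH/SQRDMLSH by element):
 * within each 128-bit segment, the Zm element selected by the index is
 * broadcast to all lanes of that segment.
 */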
1477 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1478 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1479 {                                                                       \
1480     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1481     intptr_t i, j, idx = simd_data(desc);                               \
1482     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1483     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1484         TYPE mm = m[i];                                                 \
1485         for (j = 0; j < segment; j++) {                                 \
1486             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1487         }                                                               \
1488     }                                                                   \
1489 }
1490 
1491 #define DO_SQRDMLAH_H(N, M, A) \
1492     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1493 #define DO_SQRDMLAH_S(N, M, A) \
1494     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1495 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1496 
1497 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1498 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1499 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1500 
1501 #define DO_SQRDMLSH_H(N, M, A) \
1502     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1503 #define DO_SQRDMLSH_S(N, M, A) \
1504     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1505 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1506 
1507 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1508 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1509 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1510 
1511 #undef DO_ZZXZ
1512 
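/* Widening indexed multiply-accumulate expander: the bottom or top narrow
 * elements of Zn (selected by "sel") are multiplied by the narrow Zm
 * element selected by the index within each 128-bit segment, and the
 * products are accumulated into the corresponding wide elements.
 */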
1513 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1514 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1515 {                                                                         \
1516     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1517     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1518     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1519     for (i = 0; i < oprsz; i += 16) {                                     \
1520         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1521         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1522             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1523             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1524             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1525         }                                                                 \
1526     }                                                                     \
1527 }
1528 
1529 #define DO_MLA(N, M, A)  (A + N * M)
1530 
1531 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1532 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1533 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1534 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1535 
1536 #define DO_MLS(N, M, A)  (A - N * M)
1537 
1538 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1539 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1540 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1541 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1542 
1543 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1544 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1545 
1546 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1547 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1548 
1549 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1550 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1551 
1552 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1553 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1554 
1555 #undef DO_MLA
1556 #undef DO_MLS
1557 #undef DO_ZZXW
1558 
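/* Widening indexed multiply expander: as above, but without an accumulator.  */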
1559 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1560 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1561 {                                                                         \
1562     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1563     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1564     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1565     for (i = 0; i < oprsz; i += 16) {                                     \
1566         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1567         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1568             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1569             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1570         }                                                                 \
1571     }                                                                     \
1572 }
1573 
1574 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1575 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1576 
1577 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1578 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1579 
1580 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1581 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1582 
1583 #undef DO_ZZX
1584 
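/* SVE2 bit permute expander (BEXT, BDEP, BGRP): each element of Zn is
 * permuted under control of the mask in the corresponding element of Zm.
 */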
1585 #define DO_BITPERM(NAME, TYPE, OP) \
1586 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1587 {                                                              \
1588     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1589     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1590         TYPE nn = *(TYPE *)(vn + i);                           \
1591         TYPE mm = *(TYPE *)(vm + i);                           \
1592         *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1593     }                                                          \
1594 }
1595 
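/* Gather the data bits selected by the mask into the low-order bits
 * of the result (BEXT).
 */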
1596 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1597 {
1598     uint64_t res = 0;
1599     int db, rb = 0;
1600 
1601     for (db = 0; db < n; ++db) {
1602         if ((mask >> db) & 1) {
1603             res |= ((data >> db) & 1) << rb;
1604             ++rb;
1605         }
1606     }
1607     return res;
1608 }
1609 
1610 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1611 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1612 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1613 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1614 
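/* Scatter the low-order data bits into the bit positions selected by
 * the mask (BDEP).
 */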
1615 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1616 {
1617     uint64_t res = 0;
1618     int rb, db = 0;
1619 
1620     for (rb = 0; rb < n; ++rb) {
1621         if ((mask >> rb) & 1) {
1622             res |= ((data >> db) & 1) << rb;
1623             ++db;
1624         }
1625     }
1626     return res;
1627 }
1628 
1629 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1630 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1631 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1632 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1633 
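/* Pack the data bits selected by the mask at the bottom of the result,
 * with the unselected bits placed immediately above them (BGRP).
 */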
1634 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1635 {
1636     uint64_t resm = 0, resu = 0;
1637     int db, rbm = 0, rbu = 0;
1638 
1639     for (db = 0; db < n; ++db) {
1640         uint64_t val = (data >> db) & 1;
1641         if ((mask >> db) & 1) {
1642             resm |= val << rbm++;
1643         } else {
1644             resu |= val << rbu++;
1645         }
1646     }
1647 
1648     return resm | (resu << rbm);
1649 }
1650 
1651 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1652 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1653 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1654 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1655 
1656 #undef DO_BITPERM
1657 
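/* Complex add expander (CADD, SQCADD): the elements of Zm are rotated by
 * 90 or 270 degrees (selected by the descriptor bit) before being added
 * to the real/imaginary element pairs of Zn.
 */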
1658 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1659 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1660 {                                                               \
1661     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1662     int sub_r = simd_data(desc);                                \
1663     if (sub_r) {                                                \
1664         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1665             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1666             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1667             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1668             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1669             acc_r = ADD_OP(acc_r, el2_i);                       \
1670             acc_i = SUB_OP(acc_i, el2_r);                       \
1671             *(TYPE *)(vd + H(i)) = acc_r;                       \
1672             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1673         }                                                       \
1674     } else {                                                    \
1675         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1676             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1677             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1678             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1679             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1680             acc_r = SUB_OP(acc_r, el2_i);                       \
1681             acc_i = ADD_OP(acc_i, el2_r);                       \
1682             *(TYPE *)(vd + H(i)) = acc_r;                       \
1683             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1684         }                                                       \
1685     }                                                           \
1686 }
1687 
1688 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1689 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1690 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1691 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1692 
1693 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1694 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1695 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1696 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1697 
1698 #undef DO_CADD
1699 
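/* Shift-left long expanders (SSHLLB/T, USHLLB/T): widen the bottom or top
 * narrow elements of Zn, then shift left by the immediate.
 */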
1700 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1701 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1702 {                                                              \
1703     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1704     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1705     int shift = simd_data(desc) >> 1;                          \
1706     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1707         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1708         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1709     }                                                          \
1710 }
1711 
1712 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1713 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1714 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1715 
1716 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1717 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1718 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1719 
1720 #undef DO_ZZI_SHLL
1721 
1722 /* Two-operand reduction expander, controlled by a predicate.
1723  * The difference between TYPERED and TYPERET has to do with
1724  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1725  * but TYPERET must be unsigned so that e.g. a 32-bit value
1726  * is not sign-extended to the ABI uint64_t return type.
1727  */
1728 /* ??? If we were to vectorize this by hand the reduction ordering
1729  * would change.  For integer operands, this is perfectly fine.
1730  */
1731 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1732 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1733 {                                                          \
1734     intptr_t i, opr_sz = simd_oprsz(desc);                 \
1735     TYPERED ret = INIT;                                    \
1736     for (i = 0; i < opr_sz; ) {                            \
1737         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1738         do {                                               \
1739             if (pg & 1) {                                  \
1740                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1741                 ret = OP(ret, nn);                         \
1742             }                                              \
1743             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1744         } while (i & 15);                                  \
1745     }                                                      \
1746     return (TYPERET)ret;                                   \
1747 }
1748 
1749 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1750 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1751 {                                                          \
1752     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1753     TYPEE *n = vn;                                         \
1754     uint8_t *pg = vg;                                      \
1755     TYPER ret = INIT;                                      \
1756     for (i = 0; i < opr_sz; i += 1) {                      \
1757         if (pg[H1(i)] & 1) {                               \
1758             TYPEE nn = n[i];                               \
1759             ret = OP(ret, nn);                             \
1760         }                                                  \
1761     }                                                      \
1762     return ret;                                            \
1763 }
1764 
1765 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1766 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1767 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1768 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1769 
1770 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1771 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1772 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1773 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1774 
1775 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1776 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1777 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1778 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1779 
1780 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1781 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1782 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1783 
1784 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1785 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1786 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1787 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1788 
1789 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1790 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1791 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1792 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1793 
1794 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1795 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1796 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1797 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1798 
1799 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1800 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1801 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1802 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1803 
1804 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1805 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1806 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1807 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1808 
1809 #undef DO_VPZ
1810 #undef DO_VPZ_D
1811 
1812 /* Two vector operand, one scalar operand, unpredicated.  */
1813 #define DO_ZZI(NAME, TYPE, OP)                                       \
1814 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1815 {                                                                    \
1816     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1817     TYPE s = s64, *d = vd, *n = vn;                                  \
1818     for (i = 0; i < opr_sz; ++i) {                                   \
1819         d[i] = OP(n[i], s);                                          \
1820     }                                                                \
1821 }
1822 
1823 #define DO_SUBR(X, Y)   (Y - X)
1824 
1825 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1826 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1827 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1828 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1829 
1830 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1831 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1832 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1833 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1834 
1835 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1836 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1837 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1838 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1839 
1840 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1841 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1842 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1843 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1844 
1845 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1846 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1847 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1848 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1849 
1850 #undef DO_ZZI
1851 
1852 #undef DO_AND
1853 #undef DO_ORR
1854 #undef DO_EOR
1855 #undef DO_BIC
1856 #undef DO_ADD
1857 #undef DO_SUB
1858 #undef DO_MAX
1859 #undef DO_MIN
1860 #undef DO_ABD
1861 #undef DO_MUL
1862 #undef DO_DIV
1863 #undef DO_ASR
1864 #undef DO_LSR
1865 #undef DO_LSL
1866 #undef DO_SUBR
1867 
1868 /* Similar to the ARM LastActiveElement pseudocode function, except the
1869    result is multiplied by the element size.  This includes the not found
1870    indication; e.g. not found for esz=3 is -8.  */
1871 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1872 {
1873     uint64_t mask = pred_esz_masks[esz];
1874     intptr_t i = words;
1875 
1876     do {
1877         uint64_t this_g = g[--i] & mask;
1878         if (this_g) {
1879             return i * 64 + (63 - clz64(this_g));
1880         }
1881     } while (i > 0);
1882     return (intptr_t)-1 << esz;
1883 }
1884 
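/* PFIRST: set the first element of Pd that is active in Pg, leaving the
 * other elements unchanged, and return the PredTest flags for the result.
 */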
1885 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1886 {
1887     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1888     uint32_t flags = PREDTEST_INIT;
1889     uint64_t *d = vd, *g = vg;
1890     intptr_t i = 0;
1891 
1892     do {
1893         uint64_t this_d = d[i];
1894         uint64_t this_g = g[i];
1895 
1896         if (this_g) {
1897             if (!(flags & 4)) {
1898                 /* Set in D the first bit of G.  */
1899                 this_d |= this_g & -this_g;
1900                 d[i] = this_d;
1901             }
1902             flags = iter_predtest_fwd(this_d, this_g, flags);
1903         }
1904     } while (++i < words);
1905 
1906     return flags;
1907 }
1908 
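/* PNEXT: starting just past the last active element of Pd, find the next
 * element (at the given element size) that is active in Pg; clear Pd and
 * set only that element (Pd becomes all-false if there is none), and
 * return the PredTest flags for the result.
 */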
1909 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1910 {
1911     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1912     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1913     uint32_t flags = PREDTEST_INIT;
1914     uint64_t *d = vd, *g = vg, esz_mask;
1915     intptr_t i, next;
1916 
1917     next = last_active_element(vd, words, esz) + (1 << esz);
1918     esz_mask = pred_esz_masks[esz];
1919 
1920     /* Similar to the pseudocode for pnext, but scaled by ESZ
1921        so that we find the correct bit.  */
1922     if (next < words * 64) {
1923         uint64_t mask = -1;
1924 
1925         if (next & 63) {
1926             mask = ~((1ull << (next & 63)) - 1);
1927             next &= -64;
1928         }
1929         do {
1930             uint64_t this_g = g[next / 64] & esz_mask & mask;
1931             if (this_g != 0) {
1932                 next = (next & -64) + ctz64(this_g);
1933                 break;
1934             }
1935             next += 64;
1936             mask = -1;
1937         } while (next < words * 64);
1938     }
1939 
1940     i = 0;
1941     do {
1942         uint64_t this_d = 0;
1943         if (i == next / 64) {
1944             this_d = 1ull << (next & 63);
1945         }
1946         d[i] = this_d;
1947         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1948     } while (++i < words);
1949 
1950     return flags;
1951 }
1952 
1953 /*
1954  * Copy Zn into Zd, and store zero into inactive elements.
1955  * If inv, store zeros into the active elements.
1956  */
1957 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1958 {
1959     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1960     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1961     uint64_t *d = vd, *n = vn;
1962     uint8_t *pg = vg;
1963 
1964     for (i = 0; i < opr_sz; i += 1) {
1965         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1966     }
1967 }
1968 
1969 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1970 {
1971     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1972     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1973     uint64_t *d = vd, *n = vn;
1974     uint8_t *pg = vg;
1975 
1976     for (i = 0; i < opr_sz; i += 1) {
1977         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1978     }
1979 }
1980 
1981 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1982 {
1983     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1984     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1985     uint64_t *d = vd, *n = vn;
1986     uint8_t *pg = vg;
1987 
1988     for (i = 0; i < opr_sz; i += 1) {
1989         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1990     }
1991 }
1992 
1993 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1994 {
1995     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1996     uint64_t *d = vd, *n = vn;
1997     uint8_t *pg = vg;
1998     uint8_t inv = simd_data(desc);
1999 
2000     for (i = 0; i < opr_sz; i += 1) {
2001         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2002     }
2003 }
2004 
2005 /* Three-operand expander, immediate operand, controlled by a predicate.
2006  */
2007 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
2008 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2009 {                                                               \
2010     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2011     TYPE imm = simd_data(desc);                                 \
2012     for (i = 0; i < opr_sz; ) {                                 \
2013         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2014         do {                                                    \
2015             if (pg & 1) {                                       \
2016                 TYPE nn = *(TYPE *)(vn + H(i));                 \
2017                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2018             }                                                   \
2019             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2020         } while (i & 15);                                       \
2021     }                                                           \
2022 }
2023 
2024 /* Similarly, specialized for 64-bit operands.  */
2025 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
2026 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2027 {                                                               \
2028     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2029     TYPE *d = vd, *n = vn;                                      \
2030     TYPE imm = simd_data(desc);                                 \
2031     uint8_t *pg = vg;                                           \
2032     for (i = 0; i < opr_sz; i += 1) {                           \
2033         if (pg[H1(i)] & 1) {                                    \
2034             TYPE nn = n[i];                                     \
2035             d[i] = OP(nn, imm);                                 \
2036         }                                                       \
2037     }                                                           \
2038 }
2039 
2040 #define DO_SHR(N, M)  (N >> M)
2041 #define DO_SHL(N, M)  (N << M)
2042 
2043 /* Arithmetic shift right for division.  This rounds negative numbers
2044    toward zero as per signed division.  Therefore before shifting,
2045    when N is negative, add 2**M-1.  */
2046 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
2047 
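/* Rounding shift right helpers: equivalent to (x + (1 << (sh - 1))) >> sh,
 * computed without intermediate overflow.
 */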
2048 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2049 {
2050     if (likely(sh < 64)) {
2051         return (x >> sh) + ((x >> (sh - 1)) & 1);
2052     } else if (sh == 64) {
2053         return x >> 63;
2054     } else {
2055         return 0;
2056     }
2057 }
2058 
2059 static inline int64_t do_srshr(int64_t x, unsigned sh)
2060 {
2061     if (likely(sh < 64)) {
2062         return (x >> sh) + ((x >> (sh - 1)) & 1);
2063     } else {
2064         /* Rounding the sign bit always produces 0. */
2065         return 0;
2066     }
2067 }
2068 
2069 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2070 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2071 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2072 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2073 
2074 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2075 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2076 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2077 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2078 
2079 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2080 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2081 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2082 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2083 
2084 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2085 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2086 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2087 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2088 
2089 /* SVE2 bitwise shift by immediate */
2090 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2091 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2092 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2093 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2094 
2095 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2096 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2097 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2098 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2099 
2100 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2101 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2102 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2103 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2104 
2105 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2106 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2107 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2108 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2109 
2110 #define do_suqrshl_b(n, m) \
2111    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2112 #define do_suqrshl_h(n, m) \
2113    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2114 #define do_suqrshl_s(n, m) \
2115    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2116 #define do_suqrshl_d(n, m) \
2117    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2118 
2119 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2120 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2121 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2122 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2123 
2124 #undef DO_ASRD
2125 #undef DO_ZPZI
2126 #undef DO_ZPZI_D
2127 
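/* Narrowing shift expanders.  The *B (bottom) forms write the narrowed
 * result to the even-numbered elements and zero the odd half; the *T
 * (top) forms write the odd-numbered elements and leave the even half
 * of Zd unchanged.
 */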
2128 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2129 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2130 {                                                            \
2131     intptr_t i, opr_sz = simd_oprsz(desc);                   \
2132     int shift = simd_data(desc);                             \
2133     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2134         TYPEW nn = *(TYPEW *)(vn + i);                       \
2135         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2136     }                                                        \
2137 }
2138 
2139 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2140 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2141 {                                                                 \
2142     intptr_t i, opr_sz = simd_oprsz(desc);                        \
2143     int shift = simd_data(desc);                                  \
2144     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2145         TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2146         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2147     }                                                             \
2148 }
2149 
2150 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2151 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2152 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2153 
2154 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2155 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2156 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2157 
2158 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2159 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2160 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2161 
2162 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2163 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2164 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2165 
2166 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2167 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2168 #define DO_SQSHRUN_D(x, sh) \
2169     do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2170 
2171 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2172 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2173 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2174 
2175 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2176 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2177 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2178 
2179 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2180 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2181 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2182 
2183 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2184 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2185 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2186 
2187 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2188 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2189 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2190 
2191 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2192 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2193 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2194 
2195 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2196 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2197 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2198 
2199 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2200 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2201 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2202 
2203 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2204 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2205 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2206 
2207 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2208 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2209 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2210 
2211 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2212 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2213 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2214 
2215 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2216 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2217 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2218 
2219 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2220 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2221 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2222 
2223 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2224 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2225 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2226 
2227 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2228 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2229 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2230 
2231 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2232 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2233 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2234 
2235 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2236 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2237 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2238 
2239 #undef DO_SHRNB
2240 #undef DO_SHRNT
2241 
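/* Narrowing binary expanders for ADDHNB/T, RADDHNB/T, SUBHNB/T and
 * RSUBHNB/T: add or subtract the wide elements, optionally round, narrow
 * to the high half, and store to the bottom or top narrow elements as
 * for the shift-narrowing expanders above.
 */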
2242 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2243 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2244 {                                                                           \
2245     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2246     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2247         TYPEW nn = *(TYPEW *)(vn + i);                                      \
2248         TYPEW mm = *(TYPEW *)(vm + i);                                      \
2249         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2250     }                                                                       \
2251 }
2252 
2253 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2254 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2255 {                                                                           \
2256     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2257     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2258         TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2259         TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2260         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2261     }                                                                       \
2262 }
2263 
2264 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2265 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2266 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2267 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2268 
2269 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2270 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2271 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2272 
2273 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2274 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2275 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2276 
2277 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2278 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2279 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2280 
2281 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2282 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2283 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2284 
2285 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2286 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2287 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2288 
2289 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2290 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2291 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2292 
2293 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2294 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2295 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2296 
2297 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2298 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2299 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2300 
2301 #undef DO_RSUBHN
2302 #undef DO_SUBHN
2303 #undef DO_RADDHN
2304 #undef DO_ADDHN
2305 
2306 #undef DO_BINOPNB
2307 
2308 /* Fully general four-operand expander, controlled by a predicate.
2309  */
2310 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2311 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2312                   void *vg, uint32_t desc)                    \
2313 {                                                             \
2314     intptr_t i, opr_sz = simd_oprsz(desc);                    \
2315     for (i = 0; i < opr_sz; ) {                               \
2316         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2317         do {                                                  \
2318             if (pg & 1) {                                     \
2319                 TYPE nn = *(TYPE *)(vn + H(i));               \
2320                 TYPE mm = *(TYPE *)(vm + H(i));               \
2321                 TYPE aa = *(TYPE *)(va + H(i));               \
2322                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2323             }                                                 \
2324             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2325         } while (i & 15);                                     \
2326     }                                                         \
2327 }
2328 
2329 /* Similarly, specialized for 64-bit operands.  */
2330 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2331 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2332                   void *vg, uint32_t desc)                    \
2333 {                                                             \
2334     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2335     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2336     uint8_t *pg = vg;                                         \
2337     for (i = 0; i < opr_sz; i += 1) {                         \
2338         if (pg[H1(i)] & 1) {                                  \
2339             TYPE aa = a[i], nn = n[i], mm = m[i];             \
2340             d[i] = OP(aa, nn, mm);                            \
2341         }                                                     \
2342     }                                                         \
2343 }
2344 
2345 #define DO_MLA(A, N, M)  (A + N * M)
2346 #define DO_MLS(A, N, M)  (A - N * M)
2347 
2348 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2349 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2350 
2351 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2352 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2353 
2354 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2355 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2356 
2357 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2358 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2359 
2360 #undef DO_MLA
2361 #undef DO_MLS
2362 #undef DO_ZPZZZ
2363 #undef DO_ZPZZZ_D
2364 
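/* INDEX: fill each element of Zd with start + element_number * incr.  */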
2365 void HELPER(sve_index_b)(void *vd, uint32_t start,
2366                          uint32_t incr, uint32_t desc)
2367 {
2368     intptr_t i, opr_sz = simd_oprsz(desc);
2369     uint8_t *d = vd;
2370     for (i = 0; i < opr_sz; i += 1) {
2371         d[H1(i)] = start + i * incr;
2372     }
2373 }
2374 
2375 void HELPER(sve_index_h)(void *vd, uint32_t start,
2376                          uint32_t incr, uint32_t desc)
2377 {
2378     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2379     uint16_t *d = vd;
2380     for (i = 0; i < opr_sz; i += 1) {
2381         d[H2(i)] = start + i * incr;
2382     }
2383 }
2384 
2385 void HELPER(sve_index_s)(void *vd, uint32_t start,
2386                          uint32_t incr, uint32_t desc)
2387 {
2388     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2389     uint32_t *d = vd;
2390     for (i = 0; i < opr_sz; i += 1) {
2391         d[H4(i)] = start + i * incr;
2392     }
2393 }
2394 
2395 void HELPER(sve_index_d)(void *vd, uint64_t start,
2396                          uint64_t incr, uint32_t desc)
2397 {
2398     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2399     uint64_t *d = vd;
2400     for (i = 0; i < opr_sz; i += 1) {
2401         d[i] = start + i * incr;
2402     }
2403 }
2404 
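/* ADR: compute base plus an offset scaled by the shift in the descriptor;
 * the _s32/_u32 forms sign- or zero-extend 32-bit offsets held in 64-bit
 * elements.
 */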
2405 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2406 {
2407     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2408     uint32_t sh = simd_data(desc);
2409     uint32_t *d = vd, *n = vn, *m = vm;
2410     for (i = 0; i < opr_sz; i += 1) {
2411         d[i] = n[i] + (m[i] << sh);
2412     }
2413 }
2414 
2415 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2416 {
2417     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2418     uint64_t sh = simd_data(desc);
2419     uint64_t *d = vd, *n = vn, *m = vm;
2420     for (i = 0; i < opr_sz; i += 1) {
2421         d[i] = n[i] + (m[i] << sh);
2422     }
2423 }
2424 
2425 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2426 {
2427     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2428     uint64_t sh = simd_data(desc);
2429     uint64_t *d = vd, *n = vn, *m = vm;
2430     for (i = 0; i < opr_sz; i += 1) {
2431         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2432     }
2433 }
2434 
2435 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2436 {
2437     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2438     uint64_t sh = simd_data(desc);
2439     uint64_t *d = vd, *n = vn, *m = vm;
2440     for (i = 0; i < opr_sz; i += 1) {
2441         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2442     }
2443 }
2444 
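/* FEXPA: build a floating-point power of two by looking up the fraction
 * in a table indexed by the low bits of the input and taking the exponent
 * directly from the next bits.
 */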
2445 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2446 {
2447     /* These constants are copied directly from the ARM pseudocode.  */
2448     static const uint16_t coeff[] = {
2449         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2450         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2451         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2452         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2453     };
2454     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2455     uint16_t *d = vd, *n = vn;
2456 
2457     for (i = 0; i < opr_sz; i++) {
2458         uint16_t nn = n[i];
2459         intptr_t idx = extract32(nn, 0, 5);
2460         uint16_t exp = extract32(nn, 5, 5);
2461         d[i] = coeff[idx] | (exp << 10);
2462     }
2463 }
2464 
2465 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2466 {
2467     /* These constants are copied directly from the ARM pseudocode.  */
2468     static const uint32_t coeff[] = {
2469         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2470         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2471         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2472         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2473         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2474         0x1ef532, 0x20b051, 0x227043, 0x243516,
2475         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2476         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2477         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2478         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2479         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2480         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2481         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2482         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2483         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2484         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2485     };
2486     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2487     uint32_t *d = vd, *n = vn;
2488 
2489     for (i = 0; i < opr_sz; i++) {
2490         uint32_t nn = n[i];
2491         intptr_t idx = extract32(nn, 0, 6);
2492         uint32_t exp = extract32(nn, 6, 8);
2493         d[i] = coeff[idx] | (exp << 23);
2494     }
2495 }
2496 
2497 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2498 {
2499     /* These constants are copied directly from the ARM pseudocode.  */
2500     static const uint64_t coeff[] = {
2501         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2502         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2503         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2504         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2505         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2506         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2507         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2508         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2509         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2510         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2511         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2512         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2513         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2514         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2515         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2516         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2517         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2518         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2519         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2520         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2521         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2522         0xFA7C1819E90D8ull,
2523     };
2524     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2525     uint64_t *d = vd, *n = vn;
2526 
2527     for (i = 0; i < opr_sz; i++) {
2528         uint64_t nn = n[i];
2529         intptr_t idx = extract32(nn, 0, 6);
2530         uint64_t exp = extract32(nn, 6, 11);
2531         d[i] = coeff[idx] | (exp << 52);
2532     }
2533 }
2534 
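/* FTSSEL: select either the input or 1.0 according to bit 0 of Zm, and
 * flip the sign according to bit 1, as the first step of the SVE
 * trigonometric series.
 */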
2535 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2536 {
2537     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2538     uint16_t *d = vd, *n = vn, *m = vm;
2539     for (i = 0; i < opr_sz; i += 1) {
2540         uint16_t nn = n[i];
2541         uint16_t mm = m[i];
2542         if (mm & 1) {
2543             nn = float16_one;
2544         }
2545         d[i] = nn ^ (mm & 2) << 14;
2546     }
2547 }
2548 
2549 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2550 {
2551     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2552     uint32_t *d = vd, *n = vn, *m = vm;
2553     for (i = 0; i < opr_sz; i += 1) {
2554         uint32_t nn = n[i];
2555         uint32_t mm = m[i];
2556         if (mm & 1) {
2557             nn = float32_one;
2558         }
2559         d[i] = nn ^ (mm & 2) << 30;
2560     }
2561 }
2562 
2563 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2564 {
2565     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2566     uint64_t *d = vd, *n = vn, *m = vm;
2567     for (i = 0; i < opr_sz; i += 1) {
2568         uint64_t nn = n[i];
2569         uint64_t mm = m[i];
2570         if (mm & 1) {
2571             nn = float64_one;
2572         }
2573         d[i] = nn ^ (mm & 2) << 62;
2574     }
2575 }
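
/*
 * For illustration: in the FTSSEL helpers above, bit 0 of the control
 * element selects 1.0 in place of the input and bit 1 toggles the sign
 * bit of the result.  E.g. for the _h form, mm == 3 yields 0xbc00 (-1.0)
 * regardless of nn, while mm == 2 yields nn with its sign bit flipped.
 */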
2576 
2577 /*
2578  * Signed saturating addition with scalar operand.
2579  */
2580 
2581 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2582 {
2583     intptr_t i, oprsz = simd_oprsz(desc);
2584 
2585     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2586         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2587     }
2588 }
2589 
2590 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2591 {
2592     intptr_t i, oprsz = simd_oprsz(desc);
2593 
2594     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2595         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2596     }
2597 }
2598 
2599 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2600 {
2601     intptr_t i, oprsz = simd_oprsz(desc);
2602 
2603     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2604         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2605     }
2606 }
2607 
2608 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2609 {
2610     intptr_t i, oprsz = simd_oprsz(desc);
2611 
2612     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2613         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2614     }
2615 }
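
/*
 * A worked example, assuming the DO_SQADD_* helpers defined earlier in
 * this file clamp to the destination type's range: for the _b form with
 * b == 100, an element holding 100 produces 127 (INT8_MAX) rather than
 * wrapping to -56.
 */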
2616 
2617 /*
2618  * Unsigned saturating addition with scalar operand.
2619  */
2620 
2621 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2622 {
2623     intptr_t i, oprsz = simd_oprsz(desc);
2624 
2625     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2626         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2627     }
2628 }
2629 
2630 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2631 {
2632     intptr_t i, oprsz = simd_oprsz(desc);
2633 
2634     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2635         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2636     }
2637 }
2638 
2639 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2640 {
2641     intptr_t i, oprsz = simd_oprsz(desc);
2642 
2643     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2644         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2645     }
2646 }
2647 
2648 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2649 {
2650     intptr_t i, oprsz = simd_oprsz(desc);
2651 
2652     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2653         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2654     }
2655 }
2656 
2657 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2658 {
2659     intptr_t i, oprsz = simd_oprsz(desc);
2660 
2661     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2662         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2663     }
2664 }
2665 
2666 /* Two-operand predicated copy immediate with merge.  All valid immediates
2667  * can fit within 17 signed bits in the simd_data field.
2668  */
2669 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2670                          uint64_t mm, uint32_t desc)
2671 {
2672     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2673     uint64_t *d = vd, *n = vn;
2674     uint8_t *pg = vg;
2675 
2676     mm = dup_const(MO_8, mm);
2677     for (i = 0; i < opr_sz; i += 1) {
2678         uint64_t nn = n[i];
2679         uint64_t pp = expand_pred_b(pg[H1(i)]);
2680         d[i] = (mm & pp) | (nn & ~pp);
2681     }
2682 }
2683 
2684 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2685                          uint64_t mm, uint32_t desc)
2686 {
2687     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2688     uint64_t *d = vd, *n = vn;
2689     uint8_t *pg = vg;
2690 
2691     mm = dup_const(MO_16, mm);
2692     for (i = 0; i < opr_sz; i += 1) {
2693         uint64_t nn = n[i];
2694         uint64_t pp = expand_pred_h(pg[H1(i)]);
2695         d[i] = (mm & pp) | (nn & ~pp);
2696     }
2697 }
2698 
2699 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2700                          uint64_t mm, uint32_t desc)
2701 {
2702     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2703     uint64_t *d = vd, *n = vn;
2704     uint8_t *pg = vg;
2705 
2706     mm = dup_const(MO_32, mm);
2707     for (i = 0; i < opr_sz; i += 1) {
2708         uint64_t nn = n[i];
2709         uint64_t pp = expand_pred_s(pg[H1(i)]);
2710         d[i] = (mm & pp) | (nn & ~pp);
2711     }
2712 }
2713 
2714 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2715                          uint64_t mm, uint32_t desc)
2716 {
2717     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2718     uint64_t *d = vd, *n = vn;
2719     uint8_t *pg = vg;
2720 
2721     for (i = 0; i < opr_sz; i += 1) {
2722         uint64_t nn = n[i];
2723         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2724     }
2725 }
2726 
2727 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2728 {
2729     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2730     uint64_t *d = vd;
2731     uint8_t *pg = vg;
2732 
2733     val = dup_const(MO_8, val);
2734     for (i = 0; i < opr_sz; i += 1) {
2735         d[i] = val & expand_pred_b(pg[H1(i)]);
2736     }
2737 }
2738 
2739 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2740 {
2741     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2742     uint64_t *d = vd;
2743     uint8_t *pg = vg;
2744 
2745     val = dup_const(MO_16, val);
2746     for (i = 0; i < opr_sz; i += 1) {
2747         d[i] = val & expand_pred_h(pg[H1(i)]);
2748     }
2749 }
2750 
2751 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2752 {
2753     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2754     uint64_t *d = vd;
2755     uint8_t *pg = vg;
2756 
2757     val = dup_const(MO_32, val);
2758     for (i = 0; i < opr_sz; i += 1) {
2759         d[i] = val & expand_pred_s(pg[H1(i)]);
2760     }
2761 }
2762 
2763 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2764 {
2765     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2766     uint64_t *d = vd;
2767     uint8_t *pg = vg;
2768 
2769     for (i = 0; i < opr_sz; i += 1) {
2770         d[i] = (pg[H1(i)] & 1 ? val : 0);
2771     }
2772 }
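
/*
 * For illustration: the CPY helpers above work on whole 64-bit lanes by
 * expanding one predicate byte into a lane-wide mask; expand_pred_b()
 * turns each predicate bit into a full byte of ones, so e.g. pg == 0x05
 * expands to 0x0000000000ff00ff and only bytes 0 and 2 of the lane take
 * the immediate (the _h and _s variants use only every second and every
 * fourth predicate bit, respectively).
 */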
2773 
2774 /* Big-endian hosts need to frob the byte indices.  If the copy
2775  * happens to be 8-byte aligned, then no frobbing is necessary.
2776  */
2777 static void swap_memmove(void *vd, void *vs, size_t n)
2778 {
2779     uintptr_t d = (uintptr_t)vd;
2780     uintptr_t s = (uintptr_t)vs;
2781     uintptr_t o = (d | s | n) & 7;
2782     size_t i;
2783 
2784 #if !HOST_BIG_ENDIAN
2785     o = 0;
2786 #endif
2787     switch (o) {
2788     case 0:
2789         memmove(vd, vs, n);
2790         break;
2791 
2792     case 4:
2793         if (d < s || d >= s + n) {
2794             for (i = 0; i < n; i += 4) {
2795                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2796             }
2797         } else {
2798             for (i = n; i > 0; ) {
2799                 i -= 4;
2800                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2801             }
2802         }
2803         break;
2804 
2805     case 2:
2806     case 6:
2807         if (d < s || d >= s + n) {
2808             for (i = 0; i < n; i += 2) {
2809                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2810             }
2811         } else {
2812             for (i = n; i > 0; ) {
2813                 i -= 2;
2814                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2815             }
2816         }
2817         break;
2818 
2819     default:
2820         if (d < s || d >= s + n) {
2821             for (i = 0; i < n; i++) {
2822                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2823             }
2824         } else {
2825             for (i = n; i > 0; ) {
2826                 i -= 1;
2827                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2828             }
2829         }
2830         break;
2831     }
2832 }
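
/*
 * For reference: on big-endian hosts the H macros defined elsewhere adjust
 * the byte index within a 64-bit lane (H1 xors with 7, H1_2 with 6, H1_4
 * with 4), so a copy that is only 4-byte aligned moves 32-bit units through
 * H1_4, a 2-byte-aligned copy moves 16-bit units through H1_2, and anything
 * else falls back to bytes through H1.
 */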
2833 
2834 /* Similarly for memset of 0.  */
2835 static void swap_memzero(void *vd, size_t n)
2836 {
2837     uintptr_t d = (uintptr_t)vd;
2838     uintptr_t o = (d | n) & 7;
2839     size_t i;
2840 
2841     /* Usually, the first bit of a predicate is set, so N is 0.  */
2842     if (likely(n == 0)) {
2843         return;
2844     }
2845 
2846 #if !HOST_BIG_ENDIAN
2847     o = 0;
2848 #endif
2849     switch (o) {
2850     case 0:
2851         memset(vd, 0, n);
2852         break;
2853 
2854     case 4:
2855         for (i = 0; i < n; i += 4) {
2856             *(uint32_t *)H1_4(d + i) = 0;
2857         }
2858         break;
2859 
2860     case 2:
2861     case 6:
2862         for (i = 0; i < n; i += 2) {
2863             *(uint16_t *)H1_2(d + i) = 0;
2864         }
2865         break;
2866 
2867     default:
2868         for (i = 0; i < n; i++) {
2869             *(uint8_t *)H1(d + i) = 0;
2870         }
2871         break;
2872     }
2873 }
2874 
2875 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2876 {
2877     intptr_t opr_sz = simd_oprsz(desc);
2878     size_t n_ofs = simd_data(desc);
2879     size_t n_siz = opr_sz - n_ofs;
2880 
2881     if (vd != vm) {
2882         swap_memmove(vd, vn + n_ofs, n_siz);
2883         swap_memmove(vd + n_siz, vm, n_ofs);
2884     } else if (vd != vn) {
2885         swap_memmove(vd + n_siz, vd, n_ofs);
2886         swap_memmove(vd, vn + n_ofs, n_siz);
2887     } else {
2888         /* vd == vn == vm.  Need temp space.  */
2889         ARMVectorReg tmp;
2890         swap_memmove(&tmp, vm, n_ofs);
2891         swap_memmove(vd, vd + n_ofs, n_siz);
2892         memcpy(vd + n_siz, &tmp, n_ofs);
2893     }
2894 }
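
/*
 * For illustration: EXT concatenates Zn:Zm and extracts a full vector
 * starting at byte n_ofs of Zn, so with opr_sz == 16 and n_ofs == 3 the
 * result is bytes 3..15 of Zn followed by bytes 0..2 of Zm.  The three
 * cases above differ only in how they cope with register overlap.
 */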
2895 
2896 #define DO_INSR(NAME, TYPE, H) \
2897 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2898 {                                                                  \
2899     intptr_t opr_sz = simd_oprsz(desc);                            \
2900     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2901     *(TYPE *)(vd + H(0)) = val;                                    \
2902 }
2903 
2904 DO_INSR(sve_insr_b, uint8_t, H1)
2905 DO_INSR(sve_insr_h, uint16_t, H1_2)
2906 DO_INSR(sve_insr_s, uint32_t, H1_4)
2907 DO_INSR(sve_insr_d, uint64_t, H1_8)
2908 
2909 #undef DO_INSR
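
/*
 * For illustration: INSR shifts every element up one position and writes
 * the scalar into element 0, so for the _s form {a, b, c, d} with value v
 * becomes {v, a, b, c}.
 */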
2910 
2911 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2912 {
2913     intptr_t i, j, opr_sz = simd_oprsz(desc);
2914     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2915         uint64_t f = *(uint64_t *)(vn + i);
2916         uint64_t b = *(uint64_t *)(vn + j);
2917         *(uint64_t *)(vd + i) = bswap64(b);
2918         *(uint64_t *)(vd + j) = bswap64(f);
2919     }
2920 }
2921 
2922 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2923 {
2924     intptr_t i, j, opr_sz = simd_oprsz(desc);
2925     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2926         uint64_t f = *(uint64_t *)(vn + i);
2927         uint64_t b = *(uint64_t *)(vn + j);
2928         *(uint64_t *)(vd + i) = hswap64(b);
2929         *(uint64_t *)(vd + j) = hswap64(f);
2930     }
2931 }
2932 
2933 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2934 {
2935     intptr_t i, j, opr_sz = simd_oprsz(desc);
2936     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2937         uint64_t f = *(uint64_t *)(vn + i);
2938         uint64_t b = *(uint64_t *)(vn + j);
2939         *(uint64_t *)(vd + i) = rol64(b, 32);
2940         *(uint64_t *)(vd + j) = rol64(f, 32);
2941     }
2942 }
2943 
2944 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2945 {
2946     intptr_t i, j, opr_sz = simd_oprsz(desc);
2947     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2948         uint64_t f = *(uint64_t *)(vn + i);
2949         uint64_t b = *(uint64_t *)(vn + j);
2950         *(uint64_t *)(vd + i) = b;
2951         *(uint64_t *)(vd + j) = f;
2952     }
2953 }
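
/*
 * For illustration: the REV helpers above reverse the element order by
 * exchanging 64-bit chunks end for end and reversing the elements inside
 * each chunk (bswap64 for bytes, hswap64 for halfwords, a 32-bit rotate
 * for words, and nothing for doublewords); e.g. sve_rev_s on a 16-byte
 * vector {w0, w1, w2, w3} yields {w3, w2, w1, w0}.
 */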
2954 
2955 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2956 
2957 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2958                            bool is_tbx, tb_impl_fn *fn)
2959 {
2960     ARMVectorReg scratch;
2961     uintptr_t oprsz = simd_oprsz(desc);
2962 
2963     if (unlikely(vd == vn)) {
2964         vn = memcpy(&scratch, vn, oprsz);
2965     }
2966 
2967     fn(vd, vn, NULL, vm, oprsz, is_tbx);
2968 }
2969 
2970 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2971                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2972 {
2973     ARMVectorReg scratch;
2974     uintptr_t oprsz = simd_oprsz(desc);
2975 
2976     if (unlikely(vd == vn0)) {
2977         vn0 = memcpy(&scratch, vn0, oprsz);
2978         if (vd == vn1) {
2979             vn1 = vn0;
2980         }
2981     } else if (unlikely(vd == vn1)) {
2982         vn1 = memcpy(&scratch, vn1, oprsz);
2983     }
2984 
2985     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2986 }
2987 
2988 #define DO_TB(SUFF, TYPE, H)                                            \
2989 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
2990                                 void *vm, uintptr_t oprsz, bool is_tbx) \
2991 {                                                                       \
2992     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
2993     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
2994     for (i = 0; i < nelem; ++i) {                                       \
2995         TYPE index = indexes[H(i)], val = 0;                            \
2996         if (index < nelem) {                                            \
2997             val = tbl0[H(index)];                                       \
2998         } else {                                                        \
2999             index -= nelem;                                             \
3000             if (tbl1 && index < nelem) {                                \
3001                 val = tbl1[H(index)];                                   \
3002             } else if (is_tbx) {                                        \
3003                 continue;                                               \
3004             }                                                           \
3005         }                                                               \
3006         d[H(i)] = val;                                                  \
3007     }                                                                   \
3008 }                                                                       \
3009 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3010 {                                                                       \
3011     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3012 }                                                                       \
3013 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3014                              void *vm, uint32_t desc)                   \
3015 {                                                                       \
3016     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3017 }                                                                       \
3018 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3019 {                                                                       \
3020     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3021 }
3022 
3023 DO_TB(b, uint8_t, H1)
3024 DO_TB(h, uint16_t, H2)
3025 DO_TB(s, uint32_t, H4)
3026 DO_TB(d, uint64_t, H8)
3027 
3028 #undef DO_TB
3029 
3030 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3031 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3032 {                                                              \
3033     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3034     TYPED *d = vd;                                             \
3035     TYPES *n = vn;                                             \
3036     ARMVectorReg tmp;                                          \
3037     if (unlikely(vn - vd < opr_sz)) {                          \
3038         n = memcpy(&tmp, n, opr_sz / 2);                       \
3039     }                                                          \
3040     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3041         d[HD(i)] = n[HS(i)];                                   \
3042     }                                                          \
3043 }
3044 
3045 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3046 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3047 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3048 
3049 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3050 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3051 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3052 
3053 #undef DO_UNPK
3054 
3055 /* Mask of bits included in the even-numbered predicates of width esz.
3056  * We also use this for expand_bits/compress_bits, and so extend the
3057  * same pattern out to 16-bit units.
3058  */
3059 static const uint64_t even_bit_esz_masks[5] = {
3060     0x5555555555555555ull,
3061     0x3333333333333333ull,
3062     0x0f0f0f0f0f0f0f0full,
3063     0x00ff00ff00ff00ffull,
3064     0x0000ffff0000ffffull,
3065 };
3066 
3067 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3068  * For N==0, this corresponds to the operation that in qemu/bitops.h
3069  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3070  * section 7-2 Shuffling Bits.
3071  */
3072 static uint64_t expand_bits(uint64_t x, int n)
3073 {
3074     int i;
3075 
3076     x &= 0xffffffffu;
3077     for (i = 4; i >= n; i--) {
3078         int sh = 1 << i;
3079         x = ((x << sh) | x) & even_bit_esz_masks[i];
3080     }
3081     return x;
3082 }
3083 
3084 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3085  * For N==0, this corresponds to the operation that in qemu/bitops.h
3086  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3087  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3088  */
3089 static uint64_t compress_bits(uint64_t x, int n)
3090 {
3091     int i;
3092 
3093     for (i = n; i <= 4; i++) {
3094         int sh = 1 << i;
3095         x &= even_bit_esz_masks[i];
3096         x = (x >> sh) | x;
3097     }
3098     return x & 0xffffffffu;
3099 }
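
/*
 * A worked example for N == 0: expand_bits(0xb, 0) spreads the four low
 * bits to the even bit positions, giving 0x45 (0b1011 -> 0b01000101),
 * and compress_bits(0x45, 0) recovers 0xb.  Larger N do the same thing
 * on 2**N-bit units.
 */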
3100 
3101 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3102 {
3103     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3104     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3105     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3106     int esize = 1 << esz;
3107     uint64_t *d = vd;
3108     intptr_t i;
3109 
3110     if (oprsz <= 8) {
3111         uint64_t nn = *(uint64_t *)vn;
3112         uint64_t mm = *(uint64_t *)vm;
3113         int half = 4 * oprsz;
3114 
3115         nn = extract64(nn, high * half, half);
3116         mm = extract64(mm, high * half, half);
3117         nn = expand_bits(nn, esz);
3118         mm = expand_bits(mm, esz);
3119         d[0] = nn | (mm << esize);
3120     } else {
3121         ARMPredicateReg tmp;
3122 
3123         /* We produce output faster than we consume input.
3124            Therefore we must be mindful of possible overlap.  */
3125         if (vd == vn) {
3126             vn = memcpy(&tmp, vn, oprsz);
3127             if (vd == vm) {
3128                 vm = vn;
3129             }
3130         } else if (vd == vm) {
3131             vm = memcpy(&tmp, vm, oprsz);
3132         }
3133         if (high) {
3134             high = oprsz >> 1;
3135         }
3136 
3137         if ((oprsz & 7) == 0) {
3138             uint32_t *n = vn, *m = vm;
3139             high >>= 2;
3140 
3141             for (i = 0; i < oprsz / 8; i++) {
3142                 uint64_t nn = n[H4(high + i)];
3143                 uint64_t mm = m[H4(high + i)];
3144 
3145                 nn = expand_bits(nn, esz);
3146                 mm = expand_bits(mm, esz);
3147                 d[i] = nn | (mm << esize);
3148             }
3149         } else {
3150             uint8_t *n = vn, *m = vm;
3151             uint16_t *d16 = vd;
3152 
3153             for (i = 0; i < oprsz / 2; i++) {
3154                 uint16_t nn = n[H1(high + i)];
3155                 uint16_t mm = m[H1(high + i)];
3156 
3157                 nn = expand_bits(nn, esz);
3158                 mm = expand_bits(mm, esz);
3159                 d16[H2(i)] = nn | (mm << esize);
3160             }
3161         }
3162     }
3163 }
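
/*
 * For illustration: with high == 0 and esz == 0, bit 2k of the result is
 * Pn<k> and bit 2k+1 is Pm<k>; larger element sizes interleave groups of
 * 1 << esz predicate bits instead.
 */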
3164 
3165 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3166 {
3167     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3168     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3169     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3170     uint64_t *d = vd, *n = vn, *m = vm;
3171     uint64_t l, h;
3172     intptr_t i;
3173 
3174     if (oprsz <= 8) {
3175         l = compress_bits(n[0] >> odd, esz);
3176         h = compress_bits(m[0] >> odd, esz);
3177         d[0] = l | (h << (4 * oprsz));
3178     } else {
3179         ARMPredicateReg tmp_m;
3180         intptr_t oprsz_16 = oprsz / 16;
3181 
3182         if ((vm - vd) < (uintptr_t)oprsz) {
3183             m = memcpy(&tmp_m, vm, oprsz);
3184         }
3185 
3186         for (i = 0; i < oprsz_16; i++) {
3187             l = n[2 * i + 0];
3188             h = n[2 * i + 1];
3189             l = compress_bits(l >> odd, esz);
3190             h = compress_bits(h >> odd, esz);
3191             d[i] = l | (h << 32);
3192         }
3193 
3194         /*
3195          * For VL which is not a multiple of 512, the results from M do not
3196          * align nicely with the uint64_t for D.  Put the aligned results
3197          * from M into TMP_M and then copy it into place afterward.
3198          */
3199         if (oprsz & 15) {
3200             int final_shift = (oprsz & 15) * 2;
3201 
3202             l = n[2 * i + 0];
3203             h = n[2 * i + 1];
3204             l = compress_bits(l >> odd, esz);
3205             h = compress_bits(h >> odd, esz);
3206             d[i] = l | (h << final_shift);
3207 
3208             for (i = 0; i < oprsz_16; i++) {
3209                 l = m[2 * i + 0];
3210                 h = m[2 * i + 1];
3211                 l = compress_bits(l >> odd, esz);
3212                 h = compress_bits(h >> odd, esz);
3213                 tmp_m.p[i] = l | (h << 32);
3214             }
3215             l = m[2 * i + 0];
3216             h = m[2 * i + 1];
3217             l = compress_bits(l >> odd, esz);
3218             h = compress_bits(h >> odd, esz);
3219             tmp_m.p[i] = l | (h << final_shift);
3220 
3221             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3222         } else {
3223             for (i = 0; i < oprsz_16; i++) {
3224                 l = m[2 * i + 0];
3225                 h = m[2 * i + 1];
3226                 l = compress_bits(l >> odd, esz);
3227                 h = compress_bits(h >> odd, esz);
3228                 d[oprsz_16 + i] = l | (h << 32);
3229             }
3230         }
3231     }
3232 }
3233 
3234 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3235 {
3236     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3237     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3238     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3239     uint64_t *d = vd, *n = vn, *m = vm;
3240     uint64_t mask;
3241     int shr, shl;
3242     intptr_t i;
3243 
3244     shl = 1 << esz;
3245     shr = 0;
3246     mask = even_bit_esz_masks[esz];
3247     if (odd) {
3248         mask <<= shl;
3249         shr = shl;
3250         shl = 0;
3251     }
3252 
3253     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3254         uint64_t nn = (n[i] & mask) >> shr;
3255         uint64_t mm = (m[i] & mask) << shl;
3256         d[i] = nn + mm;
3257     }
3258 }
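
/*
 * For illustration: with esz == 0 and odd == 0 this keeps the even
 * predicate bits of Pn in place and moves the even bits of Pm up by one,
 * i.e. d = (n & 0x5555...) | ((m & 0x5555...) << 1); odd == 1 does the
 * same starting from the odd bits.
 */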
3259 
3260 /* Reverse units of 2**N bits.  */
3261 static uint64_t reverse_bits_64(uint64_t x, int n)
3262 {
3263     int i, sh;
3264 
3265     x = bswap64(x);
3266     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3267         uint64_t mask = even_bit_esz_masks[i];
3268         x = ((x & mask) << sh) | ((x >> sh) & mask);
3269     }
3270     return x;
3271 }
3272 
3273 static uint8_t reverse_bits_8(uint8_t x, int n)
3274 {
3275     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3276     int i, sh;
3277 
3278     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3279         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3280     }
3281     return x;
3282 }
3283 
3284 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3285 {
3286     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3287     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3288     intptr_t i, oprsz_2 = oprsz / 2;
3289 
3290     if (oprsz <= 8) {
3291         uint64_t l = *(uint64_t *)vn;
3292         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3293         *(uint64_t *)vd = l;
3294     } else if ((oprsz & 15) == 0) {
3295         for (i = 0; i < oprsz_2; i += 8) {
3296             intptr_t ih = oprsz - 8 - i;
3297             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3298             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3299             *(uint64_t *)(vd + i) = h;
3300             *(uint64_t *)(vd + ih) = l;
3301         }
3302     } else {
3303         for (i = 0; i < oprsz_2; i += 1) {
3304             intptr_t il = H1(i);
3305             intptr_t ih = H1(oprsz - 1 - i);
3306             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3307             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3308             *(uint8_t *)(vd + il) = h;
3309             *(uint8_t *)(vd + ih) = l;
3310         }
3311     }
3312 }
3313 
3314 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3315 {
3316     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3317     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3318     uint64_t *d = vd;
3319     intptr_t i;
3320 
3321     if (oprsz <= 8) {
3322         uint64_t nn = *(uint64_t *)vn;
3323         int half = 4 * oprsz;
3324 
3325         nn = extract64(nn, high * half, half);
3326         nn = expand_bits(nn, 0);
3327         d[0] = nn;
3328     } else {
3329         ARMPredicateReg tmp_n;
3330 
3331         /* We produce output faster than we consume input.
3332            Therefore we must be mindful of possible overlap.  */
3333         if ((vn - vd) < (uintptr_t)oprsz) {
3334             vn = memcpy(&tmp_n, vn, oprsz);
3335         }
3336         if (high) {
3337             high = oprsz >> 1;
3338         }
3339 
3340         if ((oprsz & 7) == 0) {
3341             uint32_t *n = vn;
3342             high >>= 2;
3343 
3344             for (i = 0; i < oprsz / 8; i++) {
3345                 uint64_t nn = n[H4(high + i)];
3346                 d[i] = expand_bits(nn, 0);
3347             }
3348         } else {
3349             uint16_t *d16 = vd;
3350             uint8_t *n = vn;
3351 
3352             for (i = 0; i < oprsz / 2; i++) {
3353                 uint16_t nn = n[H1(high + i)];
3354                 d16[H2(i)] = expand_bits(nn, 0);
3355             }
3356         }
3357     }
3358 }
3359 
3360 #define DO_ZIP(NAME, TYPE, H) \
3361 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3362 {                                                                    \
3363     intptr_t oprsz = simd_oprsz(desc);                               \
3364     intptr_t odd_ofs = simd_data(desc);                              \
3365     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3366     ARMVectorReg tmp_n, tmp_m;                                       \
3367     /* We produce output faster than we consume input.               \
3368        Therefore we must be mindful of possible overlap.  */         \
3369     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3370         vn = memcpy(&tmp_n, vn, oprsz);                              \
3371     }                                                                \
3372     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3373         vm = memcpy(&tmp_m, vm, oprsz);                              \
3374     }                                                                \
3375     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3376         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3377         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3378             *(TYPE *)(vm + odd_ofs + H(i));                          \
3379     }                                                                \
3380     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3381         memset(vd + oprsz - 16, 0, 16);                              \
3382     }                                                                \
3383 }
3384 
3385 DO_ZIP(sve_zip_b, uint8_t, H1)
3386 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3387 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3388 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3389 DO_ZIP(sve2_zip_q, Int128, )
3390 
3391 #define DO_UZP(NAME, TYPE, H) \
3392 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3393 {                                                                      \
3394     intptr_t oprsz = simd_oprsz(desc);                                 \
3395     intptr_t odd_ofs = simd_data(desc);                                \
3396     intptr_t i, p;                                                     \
3397     ARMVectorReg tmp_m;                                                \
3398     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3399         vm = memcpy(&tmp_m, vm, oprsz);                                \
3400     }                                                                  \
3401     i = 0, p = odd_ofs;                                                \
3402     do {                                                               \
3403         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3404         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3405     } while (p < oprsz);                                               \
3406     p -= oprsz;                                                        \
3407     do {                                                               \
3408         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3409         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3410     } while (p < oprsz);                                               \
3411     tcg_debug_assert(i == oprsz);                                      \
3412 }
3413 
3414 DO_UZP(sve_uzp_b, uint8_t, H1)
3415 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3416 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3417 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3418 DO_UZP(sve2_uzp_q, Int128, )
3419 
3420 #define DO_TRN(NAME, TYPE, H) \
3421 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3422 {                                                                      \
3423     intptr_t oprsz = simd_oprsz(desc);                                 \
3424     intptr_t odd_ofs = simd_data(desc);                                \
3425     intptr_t i;                                                        \
3426     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3427         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3428         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3429         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3430         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3431     }                                                                  \
3432     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3433         memset(vd + oprsz - 16, 0, 16);                                \
3434     }                                                                  \
3435 }
3436 
3437 DO_TRN(sve_trn_b, uint8_t, H1)
3438 DO_TRN(sve_trn_h, uint16_t, H1_2)
3439 DO_TRN(sve_trn_s, uint32_t, H1_4)
3440 DO_TRN(sve_trn_d, uint64_t, H1_8)
3441 DO_TRN(sve2_trn_q, Int128, )
3442 
3443 #undef DO_ZIP
3444 #undef DO_UZP
3445 #undef DO_TRN
3446 
3447 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3448 {
3449     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3450     uint32_t *d = vd, *n = vn;
3451     uint8_t *pg = vg;
3452 
3453     for (i = j = 0; i < opr_sz; i++) {
3454         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3455             d[H4(j)] = n[H4(i)];
3456             j++;
3457         }
3458     }
3459     for (; j < opr_sz; j++) {
3460         d[H4(j)] = 0;
3461     }
3462 }
3463 
3464 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3465 {
3466     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3467     uint64_t *d = vd, *n = vn;
3468     uint8_t *pg = vg;
3469 
3470     for (i = j = 0; i < opr_sz; i++) {
3471         if (pg[H1(i)] & 1) {
3472             d[j] = n[i];
3473             j++;
3474         }
3475     }
3476     for (; j < opr_sz; j++) {
3477         d[j] = 0;
3478     }
3479 }
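
/*
 * For illustration: COMPACT packs the active elements toward element 0 in
 * order and zero-fills the rest, so with Pg selecting elements 1 and 3 of
 * {a, b, c, d} the result is {b, d, 0, 0}.
 */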
3480 
3481 /* Similar to the ARM LastActiveElement pseudocode function, except the
3482  * result is multiplied by the element size.  This includes the not-found
3483  * indication; e.g. not found for esz=3 is -8.
3484  */
3485 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3486 {
3487     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3488     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3489 
3490     return last_active_element(vg, words, esz);
3491 }
3492 
3493 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3494 {
3495     intptr_t opr_sz = simd_oprsz(desc) / 8;
3496     int esz = simd_data(desc);
3497     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3498     intptr_t i, first_i, last_i;
3499     ARMVectorReg tmp;
3500 
3501     first_i = last_i = 0;
3502     first_g = last_g = 0;
3503 
3504     /* Find the extent of the active elements within VG.  */
3505     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3506         pg = *(uint64_t *)(vg + i) & mask;
3507         if (pg) {
3508             if (last_g == 0) {
3509                 last_g = pg;
3510                 last_i = i;
3511             }
3512             first_g = pg;
3513             first_i = i;
3514         }
3515     }
3516 
3517     len = 0;
3518     if (first_g != 0) {
3519         first_i = first_i * 8 + ctz64(first_g);
3520         last_i = last_i * 8 + 63 - clz64(last_g);
3521         len = last_i - first_i + (1 << esz);
3522         if (vd == vm) {
3523             vm = memcpy(&tmp, vm, opr_sz * 8);
3524         }
3525         swap_memmove(vd, vn + first_i, len);
3526     }
3527     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3528 }
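
/*
 * For illustration: SPLICE copies the contiguous byte range from the first
 * through the last active element of Zn to the bottom of Zd and fills the
 * remainder from the start of Zm; with Pg active on elements 1..2 of
 * {a, b, c, d} and Zm == {p, q, r, s}, the result is {b, c, p, q}.  With
 * no active elements the whole result comes from Zm.
 */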
3529 
3530 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3531                             void *vg, uint32_t desc)
3532 {
3533     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3534     uint64_t *d = vd, *n = vn, *m = vm;
3535     uint8_t *pg = vg;
3536 
3537     for (i = 0; i < opr_sz; i += 1) {
3538         uint64_t nn = n[i], mm = m[i];
3539         uint64_t pp = expand_pred_b(pg[H1(i)]);
3540         d[i] = (nn & pp) | (mm & ~pp);
3541     }
3542 }
3543 
3544 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3545                             void *vg, uint32_t desc)
3546 {
3547     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3548     uint64_t *d = vd, *n = vn, *m = vm;
3549     uint8_t *pg = vg;
3550 
3551     for (i = 0; i < opr_sz; i += 1) {
3552         uint64_t nn = n[i], mm = m[i];
3553         uint64_t pp = expand_pred_h(pg[H1(i)]);
3554         d[i] = (nn & pp) | (mm & ~pp);
3555     }
3556 }
3557 
3558 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3559                             void *vg, uint32_t desc)
3560 {
3561     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3562     uint64_t *d = vd, *n = vn, *m = vm;
3563     uint8_t *pg = vg;
3564 
3565     for (i = 0; i < opr_sz; i += 1) {
3566         uint64_t nn = n[i], mm = m[i];
3567         uint64_t pp = expand_pred_s(pg[H1(i)]);
3568         d[i] = (nn & pp) | (mm & ~pp);
3569     }
3570 }
3571 
3572 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3573                             void *vg, uint32_t desc)
3574 {
3575     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3576     uint64_t *d = vd, *n = vn, *m = vm;
3577     uint8_t *pg = vg;
3578 
3579     for (i = 0; i < opr_sz; i += 1) {
3580         uint64_t nn = n[i], mm = m[i];
3581         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3582     }
3583 }
3584 
3585 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3586                             void *vg, uint32_t desc)
3587 {
3588     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3589     Int128 *d = vd, *n = vn, *m = vm;
3590     uint16_t *pg = vg;
3591 
3592     for (i = 0; i < opr_sz; i += 1) {
3593         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3594     }
3595 }
3596 
3597 /* Two-operand comparison controlled by a predicate.
3598  * ??? It is very tempting to want to be able to expand this inline
3599  * with x86 instructions, e.g.
3600  *
3601  *    vcmpeqw    zm, zn, %ymm0
3602  *    vpmovmskb  %ymm0, %eax
3603  *    and        $0x5555, %eax
3604  *    and        pg, %eax
3605  *
3606  * or even aarch64, e.g.
3607  *
3608  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3609  *    cmeq       v0.8h, zn, zm
3610  *    and        v0.8h, v0.8h, mask
3611  *    addv       h0, v0.8h
3612  *    and        v0.8b, pg
3613  *
3614  * However, coming up with an abstraction that allows vector inputs and
3615  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3616  * scalar outputs, is tricky.
3617  */
3618 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3619 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3620 {                                                                            \
3621     intptr_t opr_sz = simd_oprsz(desc);                                      \
3622     uint32_t flags = PREDTEST_INIT;                                          \
3623     intptr_t i = opr_sz;                                                     \
3624     do {                                                                     \
3625         uint64_t out = 0, pg;                                                \
3626         do {                                                                 \
3627             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3628             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3629             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3630             out |= nn OP mm;                                                 \
3631         } while (i & 63);                                                    \
3632         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3633         out &= pg;                                                           \
3634         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3635         flags = iter_predtest_bwd(out, pg, flags);                           \
3636     } while (i > 0);                                                         \
3637     return flags;                                                            \
3638 }
3639 
3640 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3641     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3642 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3643     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3644 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3645     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3646 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3647     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
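
/*
 * For illustration: the loop above processes one 64-bit predicate word
 * (64 bytes of vector data) per outer iteration, walking backward; each
 * element's boolean result is shifted up so it lands at that element's
 * predicate bit, and MASK keeps only the defined bit per element, e.g.
 * 0x0101...01 for doubleword elements, where only every eighth bit of
 * the output predicate can be set.
 */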
3648 
3649 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3650 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3651 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3652 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3653 
3654 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3655 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3656 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3657 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3658 
3659 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3660 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3661 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3662 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3663 
3664 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3665 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3666 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3667 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3668 
3669 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3670 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3671 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3672 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3673 
3674 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3675 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3676 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3677 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3678 
3679 #undef DO_CMP_PPZZ_B
3680 #undef DO_CMP_PPZZ_H
3681 #undef DO_CMP_PPZZ_S
3682 #undef DO_CMP_PPZZ_D
3683 #undef DO_CMP_PPZZ
3684 
3685 /* Similar, but the second source is "wide".  */
3686 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3687 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3688 {                                                                            \
3689     intptr_t opr_sz = simd_oprsz(desc);                                      \
3690     uint32_t flags = PREDTEST_INIT;                                          \
3691     intptr_t i = opr_sz;                                                     \
3692     do {                                                                     \
3693         uint64_t out = 0, pg;                                                \
3694         do {                                                                 \
3695             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3696             do {                                                             \
3697                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3698                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3699                 out |= nn OP mm;                                             \
3700             } while (i & 7);                                                 \
3701         } while (i & 63);                                                    \
3702         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3703         out &= pg;                                                           \
3704         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3705         flags = iter_predtest_bwd(out, pg, flags);                           \
3706     } while (i > 0);                                                         \
3707     return flags;                                                            \
3708 }
3709 
3710 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3711     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3712 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3713     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3714 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3715     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3716 
3717 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3718 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3719 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3720 
3721 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3722 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3723 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3724 
3725 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3726 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3727 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3728 
3729 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3730 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3731 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3732 
3733 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3734 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3735 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3736 
3737 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3738 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3739 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3740 
3741 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3742 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3743 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3744 
3745 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3746 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3747 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3748 
3749 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3750 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3751 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3752 
3753 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3754 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3755 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3756 
3757 #undef DO_CMP_PPZW_B
3758 #undef DO_CMP_PPZW_H
3759 #undef DO_CMP_PPZW_S
3760 #undef DO_CMP_PPZW
3761 
3762 /* Similar, but the second source is an immediate.  */
3763 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3764 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3765 {                                                                    \
3766     intptr_t opr_sz = simd_oprsz(desc);                              \
3767     uint32_t flags = PREDTEST_INIT;                                  \
3768     TYPE mm = simd_data(desc);                                       \
3769     intptr_t i = opr_sz;                                             \
3770     do {                                                             \
3771         uint64_t out = 0, pg;                                        \
3772         do {                                                         \
3773             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3774             TYPE nn = *(TYPE *)(vn + H(i));                          \
3775             out |= nn OP mm;                                         \
3776         } while (i & 63);                                            \
3777         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3778         out &= pg;                                                   \
3779         *(uint64_t *)(vd + (i >> 3)) = out;                          \
3780         flags = iter_predtest_bwd(out, pg, flags);                   \
3781     } while (i > 0);                                                 \
3782     return flags;                                                    \
3783 }
3784 
3785 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3786     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3787 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3788     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3789 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3790     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3791 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3792     DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3793 
3794 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3795 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3796 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3797 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3798 
3799 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3800 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3801 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3802 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3803 
3804 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3805 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3806 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3807 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3808 
3809 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3810 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3811 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3812 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3813 
3814 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3815 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3816 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3817 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3818 
3819 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3820 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3821 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3822 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3823 
3824 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3825 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3826 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3827 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3828 
3829 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
3830 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3831 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3832 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3833 
3834 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
3835 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3836 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3837 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3838 
3839 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
3840 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3841 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3842 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3843 
3844 #undef DO_CMP_PPZI_B
3845 #undef DO_CMP_PPZI_H
3846 #undef DO_CMP_PPZI_S
3847 #undef DO_CMP_PPZI_D
3848 #undef DO_CMP_PPZI
3849 
3850 /* Similar to the ARM LastActive pseudocode function.  */
3851 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3852 {
3853     intptr_t i;
3854 
3855     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3856         uint64_t pg = *(uint64_t *)(vg + i);
3857         if (pg) {
3858             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3859         }
3860     }
3861     return 0;
3862 }
3863 
3864 /* Compute a mask into RETB that is true for all G, up to and including
3865  * (if after) or excluding (if !after) the first G & N.
3866  * Return true if a break was found.
3867  */
3868 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3869                         bool brk, bool after)
3870 {
3871     uint64_t b;
3872 
3873     if (brk) {
3874         b = 0;
3875     } else if ((g & n) == 0) {
3876         /* For all G, no N are set; break not found.  */
3877         b = g;
3878     } else {
3879         /* Break somewhere in N.  Locate it.  */
3880         b = g & n;            /* guard true, pred true */
3881         b = b & -b;           /* first such */
3882         if (after) {
3883             b = b | (b - 1);  /* break after same */
3884         } else {
3885             b = b - 1;        /* break before same */
3886         }
3887         brk = true;
3888     }
3889 
3890     *retb = b;
3891     return brk;
3892 }
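
/*
 * A worked example: with g == 0xff, n == 0x10 and no prior break, the
 * first active-and-true bit is 0x10, so "break after" returns b == 0x1f
 * (the breaking element included) and "break before" returns b == 0x0f;
 * once brk is set, all further words produce b == 0.
 */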
3893 
3894 /* Compute a zeroing BRK.  */
3895 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3896                           intptr_t oprsz, bool after)
3897 {
3898     bool brk = false;
3899     intptr_t i;
3900 
3901     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3902         uint64_t this_b, this_g = g[i];
3903 
3904         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3905         d[i] = this_b & this_g;
3906     }
3907 }
3908 
3909 /* Likewise, but also compute flags.  */
3910 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3911                                intptr_t oprsz, bool after)
3912 {
3913     uint32_t flags = PREDTEST_INIT;
3914     bool brk = false;
3915     intptr_t i;
3916 
3917     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3918         uint64_t this_b, this_d, this_g = g[i];
3919 
3920         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3921         d[i] = this_d = this_b & this_g;
3922         flags = iter_predtest_fwd(this_d, this_g, flags);
3923     }
3924     return flags;
3925 }
3926 
3927 /* Compute a merging BRK.  */
3928 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3929                           intptr_t oprsz, bool after)
3930 {
3931     bool brk = false;
3932     intptr_t i;
3933 
3934     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3935         uint64_t this_b, this_g = g[i];
3936 
3937         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3938         d[i] = (this_b & this_g) | (d[i] & ~this_g);
3939     }
3940 }
3941 
3942 /* Likewise, but also compute flags.  */
3943 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3944                                intptr_t oprsz, bool after)
3945 {
3946     uint32_t flags = PREDTEST_INIT;
3947     bool brk = false;
3948     intptr_t i;
3949 
3950     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3951         uint64_t this_b, this_d = d[i], this_g = g[i];
3952 
3953         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3954         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3955         flags = iter_predtest_fwd(this_d, this_g, flags);
3956     }
3957     return flags;
3958 }
3959 
3960 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3961 {
3962     /* It is quicker to zero the whole predicate than to loop on OPRSZ.
3963      * The compiler should turn this into 4 64-bit integer stores.
3964      */
3965     memset(d, 0, sizeof(ARMPredicateReg));
3966     return PREDTEST_INIT;
3967 }
3968 
3969 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3970                        uint32_t pred_desc)
3971 {
3972     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3973     if (last_active_pred(vn, vg, oprsz)) {
3974         compute_brk_z(vd, vm, vg, oprsz, true);
3975     } else {
3976         do_zero(vd, oprsz);
3977     }
3978 }
3979 
3980 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3981                             uint32_t pred_desc)
3982 {
3983     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3984     if (last_active_pred(vn, vg, oprsz)) {
3985         return compute_brks_z(vd, vm, vg, oprsz, true);
3986     } else {
3987         return do_zero(vd, oprsz);
3988     }
3989 }
3990 
3991 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3992                        uint32_t pred_desc)
3993 {
3994     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3995     if (last_active_pred(vn, vg, oprsz)) {
3996         compute_brk_z(vd, vm, vg, oprsz, false);
3997     } else {
3998         do_zero(vd, oprsz);
3999     }
4000 }
4001 
4002 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4003                             uint32_t pred_desc)
4004 {
4005     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4006     if (last_active_pred(vn, vg, oprsz)) {
4007         return compute_brks_z(vd, vm, vg, oprsz, false);
4008     } else {
4009         return do_zero(vd, oprsz);
4010     }
4011 }
4012 
4013 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4014 {
4015     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4016     compute_brk_z(vd, vn, vg, oprsz, true);
4017 }
4018 
4019 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4020 {
4021     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4022     return compute_brks_z(vd, vn, vg, oprsz, true);
4023 }
4024 
4025 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4026 {
4027     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4028     compute_brk_z(vd, vn, vg, oprsz, false);
4029 }
4030 
4031 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4032 {
4033     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4034     return compute_brks_z(vd, vn, vg, oprsz, false);
4035 }
4036 
4037 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4038 {
4039     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4040     compute_brk_m(vd, vn, vg, oprsz, true);
4041 }
4042 
4043 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4044 {
4045     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4046     return compute_brks_m(vd, vn, vg, oprsz, true);
4047 }
4048 
4049 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4050 {
4051     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4052     compute_brk_m(vd, vn, vg, oprsz, false);
4053 }
4054 
4055 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4056 {
4057     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4058     return compute_brks_m(vd, vn, vg, oprsz, false);
4059 }
4060 
4061 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4062 {
4063     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4064     if (!last_active_pred(vn, vg, oprsz)) {
4065         do_zero(vd, oprsz);
4066     }
4067 }
4068 
4069 /* As if PredTest(Ones(PL), D, esz).  */
4070 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4071                               uint64_t esz_mask)
4072 {
4073     uint32_t flags = PREDTEST_INIT;
4074     intptr_t i;
4075 
4076     for (i = 0; i < oprsz / 8; i++) {
4077         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4078     }
4079     if (oprsz & 7) {
4080         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
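        /* Keep only the predicate bits of the final, partial 8-byte word. */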
4081         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4082     }
4083     return flags;
4084 }
4085 
4086 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4087 {
4088     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4089     if (last_active_pred(vn, vg, oprsz)) {
4090         return predtest_ones(vd, oprsz, -1);
4091     } else {
4092         return do_zero(vd, oprsz);
4093     }
4094 }
4095 
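/*
 * CNTP: count the active elements in PN as governed by PG.  The element
 * size mask keeps one predicate bit per element, so ctpop64 of
 * N & G & MASK counts elements directly.
 */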
4096 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4097 {
4098     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4099     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4100     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4101     intptr_t i;
4102 
4103     for (i = 0; i < words; ++i) {
4104         uint64_t t = n[i] & g[i] & mask;
4105         sum += ctpop64(t);
4106     }
4107     return sum;
4108 }
4109 
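/*
 * Set the low COUNT bits of the predicate.  COUNT is in predicate bits
 * (one per vector byte) and is filtered by the element-size mask so that
 * only one bit per element remains: e.g. with esz == 2 and count == 12,
 * p[0] becomes 0x111, i.e. elements 0, 1 and 2 active.  The sve_whileg
 * helper below fills from the top of the vector instead.
 */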
4110 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4111 {
4112     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4113     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4114     uint64_t esz_mask = pred_esz_masks[esz];
4115     ARMPredicateReg *d = vd;
4116     uint32_t flags;
4117     intptr_t i;
4118 
4119     /* Begin with a zero predicate register.  */
4120     flags = do_zero(d, oprsz);
4121     if (count == 0) {
4122         return flags;
4123     }
4124 
4125     /* Set all of the requested bits.  */
4126     for (i = 0; i < count / 64; ++i) {
4127         d->p[i] = esz_mask;
4128     }
4129     if (count & 63) {
4130         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4131     }
4132 
4133     return predtest_ones(d, oprsz, esz_mask);
4134 }
4135 
4136 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4137 {
4138     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4139     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4140     uint64_t esz_mask = pred_esz_masks[esz];
4141     ARMPredicateReg *d = vd;
4142     intptr_t i, invcount, oprbits;
4143     uint64_t bits;
4144 
4145     if (count == 0) {
4146         return do_zero(d, oprsz);
4147     }
4148 
4149     oprbits = oprsz * 8;
4150     tcg_debug_assert(count <= oprbits);
4151 
4152     bits = esz_mask;
4153     if (oprbits & 63) {
4154         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4155     }
4156 
4157     invcount = oprbits - count;
4158     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4159         d->p[i] = bits;
4160         bits = esz_mask;
4161     }
4162 
4163     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4164 
4165     while (--i >= 0) {
4166         d->p[i] = 0;
4167     }
4168 
4169     return predtest_ones(d, oprsz, esz_mask);
4170 }
4171 
4172 /* Recursive reduction on a function;
4173  * Cf. the ARM ARM function ReducePredicated.
4174  *
4175  * While it would be possible to write this without the DATA temporary,
4176  * it is much simpler to process the predicate register this way.
4177  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4178  * little to gain with a more complex non-recursive form.
4179  */
4180 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
4181 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4182 {                                                                     \
4183     if (n == 1) {                                                     \
4184         return *data;                                                 \
4185     } else {                                                          \
4186         uintptr_t half = n / 2;                                       \
4187         TYPE lo = NAME##_reduce(data, status, half);                  \
4188         TYPE hi = NAME##_reduce(data + half, status, half);           \
4189         return TYPE##_##FUNC(lo, hi, status);                         \
4190     }                                                                 \
4191 }                                                                     \
4192 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
4193 {                                                                     \
4194     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4195     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4196     for (i = 0; i < oprsz; ) {                                        \
4197         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4198         do {                                                          \
4199             TYPE nn = *(TYPE *)(vn + H(i));                           \
4200             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
4201             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4202         } while (i & 15);                                             \
4203     }                                                                 \
4204     for (; i < maxsz; i += sizeof(TYPE)) {                            \
4205         *(TYPE *)((void *)data + i) = IDENT;                          \
4206     }                                                                 \
4207     return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
4208 }
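
/*
 * Inactive elements, and the tail from OPRSZ up to MAXSZ, are seeded with
 * IDENT, the identity of the operation (0 for FADDV, +Inf for FMINV, -Inf
 * for FMAXV, and the default NaN for FMINNMV/FMAXNMV, which minnum/maxnum
 * ignore when the other operand is a number), so the padding cannot affect
 * the result.  The halving recursion assumes MAXSZ covers a power-of-two
 * number of elements.
 */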
4209 
4210 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4211 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4212 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4213 
4214 /* Identity is floatN_default_nan, without the function call.  */
4215 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4216 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4217 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4218 
4219 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4220 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4221 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4222 
4223 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4224 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4225 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4226 
4227 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4228 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4229 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
4230 
4231 #undef DO_REDUCE
4232 
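/*
 * FADDA is a strictly ordered accumulation: the initial value NN
 * accumulates each active element in increasing element order, so unlike
 * the tree-shaped DO_REDUCE expansion above it must not be reassociated.
 */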
4233 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4234                              void *status, uint32_t desc)
4235 {
4236     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4237     float16 result = nn;
4238 
4239     do {
4240         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4241         do {
4242             if (pg & 1) {
4243                 float16 mm = *(float16 *)(vm + H1_2(i));
4244                 result = float16_add(result, mm, status);
4245             }
4246             i += sizeof(float16), pg >>= sizeof(float16);
4247         } while (i & 15);
4248     } while (i < opr_sz);
4249 
4250     return result;
4251 }
4252 
4253 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4254                              void *status, uint32_t desc)
4255 {
4256     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4257     float32 result = nn;
4258 
4259     do {
4260         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4261         do {
4262             if (pg & 1) {
4263                 float32 mm = *(float32 *)(vm + H1_2(i));
4264                 result = float32_add(result, mm, status);
4265             }
4266             i += sizeof(float32), pg >>= sizeof(float32);
4267         } while (i & 15);
4268     } while (i < opr_sz);
4269 
4270     return result;
4271 }
4272 
4273 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4274                              void *status, uint32_t desc)
4275 {
4276     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4277     uint64_t *m = vm;
4278     uint8_t *pg = vg;
4279 
4280     for (i = 0; i < opr_sz; i++) {
4281         if (pg[H1(i)] & 1) {
4282             nn = float64_add(nn, m[i], status);
4283         }
4284     }
4285 
4286     return nn;
4287 }
4288 
4289 /* Fully general three-operand expander, controlled by a predicate,
4290  * with the extra float_status parameter.
4291  */
4292 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4293 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4294                   void *status, uint32_t desc)                  \
4295 {                                                               \
4296     intptr_t i = simd_oprsz(desc);                              \
4297     uint64_t *g = vg;                                           \
4298     do {                                                        \
4299         uint64_t pg = g[(i - 1) >> 6];                          \
4300         do {                                                    \
4301             i -= sizeof(TYPE);                                  \
4302             if (likely((pg >> (i & 63)) & 1)) {                 \
4303                 TYPE nn = *(TYPE *)(vn + H(i));                 \
4304                 TYPE mm = *(TYPE *)(vm + H(i));                 \
4305                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4306             }                                                   \
4307         } while (i & 63);                                       \
4308     } while (i != 0);                                           \
4309 }
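
/*
 * The expander above walks backwards through the vector 64 bytes at a
 * time, loading one predicate word per outer iteration; the element at
 * byte offset I is governed by bit (I & 63) of that word, since the
 * predicate holds one bit per vector byte and only the lowest bit of an
 * element's span is consulted.
 */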
4310 
4311 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4312 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4313 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4314 
4315 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4316 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4317 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4318 
4319 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4320 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4321 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4322 
4323 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4324 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4325 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4326 
4327 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4328 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4329 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4330 
4331 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4332 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4333 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4334 
4335 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4336 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4337 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4338 
4339 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4340 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4341 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4342 
4343 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4344 {
4345     return float16_abs(float16_sub(a, b, s));
4346 }
4347 
4348 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4349 {
4350     return float32_abs(float32_sub(a, b, s));
4351 }
4352 
4353 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4354 {
4355     return float64_abs(float64_sub(a, b, s));
4356 }
4357 
4358 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4359 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4360 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4361 
4362 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4363 {
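    /* Clamp the 64-bit element to the int range taken by float64_scalbn. */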
4364     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4365     return float64_scalbn(a, b_int, s);
4366 }
4367 
4368 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4369 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4370 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4371 
4372 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4373 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4374 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4375 
4376 #undef DO_ZPZZ_FP
4377 
4378 /* Three-operand expander, with one scalar operand, controlled by
4379  * a predicate, with the extra float_status parameter.
4380  */
4381 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4382 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4383                   void *status, uint32_t desc)                    \
4384 {                                                                 \
4385     intptr_t i = simd_oprsz(desc);                                \
4386     uint64_t *g = vg;                                             \
4387     TYPE mm = scalar;                                             \
4388     do {                                                          \
4389         uint64_t pg = g[(i - 1) >> 6];                            \
4390         do {                                                      \
4391             i -= sizeof(TYPE);                                    \
4392             if (likely((pg >> (i & 63)) & 1)) {                   \
4393                 TYPE nn = *(TYPE *)(vn + H(i));                   \
4394                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4395             }                                                     \
4396         } while (i & 63);                                         \
4397     } while (i != 0);                                             \
4398 }
4399 
4400 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4401 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4402 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4403 
4404 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4405 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4406 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4407 
4408 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4409 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4410 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4411 
4412 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4413 {
4414     return float16_sub(b, a, s);
4415 }
4416 
4417 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4418 {
4419     return float32_sub(b, a, s);
4420 }
4421 
4422 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4423 {
4424     return float64_sub(b, a, s);
4425 }
4426 
4427 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4428 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4429 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4430 
4431 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4432 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4433 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4434 
4435 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4436 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4437 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4438 
4439 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4440 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4441 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4442 
4443 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4444 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4445 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4446 
4447 /* Fully general two-operand expander, controlled by a predicate,
4448  * with the extra float_status parameter.
4449  */
4450 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4451 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4452 {                                                                     \
4453     intptr_t i = simd_oprsz(desc);                                    \
4454     uint64_t *g = vg;                                                 \
4455     do {                                                              \
4456         uint64_t pg = g[(i - 1) >> 6];                                \
4457         do {                                                          \
4458             i -= sizeof(TYPE);                                        \
4459             if (likely((pg >> (i & 63)) & 1)) {                       \
4460                 TYPE nn = *(TYPE *)(vn + H(i));                       \
4461                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4462             }                                                         \
4463         } while (i & 63);                                             \
4464     } while (i != 0);                                                 \
4465 }
4466 
4467 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4468  * FZ16.  When converting from fp16, this affects flushing input denormals;
4469  * when converting to fp16, this affects flushing output denormals.
4470  */
4471 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4472 {
4473     bool save = get_flush_inputs_to_zero(fpst);
4474     float32 ret;
4475 
4476     set_flush_inputs_to_zero(false, fpst);
4477     ret = float16_to_float32(f, true, fpst);
4478     set_flush_inputs_to_zero(save, fpst);
4479     return ret;
4480 }
4481 
4482 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4483 {
4484     bool save = get_flush_inputs_to_zero(fpst);
4485     float64 ret;
4486 
4487     set_flush_inputs_to_zero(false, fpst);
4488     ret = float16_to_float64(f, true, fpst);
4489     set_flush_inputs_to_zero(save, fpst);
4490     return ret;
4491 }
4492 
4493 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4494 {
4495     bool save = get_flush_to_zero(fpst);
4496     float16 ret;
4497 
4498     set_flush_to_zero(false, fpst);
4499     ret = float32_to_float16(f, true, fpst);
4500     set_flush_to_zero(save, fpst);
4501     return ret;
4502 }
4503 
4504 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4505 {
4506     bool save = get_flush_to_zero(fpst);
4507     float16 ret;
4508 
4509     set_flush_to_zero(false, fpst);
4510     ret = float64_to_float16(f, true, fpst);
4511     set_flush_to_zero(save, fpst);
4512     return ret;
4513 }
4514 
4515 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4516 {
4517     if (float16_is_any_nan(f)) {
4518         float_raise(float_flag_invalid, s);
4519         return 0;
4520     }
4521     return float16_to_int16_round_to_zero(f, s);
4522 }
4523 
4524 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4525 {
4526     if (float16_is_any_nan(f)) {
4527         float_raise(float_flag_invalid, s);
4528         return 0;
4529     }
4530     return float16_to_int64_round_to_zero(f, s);
4531 }
4532 
4533 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4534 {
4535     if (float32_is_any_nan(f)) {
4536         float_raise(float_flag_invalid, s);
4537         return 0;
4538     }
4539     return float32_to_int64_round_to_zero(f, s);
4540 }
4541 
4542 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4543 {
4544     if (float64_is_any_nan(f)) {
4545         float_raise(float_flag_invalid, s);
4546         return 0;
4547     }
4548     return float64_to_int64_round_to_zero(f, s);
4549 }
4550 
4551 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4552 {
4553     if (float16_is_any_nan(f)) {
4554         float_raise(float_flag_invalid, s);
4555         return 0;
4556     }
4557     return float16_to_uint16_round_to_zero(f, s);
4558 }
4559 
4560 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4561 {
4562     if (float16_is_any_nan(f)) {
4563         float_raise(float_flag_invalid, s);
4564         return 0;
4565     }
4566     return float16_to_uint64_round_to_zero(f, s);
4567 }
4568 
4569 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4570 {
4571     if (float32_is_any_nan(f)) {
4572         float_raise(float_flag_invalid, s);
4573         return 0;
4574     }
4575     return float32_to_uint64_round_to_zero(f, s);
4576 }
4577 
4578 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4579 {
4580     if (float64_is_any_nan(f)) {
4581         float_raise(float_flag_invalid, s);
4582         return 0;
4583     }
4584     return float64_to_uint64_round_to_zero(f, s);
4585 }
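
/*
 * The *_rtz wrappers above special-case NaN inputs so that the result is
 * 0 with the invalid-operation flag set, as the Arm conversions require.
 */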
4586 
4587 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4588 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4589 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4590 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4591 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4592 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4593 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4594 
4595 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4596 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4597 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4598 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4599 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4600 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4601 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4602 
4603 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4604 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4605 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4606 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4607 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4608 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4609 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4610 
4611 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4612 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4613 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4614 
4615 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4616 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4617 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4618 
4619 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4620 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4621 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4622 
4623 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4624 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4625 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4626 
4627 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4628 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4629 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4630 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4631 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4632 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4633 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4634 
4635 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4636 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4637 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4638 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4639 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4640 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4641 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4642 
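/*
 * FLOGB: return the exponent (logb) of the input as a signed integer.
 * For fp16, for example: 1.0 -> 0, 0.25 -> -2, infinities -> INT16_MAX,
 * zero or NaN -> INT16_MIN with the invalid-operation flag raised.
 * Denormal inputs yield their true exponent unless input flush-to-zero
 * is enabled, in which case they are treated as zero.
 */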
4643 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4644 {
4645     /* Extract frac to the top of the uint32_t. */
4646     uint32_t frac = (uint32_t)a << (16 + 6);
4647     int16_t exp = extract32(a, 10, 5);
4648 
4649     if (unlikely(exp == 0)) {
4650         if (frac != 0) {
4651             if (!get_flush_inputs_to_zero(s)) {
4652                 /* denormal: bias - fractional_zeros */
4653                 return -15 - clz32(frac);
4654             }
4655             /* flush to zero */
4656             float_raise(float_flag_input_denormal, s);
4657         }
4658     } else if (unlikely(exp == 0x1f)) {
4659         if (frac == 0) {
4660             return INT16_MAX; /* infinity */
4661         }
4662     } else {
4663         /* normal: exp - bias */
4664         return exp - 15;
4665     }
4666     /* nan or zero */
4667     float_raise(float_flag_invalid, s);
4668     return INT16_MIN;
4669 }
4670 
4671 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4672 {
4673     /* Extract frac to the top of the uint32_t. */
4674     uint32_t frac = a << 9;
4675     int32_t exp = extract32(a, 23, 8);
4676 
4677     if (unlikely(exp == 0)) {
4678         if (frac != 0) {
4679             if (!get_flush_inputs_to_zero(s)) {
4680                 /* denormal: bias - fractional_zeros */
4681                 return -127 - clz32(frac);
4682             }
4683             /* flush to zero */
4684             float_raise(float_flag_input_denormal, s);
4685         }
4686     } else if (unlikely(exp == 0xff)) {
4687         if (frac == 0) {
4688             return INT32_MAX; /* infinity */
4689         }
4690     } else {
4691         /* normal: exp - bias */
4692         return exp - 127;
4693     }
4694     /* nan or zero */
4695     float_raise(float_flag_invalid, s);
4696     return INT32_MIN;
4697 }
4698 
4699 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4700 {
4701     /* Extract frac to the top of the uint64_t. */
4702     uint64_t frac = a << 12;
4703     int64_t exp = extract64(a, 52, 11);
4704 
4705     if (unlikely(exp == 0)) {
4706         if (frac != 0) {
4707             if (!get_flush_inputs_to_zero(s)) {
4708                 /* denormal: bias - fractional_zeros */
4709                 return -1023 - clz64(frac);
4710             }
4711             /* flush to zero */
4712             float_raise(float_flag_input_denormal, s);
4713         }
4714     } else if (unlikely(exp == 0x7ff)) {
4715         if (frac == 0) {
4716             return INT64_MAX; /* infinity */
4717         }
4718     } else {
4719         /* normal: exp - bias */
4720         return exp - 1023;
4721     }
4722     /* nan or zero */
4723     float_raise(float_flag_invalid, s);
4724     return INT64_MIN;
4725 }
4726 
4727 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4728 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4729 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4730 
4731 #undef DO_ZPZ_FP
4732 
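/*
 * The FMLA family shares one loop per element size.  NEG1 and NEG3 are
 * XOR masks for the sign bits of operands 1 (Zn) and 3 (Za):
 *   fmla:  za + zn * zm        fmls:  za - zn * zm
 *   fnmla: -za - zn * zm       fnmls: -za + zn * zm
 */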
4733 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4734                             float_status *status, uint32_t desc,
4735                             uint16_t neg1, uint16_t neg3)
4736 {
4737     intptr_t i = simd_oprsz(desc);
4738     uint64_t *g = vg;
4739 
4740     do {
4741         uint64_t pg = g[(i - 1) >> 6];
4742         do {
4743             i -= 2;
4744             if (likely((pg >> (i & 63)) & 1)) {
4745                 float16 e1, e2, e3, r;
4746 
4747                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4748                 e2 = *(uint16_t *)(vm + H1_2(i));
4749                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4750                 r = float16_muladd(e1, e2, e3, 0, status);
4751                 *(uint16_t *)(vd + H1_2(i)) = r;
4752             }
4753         } while (i & 63);
4754     } while (i != 0);
4755 }
4756 
4757 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4758                               void *vg, void *status, uint32_t desc)
4759 {
4760     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4761 }
4762 
4763 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4764                               void *vg, void *status, uint32_t desc)
4765 {
4766     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4767 }
4768 
4769 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4770                                void *vg, void *status, uint32_t desc)
4771 {
4772     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4773 }
4774 
4775 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4776                                void *vg, void *status, uint32_t desc)
4777 {
4778     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4779 }
4780 
4781 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4782                             float_status *status, uint32_t desc,
4783                             uint32_t neg1, uint32_t neg3)
4784 {
4785     intptr_t i = simd_oprsz(desc);
4786     uint64_t *g = vg;
4787 
4788     do {
4789         uint64_t pg = g[(i - 1) >> 6];
4790         do {
4791             i -= 4;
4792             if (likely((pg >> (i & 63)) & 1)) {
4793                 float32 e1, e2, e3, r;
4794 
4795                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4796                 e2 = *(uint32_t *)(vm + H1_4(i));
4797                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4798                 r = float32_muladd(e1, e2, e3, 0, status);
4799                 *(uint32_t *)(vd + H1_4(i)) = r;
4800             }
4801         } while (i & 63);
4802     } while (i != 0);
4803 }
4804 
4805 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4806                               void *vg, void *status, uint32_t desc)
4807 {
4808     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4809 }
4810 
4811 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4812                               void *vg, void *status, uint32_t desc)
4813 {
4814     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4815 }
4816 
4817 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4818                                void *vg, void *status, uint32_t desc)
4819 {
4820     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4821 }
4822 
4823 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4824                                void *vg, void *status, uint32_t desc)
4825 {
4826     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4827 }
4828 
4829 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4830                             float_status *status, uint32_t desc,
4831                             uint64_t neg1, uint64_t neg3)
4832 {
4833     intptr_t i = simd_oprsz(desc);
4834     uint64_t *g = vg;
4835 
4836     do {
4837         uint64_t pg = g[(i - 1) >> 6];
4838         do {
4839             i -= 8;
4840             if (likely((pg >> (i & 63)) & 1)) {
4841                 float64 e1, e2, e3, r;
4842 
4843                 e1 = *(uint64_t *)(vn + i) ^ neg1;
4844                 e2 = *(uint64_t *)(vm + i);
4845                 e3 = *(uint64_t *)(va + i) ^ neg3;
4846                 r = float64_muladd(e1, e2, e3, 0, status);
4847                 *(uint64_t *)(vd + i) = r;
4848             }
4849         } while (i & 63);
4850     } while (i != 0);
4851 }
4852 
4853 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4854                               void *vg, void *status, uint32_t desc)
4855 {
4856     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4857 }
4858 
4859 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4860                               void *vg, void *status, uint32_t desc)
4861 {
4862     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4863 }
4864 
4865 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4866                                void *vg, void *status, uint32_t desc)
4867 {
4868     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4869 }
4870 
4871 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4872                                void *vg, void *status, uint32_t desc)
4873 {
4874     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4875 }
4876 
4877 /* Two operand floating-point comparison controlled by a predicate.
4878  * Unlike the integer version, we are not allowed to optimistically
4879  * compare operands, since the comparison may have side effects wrt
4880  * the FPSR.
4881  */
4882 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
4883 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
4884                   void *status, uint32_t desc)                          \
4885 {                                                                       \
4886     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
4887     uint64_t *d = vd, *g = vg;                                          \
4888     do {                                                                \
4889         uint64_t out = 0, pg = g[j];                                    \
4890         do {                                                            \
4891             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
4892             if (likely((pg >> (i & 63)) & 1)) {                         \
4893                 TYPE nn = *(TYPE *)(vn + H(i));                         \
4894                 TYPE mm = *(TYPE *)(vm + H(i));                         \
4895                 out |= OP(TYPE, nn, mm, status);                        \
4896             }                                                           \
4897         } while (i & 63);                                               \
4898         d[j--] = out;                                                   \
4899     } while (i > 0);                                                    \
4900 }
4901 
4902 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4903     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4904 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4905     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4906 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4907     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4908 
4909 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4910     DO_FPCMP_PPZZ_H(NAME, OP)   \
4911     DO_FPCMP_PPZZ_S(NAME, OP)   \
4912     DO_FPCMP_PPZZ_D(NAME, OP)
4913 
4914 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
4915 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
4916 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
4917 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
4918 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
4919 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
4920 #define DO_FCMUO(TYPE, X, Y, ST)  \
4921     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4922 #define DO_FACGE(TYPE, X, Y, ST)  \
4923     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4924 #define DO_FACGT(TYPE, X, Y, ST)  \
4925     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
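
/*
 * GE/GT and the absolute compares use the signaling compare, which raises
 * the invalid-operation flag for any NaN operand; EQ/NE/UO use the quiet
 * compare, which raises it only for signaling NaNs.  X >= Y is written as
 * compare(Y, X) <= 0 rather than compare(X, Y) >= 0 so that an unordered
 * result (float_relation_unordered, which is positive) compares false.
 */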
4926 
4927 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4928 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4929 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4930 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4931 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4932 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4933 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4934 
4935 #undef DO_FPCMP_PPZZ_ALL
4936 #undef DO_FPCMP_PPZZ_D
4937 #undef DO_FPCMP_PPZZ_S
4938 #undef DO_FPCMP_PPZZ_H
4939 #undef DO_FPCMP_PPZZ
4940 
4941 /* One operand floating-point comparison against zero, controlled
4942  * by a predicate.
4943  */
4944 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
4945 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
4946                   void *status, uint32_t desc)             \
4947 {                                                          \
4948     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
4949     uint64_t *d = vd, *g = vg;                             \
4950     do {                                                   \
4951         uint64_t out = 0, pg = g[j];                       \
4952         do {                                               \
4953             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
4954             if ((pg >> (i & 63)) & 1) {                    \
4955                 TYPE nn = *(TYPE *)(vn + H(i));            \
4956                 out |= OP(TYPE, nn, 0, status);            \
4957             }                                              \
4958         } while (i & 63);                                  \
4959         d[j--] = out;                                      \
4960     } while (i > 0);                                       \
4961 }
4962 
4963 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4964     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4965 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4966     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4967 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4968     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4969 
4970 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4971     DO_FPCMP_PPZ0_H(NAME, OP)   \
4972     DO_FPCMP_PPZ0_S(NAME, OP)   \
4973     DO_FPCMP_PPZ0_D(NAME, OP)
4974 
4975 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4976 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4977 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4978 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4979 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4980 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4981 
4982 /* FP Trig Multiply-Add. */
4983 
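/*
 * The coefficient tables below are the FTMAD constants: the first eight
 * entries are approximately the Taylor terms of sin(x) (1, -1/6, 1/120, ...)
 * and the second eight those of cos(x) (1, -1/2, 1/24, ...).  A negative
 * multiplicand selects the second half of the table (xx += 8) and is
 * replaced by its absolute value.
 */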
4984 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4985 {
4986     static const float16 coeff[16] = {
4987         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4988         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4989     };
4990     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4991     intptr_t x = simd_data(desc);
4992     float16 *d = vd, *n = vn, *m = vm;
4993     for (i = 0; i < opr_sz; i++) {
4994         float16 mm = m[i];
4995         intptr_t xx = x;
4996         if (float16_is_neg(mm)) {
4997             mm = float16_abs(mm);
4998             xx += 8;
4999         }
5000         d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5001     }
5002 }
5003 
5004 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5005 {
5006     static const float32 coeff[16] = {
5007         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5008         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5009         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5010         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5011     };
5012     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5013     intptr_t x = simd_data(desc);
5014     float32 *d = vd, *n = vn, *m = vm;
5015     for (i = 0; i < opr_sz; i++) {
5016         float32 mm = m[i];
5017         intptr_t xx = x;
5018         if (float32_is_neg(mm)) {
5019             mm = float32_abs(mm);
5020             xx += 8;
5021         }
5022         d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5023     }
5024 }
5025 
5026 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5027 {
5028     static const float64 coeff[16] = {
5029         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5030         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5031         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5032         0x3de5d8408868552full, 0x0000000000000000ull,
5033         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5034         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5035         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5036         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5037     };
5038     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5039     intptr_t x = simd_data(desc);
5040     float64 *d = vd, *n = vn, *m = vm;
5041     for (i = 0; i < opr_sz; i++) {
5042         float64 mm = m[i];
5043         intptr_t xx = x;
5044         if (float64_is_neg(mm)) {
5045             mm = float64_abs(mm);
5046             xx += 8;
5047         }
5048         d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5049     }
5050 }
5051 
5052 /*
5053  * FP Complex Add
5054  */
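
/*
 * Elements are processed as (real, imaginary) pairs.  simd_data selects
 * the rotation of the second operand: 0 adds i*m (real -= m.imag,
 * imag += m.real), 1 adds -i*m (the two signs reversed).
 */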
5055 
5056 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5057                          void *vs, uint32_t desc)
5058 {
5059     intptr_t j, i = simd_oprsz(desc);
5060     uint64_t *g = vg;
5061     float16 neg_imag = float16_set_sign(0, simd_data(desc));
5062     float16 neg_real = float16_chs(neg_imag);
5063 
5064     do {
5065         uint64_t pg = g[(i - 1) >> 6];
5066         do {
5067             float16 e0, e1, e2, e3;
5068 
5069             /* I holds the real index; J holds the imag index.  */
5070             j = i - sizeof(float16);
5071             i -= 2 * sizeof(float16);
5072 
5073             e0 = *(float16 *)(vn + H1_2(i));
5074             e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5075             e2 = *(float16 *)(vn + H1_2(j));
5076             e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5077 
5078             if (likely((pg >> (i & 63)) & 1)) {
5079                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5080             }
5081             if (likely((pg >> (j & 63)) & 1)) {
5082                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5083             }
5084         } while (i & 63);
5085     } while (i != 0);
5086 }
5087 
5088 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5089                          void *vs, uint32_t desc)
5090 {
5091     intptr_t j, i = simd_oprsz(desc);
5092     uint64_t *g = vg;
5093     float32 neg_imag = float32_set_sign(0, simd_data(desc));
5094     float32 neg_real = float32_chs(neg_imag);
5095 
5096     do {
5097         uint64_t pg = g[(i - 1) >> 6];
5098         do {
5099             float32 e0, e1, e2, e3;
5100 
5101             /* I holds the real index; J holds the imag index.  */
5102             j = i - sizeof(float32);
5103             i -= 2 * sizeof(float32);
5104 
5105             e0 = *(float32 *)(vn + H1_2(i));
5106             e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5107             e2 = *(float32 *)(vn + H1_2(j));
5108             e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5109 
5110             if (likely((pg >> (i & 63)) & 1)) {
5111                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5112             }
5113             if (likely((pg >> (j & 63)) & 1)) {
5114                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5115             }
5116         } while (i & 63);
5117     } while (i != 0);
5118 }
5119 
5120 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5121                          void *vs, uint32_t desc)
5122 {
5123     intptr_t j, i = simd_oprsz(desc);
5124     uint64_t *g = vg;
5125     float64 neg_imag = float64_set_sign(0, simd_data(desc));
5126     float64 neg_real = float64_chs(neg_imag);
5127 
5128     do {
5129         uint64_t pg = g[(i - 1) >> 6];
5130         do {
5131             float64 e0, e1, e2, e3;
5132 
5133             /* I holds the real index; J holds the imag index.  */
5134             j = i - sizeof(float64);
5135             i -= 2 * sizeof(float64);
5136 
5137             e0 = *(float64 *)(vn + H1_2(i));
5138             e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5139             e2 = *(float64 *)(vn + H1_2(j));
5140             e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5141 
5142             if (likely((pg >> (i & 63)) & 1)) {
5143                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5144             }
5145             if (likely((pg >> (j & 63)) & 1)) {
5146                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5147             }
5148         } while (i & 63);
5149     } while (i != 0);
5150 }
5151 
5152 /*
5153  * FP Complex Multiply
5154  */
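
/*
 * ROT (simd_data) selects one of the four FCMLA rotations.  FLIP chooses
 * whether the real or the imaginary element of Zn multiplies both elements
 * of Zm, and neg_real/neg_imag flip the signs of the Zm operands, so a
 * single multiply-add loop covers all four cases; e.g. rot == 0 computes
 * d.real += n.real * m.real and d.imag += n.real * m.imag.
 */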
5155 
5156 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5157                                void *vg, void *status, uint32_t desc)
5158 {
5159     intptr_t j, i = simd_oprsz(desc);
5160     unsigned rot = simd_data(desc);
5161     bool flip = rot & 1;
5162     float16 neg_imag, neg_real;
5163     uint64_t *g = vg;
5164 
5165     neg_imag = float16_set_sign(0, (rot & 2) != 0);
5166     neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5167 
5168     do {
5169         uint64_t pg = g[(i - 1) >> 6];
5170         do {
5171             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5172 
5173             /* I holds the real index; J holds the imag index.  */
5174             j = i - sizeof(float16);
5175             i -= 2 * sizeof(float16);
5176 
5177             nr = *(float16 *)(vn + H1_2(i));
5178             ni = *(float16 *)(vn + H1_2(j));
5179             mr = *(float16 *)(vm + H1_2(i));
5180             mi = *(float16 *)(vm + H1_2(j));
5181 
5182             e2 = (flip ? ni : nr);
5183             e1 = (flip ? mi : mr) ^ neg_real;
5184             e4 = e2;
5185             e3 = (flip ? mr : mi) ^ neg_imag;
5186 
5187             if (likely((pg >> (i & 63)) & 1)) {
5188                 d = *(float16 *)(va + H1_2(i));
5189                 d = float16_muladd(e2, e1, d, 0, status);
5190                 *(float16 *)(vd + H1_2(i)) = d;
5191             }
5192             if (likely((pg >> (j & 63)) & 1)) {
5193                 d = *(float16 *)(va + H1_2(j));
5194                 d = float16_muladd(e4, e3, d, 0, status);
5195                 *(float16 *)(vd + H1_2(j)) = d;
5196             }
5197         } while (i & 63);
5198     } while (i != 0);
5199 }
5200 
5201 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5202                                void *vg, void *status, uint32_t desc)
5203 {
5204     intptr_t j, i = simd_oprsz(desc);
5205     unsigned rot = simd_data(desc);
5206     bool flip = rot & 1;
5207     float32 neg_imag, neg_real;
5208     uint64_t *g = vg;
5209 
5210     neg_imag = float32_set_sign(0, (rot & 2) != 0);
5211     neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5212 
5213     do {
5214         uint64_t pg = g[(i - 1) >> 6];
5215         do {
5216             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5217 
5218             /* I holds the real index; J holds the imag index.  */
5219             j = i - sizeof(float32);
5220             i -= 2 * sizeof(float32);
5221 
5222             nr = *(float32 *)(vn + H1_2(i));
5223             ni = *(float32 *)(vn + H1_2(j));
5224             mr = *(float32 *)(vm + H1_2(i));
5225             mi = *(float32 *)(vm + H1_2(j));
5226 
5227             e2 = (flip ? ni : nr);
5228             e1 = (flip ? mi : mr) ^ neg_real;
5229             e4 = e2;
5230             e3 = (flip ? mr : mi) ^ neg_imag;
5231 
5232             if (likely((pg >> (i & 63)) & 1)) {
5233                 d = *(float32 *)(va + H1_2(i));
5234                 d = float32_muladd(e2, e1, d, 0, status);
5235                 *(float32 *)(vd + H1_2(i)) = d;
5236             }
5237             if (likely((pg >> (j & 63)) & 1)) {
5238                 d = *(float32 *)(va + H1_2(j));
5239                 d = float32_muladd(e4, e3, d, 0, status);
5240                 *(float32 *)(vd + H1_2(j)) = d;
5241             }
5242         } while (i & 63);
5243     } while (i != 0);
5244 }
5245 
5246 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5247                                void *vg, void *status, uint32_t desc)
5248 {
5249     intptr_t j, i = simd_oprsz(desc);
5250     unsigned rot = simd_data(desc);
5251     bool flip = rot & 1;
5252     float64 neg_imag, neg_real;
5253     uint64_t *g = vg;
5254 
5255     neg_imag = float64_set_sign(0, (rot & 2) != 0);
5256     neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5257 
5258     do {
5259         uint64_t pg = g[(i - 1) >> 6];
5260         do {
5261             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5262 
5263             /* I holds the real index; J holds the imag index.  */
5264             j = i - sizeof(float64);
5265             i -= 2 * sizeof(float64);
5266 
5267             nr = *(float64 *)(vn + H1_2(i));
5268             ni = *(float64 *)(vn + H1_2(j));
5269             mr = *(float64 *)(vm + H1_2(i));
5270             mi = *(float64 *)(vm + H1_2(j));
5271 
5272             e2 = (flip ? ni : nr);
5273             e1 = (flip ? mi : mr) ^ neg_real;
5274             e4 = e2;
5275             e3 = (flip ? mr : mi) ^ neg_imag;
5276 
5277             if (likely((pg >> (i & 63)) & 1)) {
5278                 d = *(float64 *)(va + H1_2(i));
5279                 d = float64_muladd(e2, e1, d, 0, status);
5280                 *(float64 *)(vd + H1_2(i)) = d;
5281             }
5282             if (likely((pg >> (j & 63)) & 1)) {
5283                 d = *(float64 *)(va + H1_2(j));
5284                 d = float64_muladd(e4, e3, d, 0, status);
5285                 *(float64 *)(vd + H1_2(j)) = d;
5286             }
5287         } while (i & 63);
5288     } while (i != 0);
5289 }
5290 
5291 /*
5292  * Load contiguous data, protected by a governing predicate.
5293  */
5294 
5295 /*
5296  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5297  * beginning at @reg_off bounded by @reg_max.  Return the offset of the first
5298  * active element >= @reg_off, or @reg_max if there is none.
5299  */
5300 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5301                                  intptr_t reg_max, int esz)
5302 {
5303     uint64_t pg_mask = pred_esz_masks[esz];
5304     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5305 
5306     /* In normal usage, the first element is active.  */
5307     if (likely(pg & 1)) {
5308         return reg_off;
5309     }
5310 
5311     if (pg == 0) {
5312         reg_off &= -64;
5313         do {
5314             reg_off += 64;
5315             if (unlikely(reg_off >= reg_max)) {
5316                 /* The entire predicate was false.  */
5317                 return reg_max;
5318             }
5319             pg = vg[reg_off >> 6] & pg_mask;
5320         } while (pg == 0);
5321     }
5322     reg_off += ctz64(pg);
5323 
5324     /* We should never see an out of range predicate bit set.  */
5325     tcg_debug_assert(reg_off < reg_max);
5326     return reg_off;
5327 }
5328 
5329 /*
5330  * Resolve the guest virtual address to info->host and info->flags.
5331  * If @nofault, return false if the page is invalid, otherwise
5332  * exit via page fault exception.
5333  */
5334 
5335 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5336                     target_ulong addr, int mem_off, MMUAccessType access_type,
5337                     int mmu_idx, uintptr_t retaddr)
5338 {
5339     int flags;
5340 
5341     addr += mem_off;
5342 
5343     /*
5344      * User-only currently always issues with TBI.  See the comment
5345      * above useronly_clean_ptr.  Usually we clean this top byte away
5346      * during translation, but we can't do that for e.g. vector + imm
5347      * addressing modes.
5348      *
5349      * We currently always enable TBI for user-only, and do not provide
5350      * a way to turn it off.  So clean the pointer unconditionally here,
5351      * rather than look it up here, or pass it down from above.
5352      */
5353     addr = useronly_clean_ptr(addr);
5354 
5355 #ifdef CONFIG_USER_ONLY
5356     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5357                                &info->host, retaddr);
5358 #else
5359     CPUTLBEntryFull *full;
5360     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5361                               &info->host, &full, retaddr);
5362 #endif
5363     info->flags = flags;
5364 
5365     if (flags & TLB_INVALID_MASK) {
5366         g_assert(nofault);
5367         return false;
5368     }
5369 
5370 #ifdef CONFIG_USER_ONLY
5371     memset(&info->attrs, 0, sizeof(info->attrs));
5372     /* Require both ANON and MTE; see allocation_tag_mem(). */
5373     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5374 #else
5375     info->attrs = full->attrs;
5376     info->tagged = full->extra.arm.pte_attrs == 0xf0;
5377 #endif
5378 
5379     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5380     info->host -= mem_off;
5381     return true;
5382 }
5383 
5384 /*
5385  * Find first active element on each page, and a loose bound for the
5386  * final element on each page.  Identify any single element that spans
5387  * the page boundary.  Return true if there are any active elements.
5388  */
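/*
 * For illustration (hypothetical values): a load of 4-byte elements
 * (msize == 4, esz == 2) starting 6 bytes below a page boundary gives
 * page_split == 6, so element 0 is the last full element on the first
 * page, element 1 straddles the boundary (6 % 4 != 0), and the first
 * active element at or beyond byte offset 8 begins the second page.
 */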
5389 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5390                             intptr_t reg_max, int esz, int msize)
5391 {
5392     const int esize = 1 << esz;
5393     const uint64_t pg_mask = pred_esz_masks[esz];
5394     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5395     intptr_t mem_off_last, mem_off_split;
5396     intptr_t page_split, elt_split;
5397     intptr_t i;
5398 
5399     /* Set all of the element indices to -1, and the TLB data to 0. */
5400     memset(info, -1, offsetof(SVEContLdSt, page));
5401     memset(info->page, 0, sizeof(info->page));
5402 
5403     /* Gross scan over the entire predicate to find bounds. */
5404     i = 0;
5405     do {
5406         uint64_t pg = vg[i] & pg_mask;
5407         if (pg) {
5408             reg_off_last = i * 64 + 63 - clz64(pg);
5409             if (reg_off_first < 0) {
5410                 reg_off_first = i * 64 + ctz64(pg);
5411             }
5412         }
5413     } while (++i * 64 < reg_max);
5414 
5415     if (unlikely(reg_off_first < 0)) {
5416         /* No active elements, no pages touched. */
5417         return false;
5418     }
5419     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5420 
5421     info->reg_off_first[0] = reg_off_first;
5422     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5423     mem_off_last = (reg_off_last >> esz) * msize;
5424 
5425     page_split = -(addr | TARGET_PAGE_MASK);
5426     if (likely(mem_off_last + msize <= page_split)) {
5427         /* The entire operation fits within a single page. */
5428         info->reg_off_last[0] = reg_off_last;
5429         return true;
5430     }
5431 
5432     info->page_split = page_split;
5433     elt_split = page_split / msize;
5434     reg_off_split = elt_split << esz;
5435     mem_off_split = elt_split * msize;
5436 
5437     /*
5438      * This is the last full element on the first page, but it is not
5439      * necessarily active.  If there is no full element, i.e. the first
5440      * active element is the one that's split, this value remains -1.
5441      * It is useful as an iteration bound.
5442      */
5443     if (elt_split != 0) {
5444         info->reg_off_last[0] = reg_off_split - esize;
5445     }
5446 
5447     /* Determine if an unaligned element spans the pages.  */
5448     if (page_split % msize != 0) {
5449         /* It is helpful to know if the split element is active. */
5450         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5451             info->reg_off_split = reg_off_split;
5452             info->mem_off_split = mem_off_split;
5453 
5454             if (reg_off_split == reg_off_last) {
5455                 /* The page crossing element is last. */
5456                 return true;
5457             }
5458         }
5459         reg_off_split += esize;
5460         mem_off_split += msize;
5461     }
5462 
5463     /*
5464      * We do want the first active element on the second page, because
5465      * this may affect the address reported in an exception.
5466      */
5467     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5468     tcg_debug_assert(reg_off_split <= reg_off_last);
5469     info->reg_off_first[1] = reg_off_split;
5470     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5471     info->reg_off_last[1] = reg_off_last;
5472     return true;
5473 }
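/*
 * Worked example of the bookkeeping above, with illustrative values not
 * taken from any particular caller: assume a 256-bit vector (reg_max 32),
 * esz == MO_32 (esize 4), msize 4, every element active, and @addr sitting
 * 10 bytes below a page boundary.  Then page_split == 10 and elt_split == 2,
 * so elements 0 and 1 lie wholly on the first page (reg_off_last[0] == 4),
 * element 2 at mem_off 8 straddles the boundary (reg_off_split == 8), and
 * reg_off_first[1] == 12 with reg_off_last[1] == 28 covers elements 3..7
 * on the second page.
 */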
5474 
5475 /*
5476  * Resolve the guest virtual addresses to info->page[].
5477  * Control the generation of page faults with @fault.  Return false if
5478  * there is no work to do, which can only happen with @fault == FAULT_NO.
5479  */
5480 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5481                          CPUARMState *env, target_ulong addr,
5482                          MMUAccessType access_type, uintptr_t retaddr)
5483 {
5484     int mmu_idx = arm_env_mmu_index(env);
5485     int mem_off = info->mem_off_first[0];
5486     bool nofault = fault == FAULT_NO;
5487     bool have_work = true;
5488 
5489     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5490                         access_type, mmu_idx, retaddr)) {
5491         /* No work to be done. */
5492         return false;
5493     }
5494 
5495     if (likely(info->page_split < 0)) {
5496         /* The entire operation was on the one page. */
5497         return true;
5498     }
5499 
5500     /*
5501      * If the second page is invalid, then we want the fault address to be
5502      * the first byte on that page which is accessed.
5503      */
5504     if (info->mem_off_split >= 0) {
5505         /*
5506          * There is an element split across the pages.  The fault address
5507          * should be the first byte of the second page.
5508          */
5509         mem_off = info->page_split;
5510         /*
5511          * If the split element is also the first active element
5512          * of the vector, then:  For first-fault we should continue
5513          * to generate faults for the second page.  For no-fault,
5514          * we have work only if the second page is valid.
5515          */
5516         if (info->mem_off_first[0] < info->mem_off_split) {
5517             nofault = FAULT_FIRST;
5518             have_work = false;
5519         }
5520     } else {
5521         /*
5522          * There is no element split across the pages.  The fault address
5523          * should be the first active element on the second page.
5524          */
5525         mem_off = info->mem_off_first[1];
5526         /*
5527          * There must have been one active element on the first page,
5528          * so we're out of first-fault territory.
5529          */
5530         nofault = fault != FAULT_ALL;
5531     }
5532 
5533     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5534                                 access_type, mmu_idx, retaddr);
5535     return have_work;
5536 }
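/*
 * Rough summary of the fault handling above (a sketch, not a spec): with
 * FAULT_ALL or FAULT_FIRST the probe of the first page is faulting, so an
 * invalid first page raises its exception here via @retaddr, while FAULT_NO
 * simply returns false ("no work") in the same situation.  For first-fault
 * and no-fault, once at least one active element has been satisfied on the
 * first page, the second page is probed without faulting, and an invalid
 * second page then shows up in the recorded flags and the returned
 * have_work value rather than as an exception.
 */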
5537 
5538 #ifndef CONFIG_USER_ONLY
5539 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5540                                uint64_t *vg, target_ulong addr,
5541                                int esize, int msize, int wp_access,
5542                                uintptr_t retaddr)
5543 {
5544     intptr_t mem_off, reg_off, reg_last;
5545     int flags0 = info->page[0].flags;
5546     int flags1 = info->page[1].flags;
5547 
5548     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5549         return;
5550     }
5551 
5552     /* Indicate that watchpoints are handled. */
5553     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5554     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5555 
5556     if (flags0 & TLB_WATCHPOINT) {
5557         mem_off = info->mem_off_first[0];
5558         reg_off = info->reg_off_first[0];
5559         reg_last = info->reg_off_last[0];
5560 
5561         while (reg_off <= reg_last) {
5562             uint64_t pg = vg[reg_off >> 6];
5563             do {
5564                 if ((pg >> (reg_off & 63)) & 1) {
5565                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5566                                          msize, info->page[0].attrs,
5567                                          wp_access, retaddr);
5568                 }
5569                 reg_off += esize;
5570                 mem_off += msize;
5571             } while (reg_off <= reg_last && (reg_off & 63));
5572         }
5573     }
5574 
5575     mem_off = info->mem_off_split;
5576     if (mem_off >= 0) {
5577         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5578                              info->page[0].attrs, wp_access, retaddr);
5579     }
5580 
5581     mem_off = info->mem_off_first[1];
5582     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5583         reg_off = info->reg_off_first[1];
5584         reg_last = info->reg_off_last[1];
5585 
5586         do {
5587             uint64_t pg = vg[reg_off >> 6];
5588             do {
5589                 if ((pg >> (reg_off & 63)) & 1) {
5590                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5591                                          msize, info->page[1].attrs,
5592                                          wp_access, retaddr);
5593                 }
5594                 reg_off += esize;
5595                 mem_off += msize;
5596             } while (reg_off & 63);
5597         } while (reg_off <= reg_last);
5598     }
5599 }
5600 #endif
5601 
5602 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5603                              uint64_t *vg, target_ulong addr, int esize,
5604                              int msize, uint32_t mtedesc, uintptr_t ra)
5605 {
5606     intptr_t mem_off, reg_off, reg_last;
5607 
5608     /* Process the page only if MemAttr == Tagged. */
5609     if (info->page[0].tagged) {
5610         mem_off = info->mem_off_first[0];
5611         reg_off = info->reg_off_first[0];
5612         reg_last = info->reg_off_split;
5613         if (reg_last < 0) {
5614             reg_last = info->reg_off_last[0];
5615         }
5616 
5617         do {
5618             uint64_t pg = vg[reg_off >> 6];
5619             do {
5620                 if ((pg >> (reg_off & 63)) & 1) {
5621                     mte_check(env, mtedesc, addr + mem_off, ra);
5622                 }
5623                 reg_off += esize;
5624                 mem_off += msize;
5625             } while (reg_off <= reg_last && (reg_off & 63));
5626         } while (reg_off <= reg_last);
5627     }
5628 
5629     mem_off = info->mem_off_first[1];
5630     if (mem_off >= 0 && info->page[1].tagged) {
5631         reg_off = info->reg_off_first[1];
5632         reg_last = info->reg_off_last[1];
5633 
5634         do {
5635             uint64_t pg = vg[reg_off >> 6];
5636             do {
5637                 if ((pg >> (reg_off & 63)) & 1) {
5638                     mte_check(env, mtedesc, addr + mem_off, ra);
5639                 }
5640                 reg_off += esize;
5641                 mem_off += msize;
5642             } while (reg_off & 63);
5643         } while (reg_off <= reg_last);
5644     }
5645 }
5646 
5647 /*
5648  * Common helper for all contiguous 1,2,3,4-register predicated loads.
5649  */
5650 static inline QEMU_ALWAYS_INLINE
5651 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5652                uint32_t desc, const uintptr_t retaddr,
5653                const int esz, const int msz, const int N, uint32_t mtedesc,
5654                sve_ldst1_host_fn *host_fn,
5655                sve_ldst1_tlb_fn *tlb_fn)
5656 {
5657     const unsigned rd = simd_data(desc);
5658     const intptr_t reg_max = simd_oprsz(desc);
5659     intptr_t reg_off, reg_last, mem_off;
5660     SVEContLdSt info;
5661     void *host;
5662     int flags, i;
5663 
5664     /* Find the active elements.  */
5665     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5666         /* The entire predicate was false; no load occurs.  */
5667         for (i = 0; i < N; ++i) {
5668             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5669         }
5670         return;
5671     }
5672 
5673     /* Probe the page(s).  Exit with exception for any invalid page. */
5674     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5675 
5676     /* Handle watchpoints for all active elements. */
5677     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5678                               BP_MEM_READ, retaddr);
5679 
5680     /*
5681      * Handle mte checks for all active elements.
5682      * Since TBI must be set for MTE, !mtedesc => !mte_active.
5683      */
5684     if (mtedesc) {
5685         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5686                                 mtedesc, retaddr);
5687     }
5688 
5689     flags = info.page[0].flags | info.page[1].flags;
5690     if (unlikely(flags != 0)) {
5691         /*
5692          * At least one page includes MMIO.
5693          * Any bus operation can fail with cpu_transaction_failed,
5694          * which for ARM will raise SyncExternal.  Perform the load
5695          * into scratch memory to preserve register state until the end.
5696          */
5697         ARMVectorReg scratch[4] = { };
5698 
5699         mem_off = info.mem_off_first[0];
5700         reg_off = info.reg_off_first[0];
5701         reg_last = info.reg_off_last[1];
5702         if (reg_last < 0) {
5703             reg_last = info.reg_off_split;
5704             if (reg_last < 0) {
5705                 reg_last = info.reg_off_last[0];
5706             }
5707         }
5708 
5709         do {
5710             uint64_t pg = vg[reg_off >> 6];
5711             do {
5712                 if ((pg >> (reg_off & 63)) & 1) {
5713                     for (i = 0; i < N; ++i) {
5714                         tlb_fn(env, &scratch[i], reg_off,
5715                                addr + mem_off + (i << msz), retaddr);
5716                     }
5717                 }
5718                 reg_off += 1 << esz;
5719                 mem_off += N << msz;
5720             } while (reg_off & 63);
5721         } while (reg_off <= reg_last);
5722 
5723         for (i = 0; i < N; ++i) {
5724             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5725         }
5726         return;
5727     }
5728 
5729     /* The entire operation is in RAM, on valid pages. */
5730 
5731     for (i = 0; i < N; ++i) {
5732         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5733     }
5734 
5735     mem_off = info.mem_off_first[0];
5736     reg_off = info.reg_off_first[0];
5737     reg_last = info.reg_off_last[0];
5738     host = info.page[0].host;
5739 
5740     while (reg_off <= reg_last) {
5741         uint64_t pg = vg[reg_off >> 6];
5742         do {
5743             if ((pg >> (reg_off & 63)) & 1) {
5744                 for (i = 0; i < N; ++i) {
5745                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5746                             host + mem_off + (i << msz));
5747                 }
5748             }
5749             reg_off += 1 << esz;
5750             mem_off += N << msz;
5751         } while (reg_off <= reg_last && (reg_off & 63));
5752     }
5753 
5754     /*
5755      * Use the slow path to manage the cross-page misalignment.
5756      * But we know this is RAM and cannot trap.
5757      */
5758     mem_off = info.mem_off_split;
5759     if (unlikely(mem_off >= 0)) {
5760         reg_off = info.reg_off_split;
5761         for (i = 0; i < N; ++i) {
5762             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5763                    addr + mem_off + (i << msz), retaddr);
5764         }
5765     }
5766 
5767     mem_off = info.mem_off_first[1];
5768     if (unlikely(mem_off >= 0)) {
5769         reg_off = info.reg_off_first[1];
5770         reg_last = info.reg_off_last[1];
5771         host = info.page[1].host;
5772 
5773         do {
5774             uint64_t pg = vg[reg_off >> 6];
5775             do {
5776                 if ((pg >> (reg_off & 63)) & 1) {
5777                     for (i = 0; i < N; ++i) {
5778                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5779                                 host + mem_off + (i << msz));
5780                     }
5781                 }
5782                 reg_off += 1 << esz;
5783                 mem_off += N << msz;
5784             } while (reg_off & 63);
5785         } while (reg_off <= reg_last);
5786     }
5787 }
5788 
5789 static inline QEMU_ALWAYS_INLINE
5790 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5791                    uint32_t desc, const uintptr_t ra,
5792                    const int esz, const int msz, const int N,
5793                    sve_ldst1_host_fn *host_fn,
5794                    sve_ldst1_tlb_fn *tlb_fn)
5795 {
5796     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5797     int bit55 = extract64(addr, 55, 1);
5798 
5799     /* Remove mtedesc from the normal sve descriptor. */
5800     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5801 
5802     /* Perform gross MTE suppression early. */
5803     if (!tbi_check(mtedesc, bit55) ||
5804         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
5805         mtedesc = 0;
5806     }
5807 
5808     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5809 }
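/*
 * Roughly, the suppression above is the coarse, address-based part of MTE:
 * if TBI is disabled for the half of the address space selected by bit 55,
 * tags cannot be in use for this access, and if the TCMA rules make this
 * tag value unchecked, no per-element check is needed either.  Dropping
 * mtedesc to zero then lets sve_ldN_r skip sve_cont_ldst_mte_check()
 * entirely.
 */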
5810 
5811 #define DO_LD1_1(NAME, ESZ)                                             \
5812 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
5813                             target_ulong addr, uint32_t desc)           \
5814 {                                                                       \
5815     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
5816               sve_##NAME##_host, sve_##NAME##_tlb);                     \
5817 }                                                                       \
5818 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
5819                                 target_ulong addr, uint32_t desc)       \
5820 {                                                                       \
5821     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
5822                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
5823 }
5824 
5825 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
5826 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
5827                                target_ulong addr, uint32_t desc)        \
5828 {                                                                       \
5829     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5830               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
5831 }                                                                       \
5832 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
5833                                target_ulong addr, uint32_t desc)        \
5834 {                                                                       \
5835     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5836               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
5837 }                                                                       \
5838 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
5839                                    target_ulong addr, uint32_t desc)    \
5840 {                                                                       \
5841     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5842                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
5843 }                                                                       \
5844 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
5845                                    target_ulong addr, uint32_t desc)    \
5846 {                                                                       \
5847     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5848                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
5849 }
5850 
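/*
 * A note on the helper names below, as suggested by the ESZ arguments:
 * the first letter after "ld1" is the memory element size (b/h/s/d) and
 * the remainder is the destination element size plus signedness, so e.g.
 * ld1bhu loads bytes zero-extended into 16-bit elements and ld1bds loads
 * bytes sign-extended into 64-bit elements.  Each DO_LD1_1 invocation,
 * e.g. DO_LD1_1(ld1bb, MO_8), emits HELPER(sve_ld1bb_r) and
 * HELPER(sve_ld1bb_r_mte) as thin wrappers around sve_ldN_r/sve_ldN_r_mte.
 */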
5851 DO_LD1_1(ld1bb,  MO_8)
5852 DO_LD1_1(ld1bhu, MO_16)
5853 DO_LD1_1(ld1bhs, MO_16)
5854 DO_LD1_1(ld1bsu, MO_32)
5855 DO_LD1_1(ld1bss, MO_32)
5856 DO_LD1_1(ld1bdu, MO_64)
5857 DO_LD1_1(ld1bds, MO_64)
5858 
5859 DO_LD1_2(ld1hh,  MO_16, MO_16)
5860 DO_LD1_2(ld1hsu, MO_32, MO_16)
5861 DO_LD1_2(ld1hss, MO_32, MO_16)
5862 DO_LD1_2(ld1hdu, MO_64, MO_16)
5863 DO_LD1_2(ld1hds, MO_64, MO_16)
5864 
5865 DO_LD1_2(ld1ss,  MO_32, MO_32)
5866 DO_LD1_2(ld1sdu, MO_64, MO_32)
5867 DO_LD1_2(ld1sds, MO_64, MO_32)
5868 
5869 DO_LD1_2(ld1dd,  MO_64, MO_64)
5870 
5871 #undef DO_LD1_1
5872 #undef DO_LD1_2
5873 
5874 #define DO_LDN_1(N)                                                     \
5875 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
5876                              target_ulong addr, uint32_t desc)          \
5877 {                                                                       \
5878     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
5879               sve_ld1bb_host, sve_ld1bb_tlb);                           \
5880 }                                                                       \
5881 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
5882                                  target_ulong addr, uint32_t desc)      \
5883 {                                                                       \
5884     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
5885                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
5886 }
5887 
5888 #define DO_LDN_2(N, SUFF, ESZ)                                          \
5889 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
5890                                     target_ulong addr, uint32_t desc)   \
5891 {                                                                       \
5892     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5893               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
5894 }                                                                       \
5895 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
5896                                     target_ulong addr, uint32_t desc)   \
5897 {                                                                       \
5898     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5899               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
5900 }                                                                       \
5901 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
5902                                         target_ulong addr, uint32_t desc) \
5903 {                                                                       \
5904     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5905                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
5906 }                                                                       \
5907 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
5908                                         target_ulong addr, uint32_t desc) \
5909 {                                                                       \
5910     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5911                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
5912 }
5913 
5914 DO_LDN_1(2)
5915 DO_LDN_1(3)
5916 DO_LDN_1(4)
5917 
5918 DO_LDN_2(2, hh, MO_16)
5919 DO_LDN_2(3, hh, MO_16)
5920 DO_LDN_2(4, hh, MO_16)
5921 
5922 DO_LDN_2(2, ss, MO_32)
5923 DO_LDN_2(3, ss, MO_32)
5924 DO_LDN_2(4, ss, MO_32)
5925 
5926 DO_LDN_2(2, dd, MO_64)
5927 DO_LDN_2(3, dd, MO_64)
5928 DO_LDN_2(4, dd, MO_64)
5929 
5930 #undef DO_LDN_1
5931 #undef DO_LDN_2
5932 
5933 /*
5934  * Load contiguous data, first-fault and no-fault.
5935  *
5936  * For user-only, one could argue that we should hold the mmap_lock during
5937  * the operation so that there is no race between page_check_range and the
5938  * load operation.  However, unmapping pages out from under a running thread
5939  * is extraordinarily unlikely.  This theoretical race condition also affects
5940  * linux-user/ in its get_user/put_user macros.
5941  *
5942  * TODO: Construct some helpers, written in assembly, that interact with
5943  * host_signal_handler to produce memory ops which can properly report errors
5944  * without racing.
5945  */
5946 
5947 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
5948  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5949  * option, which leaves subsequent data unchanged.
5950  */
5951 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5952 {
5953     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5954 
5955     if (i & 63) {
5956         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5957         i = ROUND_UP(i, 64);
5958     }
5959     for (; i < oprsz; i += 64) {
5960         ffr[i / 64] = 0;
5961     }
5962 }
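/*
 * Illustrative example with hypothetical values: for a 256-bit vector,
 * @oprsz is 32 and FFR has one significant bit per vector byte.  A fault
 * recorded at i == 20 keeps FFR bits 0..19 and clears bit 20 upward within
 * the first word; the final loop in record_fault never runs because
 * ROUND_UP(20, 64) is already >= 32.  Larger vectors additionally have
 * their remaining FFR words zeroed.
 */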
5963 
5964 /*
5965  * Common helper for all contiguous no-fault and first-fault loads.
5966  */
5967 static inline QEMU_ALWAYS_INLINE
5968 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5969                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5970                    const int esz, const int msz, const SVEContFault fault,
5971                    sve_ldst1_host_fn *host_fn,
5972                    sve_ldst1_tlb_fn *tlb_fn)
5973 {
5974     const unsigned rd = simd_data(desc);
5975     void *vd = &env->vfp.zregs[rd];
5976     const intptr_t reg_max = simd_oprsz(desc);
5977     intptr_t reg_off, mem_off, reg_last;
5978     SVEContLdSt info;
5979     int flags;
5980     void *host;
5981 
5982     /* Find the active elements.  */
5983     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5984         /* The entire predicate was false; no load occurs.  */
5985         memset(vd, 0, reg_max);
5986         return;
5987     }
5988     reg_off = info.reg_off_first[0];
5989 
5990     /* Probe the page(s). */
5991     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5992         /* Fault on first element. */
5993         tcg_debug_assert(fault == FAULT_NO);
5994         memset(vd, 0, reg_max);
5995         goto do_fault;
5996     }
5997 
5998     mem_off = info.mem_off_first[0];
5999     flags = info.page[0].flags;
6000 
6001     /*
6002      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6003      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6004      */
6005     if (!info.page[0].tagged) {
6006         mtedesc = 0;
6007     }
6008 
6009     if (fault == FAULT_FIRST) {
6010         /* Trapping mte check for the first-fault element.  */
6011         if (mtedesc) {
6012             mte_check(env, mtedesc, addr + mem_off, retaddr);
6013         }
6014 
6015         /*
6016          * Special handling of the first active element,
6017          * if it crosses a page boundary or is MMIO.
6018          */
6019         bool is_split = mem_off == info.mem_off_split;
6020         if (unlikely(flags != 0) || unlikely(is_split)) {
6021             /*
6022              * Use the slow path for cross-page handling.
6023              * Might trap for MMIO or watchpoints.
6024              */
6025             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6026 
6027             /* After any fault, zero the other elements. */
6028             swap_memzero(vd, reg_off);
6029             reg_off += 1 << esz;
6030             mem_off += 1 << msz;
6031             swap_memzero(vd + reg_off, reg_max - reg_off);
6032 
6033             if (is_split) {
6034                 goto second_page;
6035             }
6036         } else {
6037             memset(vd, 0, reg_max);
6038         }
6039     } else {
6040         memset(vd, 0, reg_max);
6041         if (unlikely(mem_off == info.mem_off_split)) {
6042             /* The first active element crosses a page boundary. */
6043             flags |= info.page[1].flags;
6044             if (unlikely(flags & TLB_MMIO)) {
6045                 /* Some page is MMIO, see below. */
6046                 goto do_fault;
6047             }
6048             if (unlikely(flags & TLB_WATCHPOINT) &&
6049                 (cpu_watchpoint_address_matches
6050                  (env_cpu(env), addr + mem_off, 1 << msz)
6051                  & BP_MEM_READ)) {
6052                 /* Watchpoint hit, see below. */
6053                 goto do_fault;
6054             }
6055             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6056                 goto do_fault;
6057             }
6058             /*
6059              * Use the slow path for cross-page handling.
6060              * This is RAM, without a watchpoint, and will not trap.
6061              */
6062             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6063             goto second_page;
6064         }
6065     }
6066 
6067     /*
6068      * From this point on, all memory operations are MemSingleNF.
6069      *
6070      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6071      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6072      *
6073      * Unfortunately we do not have access to the memory attributes from the
6074      * PTE to tell Device memory from Normal memory.  So we make a mostly
6075      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6076      * This gives the right answer for the common cases of "Normal memory,
6077      * backed by host RAM" and "Device memory, backed by MMIO".
6078      * The architecture allows us to suppress an NF load and return
6079      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6080      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6081      * get wrong is "Device memory, backed by host RAM", for which we
6082      * should return (UNKNOWN, FAULT) for but do not.
6083      * should return (UNKNOWN, FAULT) but do not.
6084      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6085      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6086      * architectural breakpoints the same.
6087      */
6088     if (unlikely(flags & TLB_MMIO)) {
6089         goto do_fault;
6090     }
6091 
6092     reg_last = info.reg_off_last[0];
6093     host = info.page[0].host;
6094 
6095     do {
6096         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6097         do {
6098             if ((pg >> (reg_off & 63)) & 1) {
6099                 if (unlikely(flags & TLB_WATCHPOINT) &&
6100                     (cpu_watchpoint_address_matches
6101                      (env_cpu(env), addr + mem_off, 1 << msz)
6102                      & BP_MEM_READ)) {
6103                     goto do_fault;
6104                 }
6105                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6106                     goto do_fault;
6107                 }
6108                 host_fn(vd, reg_off, host + mem_off);
6109             }
6110             reg_off += 1 << esz;
6111             mem_off += 1 << msz;
6112         } while (reg_off <= reg_last && (reg_off & 63));
6113     } while (reg_off <= reg_last);
6114 
6115     /*
6116      * MemSingleNF is allowed to fail for any reason.  We have special
6117      * code above to handle the first element crossing a page boundary.
6118      * As an implementation choice, decline to handle a cross-page element
6119      * in any other position.
6120      */
6121     reg_off = info.reg_off_split;
6122     if (reg_off >= 0) {
6123         goto do_fault;
6124     }
6125 
6126  second_page:
6127     reg_off = info.reg_off_first[1];
6128     if (likely(reg_off < 0)) {
6129         /* No active elements on the second page.  All done. */
6130         return;
6131     }
6132 
6133     /*
6134      * MemSingleNF is allowed to fail for any reason.  As an implementation
6135      * choice, decline to handle elements on the second page.  This should
6136      * be low frequency as the guest walks through memory -- the next
6137      * iteration of the guest's loop should be aligned on the page boundary,
6138      * and then all following iterations will stay aligned.
6139      */
6140 
6141  do_fault:
6142     record_fault(env, reg_off, reg_max);
6143 }
6144 
6145 static inline QEMU_ALWAYS_INLINE
6146 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6147                        uint32_t desc, const uintptr_t retaddr,
6148                        const int esz, const int msz, const SVEContFault fault,
6149                        sve_ldst1_host_fn *host_fn,
6150                        sve_ldst1_tlb_fn *tlb_fn)
6151 {
6152     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6153     int bit55 = extract64(addr, 55, 1);
6154 
6155     /* Remove mtedesc from the normal sve descriptor. */
6156     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6157 
6158     /* Perform gross MTE suppression early. */
6159     if (!tbi_check(mtedesc, bit55) ||
6160         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6161         mtedesc = 0;
6162     }
6163 
6164     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6165                   esz, msz, fault, host_fn, tlb_fn);
6166 }
6167 
6168 #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6169 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6170                                  target_ulong addr, uint32_t desc)      \
6171 {                                                                       \
6172     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6173                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6174 }                                                                       \
6175 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6176                                  target_ulong addr, uint32_t desc)      \
6177 {                                                                       \
6178     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6179                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6180 }                                                                       \
6181 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6182                                      target_ulong addr, uint32_t desc)  \
6183 {                                                                       \
6184     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6185                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6186 }                                                                       \
6187 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6188                                      target_ulong addr, uint32_t desc)  \
6189 {                                                                       \
6190     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6191                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6192 }
6193 
6194 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6195 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6196                                     target_ulong addr, uint32_t desc)   \
6197 {                                                                       \
6198     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6199                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6200 }                                                                       \
6201 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6202                                     target_ulong addr, uint32_t desc)   \
6203 {                                                                       \
6204     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6205                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6206 }                                                                       \
6207 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6208                                     target_ulong addr, uint32_t desc)   \
6209 {                                                                       \
6210     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6211                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6212 }                                                                       \
6213 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6214                                     target_ulong addr, uint32_t desc)   \
6215 {                                                                       \
6216     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6217                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6218 }                                                                       \
6219 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6220                                         target_ulong addr, uint32_t desc) \
6221 {                                                                       \
6222     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6223                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6224 }                                                                       \
6225 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6226                                         target_ulong addr, uint32_t desc) \
6227 {                                                                       \
6228     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6229                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6230 }                                                                       \
6231 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6232                                         target_ulong addr, uint32_t desc) \
6233 {                                                                       \
6234     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6235                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6236 }                                                                       \
6237 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6238                                         target_ulong addr, uint32_t desc) \
6239 {                                                                       \
6240     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6241                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6242 }
6243 
6244 DO_LDFF1_LDNF1_1(bb,  MO_8)
6245 DO_LDFF1_LDNF1_1(bhu, MO_16)
6246 DO_LDFF1_LDNF1_1(bhs, MO_16)
6247 DO_LDFF1_LDNF1_1(bsu, MO_32)
6248 DO_LDFF1_LDNF1_1(bss, MO_32)
6249 DO_LDFF1_LDNF1_1(bdu, MO_64)
6250 DO_LDFF1_LDNF1_1(bds, MO_64)
6251 
6252 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6253 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6254 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6255 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6256 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6257 
6258 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6259 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6260 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6261 
6262 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6263 
6264 #undef DO_LDFF1_LDNF1_1
6265 #undef DO_LDFF1_LDNF1_2
6266 
6267 /*
6268  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6269  */
6270 
6271 static inline QEMU_ALWAYS_INLINE
6272 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6273                uint32_t desc, const uintptr_t retaddr,
6274                const int esz, const int msz, const int N, uint32_t mtedesc,
6275                sve_ldst1_host_fn *host_fn,
6276                sve_ldst1_tlb_fn *tlb_fn)
6277 {
6278     const unsigned rd = simd_data(desc);
6279     const intptr_t reg_max = simd_oprsz(desc);
6280     intptr_t reg_off, reg_last, mem_off;
6281     SVEContLdSt info;
6282     void *host;
6283     int i, flags;
6284 
6285     /* Find the active elements.  */
6286     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6287         /* The entire predicate was false; no store occurs.  */
6288         return;
6289     }
6290 
6291     /* Probe the page(s).  Exit with exception for any invalid page. */
6292     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6293 
6294     /* Handle watchpoints for all active elements. */
6295     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6296                               BP_MEM_WRITE, retaddr);
6297 
6298     /*
6299      * Handle mte checks for all active elements.
6300      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6301      */
6302     if (mtedesc) {
6303         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6304                                 mtedesc, retaddr);
6305     }
6306 
6307     flags = info.page[0].flags | info.page[1].flags;
6308     if (unlikely(flags != 0)) {
6309 #ifdef CONFIG_USER_ONLY
6310         g_assert_not_reached();
6311 #else
6312         /*
6313          * At least one page includes MMIO.
6314          * Any bus operation can fail with cpu_transaction_failed,
6315          * which for ARM will raise SyncExternal.  We cannot avoid
6316          * this fault and will leave with the store incomplete.
6317          */
6318         mem_off = info.mem_off_first[0];
6319         reg_off = info.reg_off_first[0];
6320         reg_last = info.reg_off_last[1];
6321         if (reg_last < 0) {
6322             reg_last = info.reg_off_split;
6323             if (reg_last < 0) {
6324                 reg_last = info.reg_off_last[0];
6325             }
6326         }
6327 
6328         do {
6329             uint64_t pg = vg[reg_off >> 6];
6330             do {
6331                 if ((pg >> (reg_off & 63)) & 1) {
6332                     for (i = 0; i < N; ++i) {
6333                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6334                                addr + mem_off + (i << msz), retaddr);
6335                     }
6336                 }
6337                 reg_off += 1 << esz;
6338                 mem_off += N << msz;
6339             } while (reg_off & 63);
6340         } while (reg_off <= reg_last);
6341         return;
6342 #endif
6343     }
6344 
6345     mem_off = info.mem_off_first[0];
6346     reg_off = info.reg_off_first[0];
6347     reg_last = info.reg_off_last[0];
6348     host = info.page[0].host;
6349 
6350     while (reg_off <= reg_last) {
6351         uint64_t pg = vg[reg_off >> 6];
6352         do {
6353             if ((pg >> (reg_off & 63)) & 1) {
6354                 for (i = 0; i < N; ++i) {
6355                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6356                             host + mem_off + (i << msz));
6357                 }
6358             }
6359             reg_off += 1 << esz;
6360             mem_off += N << msz;
6361         } while (reg_off <= reg_last && (reg_off & 63));
6362     }
6363 
6364     /*
6365      * Use the slow path to manage the cross-page misalignment.
6366      * But we know this is RAM and cannot trap.
6367      */
6368     mem_off = info.mem_off_split;
6369     if (unlikely(mem_off >= 0)) {
6370         reg_off = info.reg_off_split;
6371         for (i = 0; i < N; ++i) {
6372             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6373                    addr + mem_off + (i << msz), retaddr);
6374         }
6375     }
6376 
6377     mem_off = info.mem_off_first[1];
6378     if (unlikely(mem_off >= 0)) {
6379         reg_off = info.reg_off_first[1];
6380         reg_last = info.reg_off_last[1];
6381         host = info.page[1].host;
6382 
6383         do {
6384             uint64_t pg = vg[reg_off >> 6];
6385             do {
6386                 if ((pg >> (reg_off & 63)) & 1) {
6387                     for (i = 0; i < N; ++i) {
6388                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6389                                 host + mem_off + (i << msz));
6390                     }
6391                 }
6392                 reg_off += 1 << esz;
6393                 mem_off += N << msz;
6394             } while (reg_off & 63);
6395         } while (reg_off <= reg_last);
6396     }
6397 }
6398 
6399 static inline QEMU_ALWAYS_INLINE
6400 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6401                    uint32_t desc, const uintptr_t ra,
6402                    const int esz, const int msz, const int N,
6403                    sve_ldst1_host_fn *host_fn,
6404                    sve_ldst1_tlb_fn *tlb_fn)
6405 {
6406     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6407     int bit55 = extract64(addr, 55, 1);
6408 
6409     /* Remove mtedesc from the normal sve descriptor. */
6410     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6411 
6412     /* Perform gross MTE suppression early. */
6413     if (!tbi_check(mtedesc, bit55) ||
6414         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6415         mtedesc = 0;
6416     }
6417 
6418     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6419 }
6420 
6421 #define DO_STN_1(N, NAME, ESZ)                                          \
6422 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
6423                                  target_ulong addr, uint32_t desc)      \
6424 {                                                                       \
6425     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
6426               sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
6427 }                                                                       \
6428 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
6429                                      target_ulong addr, uint32_t desc)  \
6430 {                                                                       \
6431     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
6432                   sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
6433 }
6434 
6435 #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
6436 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
6437                                     target_ulong addr, uint32_t desc)   \
6438 {                                                                       \
6439     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6440               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
6441 }                                                                       \
6442 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
6443                                     target_ulong addr, uint32_t desc)   \
6444 {                                                                       \
6445     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6446               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
6447 }                                                                       \
6448 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
6449                                         target_ulong addr, uint32_t desc) \
6450 {                                                                       \
6451     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6452                   sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
6453 }                                                                       \
6454 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
6455                                         target_ulong addr, uint32_t desc) \
6456 {                                                                       \
6457     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6458                   sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
6459 }
6460 
6461 DO_STN_1(1, bb, MO_8)
6462 DO_STN_1(1, bh, MO_16)
6463 DO_STN_1(1, bs, MO_32)
6464 DO_STN_1(1, bd, MO_64)
6465 DO_STN_1(2, bb, MO_8)
6466 DO_STN_1(3, bb, MO_8)
6467 DO_STN_1(4, bb, MO_8)
6468 
6469 DO_STN_2(1, hh, MO_16, MO_16)
6470 DO_STN_2(1, hs, MO_32, MO_16)
6471 DO_STN_2(1, hd, MO_64, MO_16)
6472 DO_STN_2(2, hh, MO_16, MO_16)
6473 DO_STN_2(3, hh, MO_16, MO_16)
6474 DO_STN_2(4, hh, MO_16, MO_16)
6475 
6476 DO_STN_2(1, ss, MO_32, MO_32)
6477 DO_STN_2(1, sd, MO_64, MO_32)
6478 DO_STN_2(2, ss, MO_32, MO_32)
6479 DO_STN_2(3, ss, MO_32, MO_32)
6480 DO_STN_2(4, ss, MO_32, MO_32)
6481 
6482 DO_STN_2(1, dd, MO_64, MO_64)
6483 DO_STN_2(2, dd, MO_64, MO_64)
6484 DO_STN_2(3, dd, MO_64, MO_64)
6485 DO_STN_2(4, dd, MO_64, MO_64)
6486 
6487 #undef DO_STN_1
6488 #undef DO_STN_2
6489 
6490 /*
6491  * Loads with a vector index.
6492  */
6493 
6494 /*
6495  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6496  */
6497 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6498 
6499 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6500 {
6501     return *(uint32_t *)(reg + H1_4(reg_ofs));
6502 }
6503 
6504 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6505 {
6506     return *(int32_t *)(reg + H1_4(reg_ofs));
6507 }
6508 
6509 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6510 {
6511     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6512 }
6513 
6514 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6515 {
6516     return (int32_t)*(uint64_t *)(reg + reg_ofs);
6517 }
6518 
6519 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6520 {
6521     return *(uint64_t *)(reg + reg_ofs);
6522 }
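/*
 * In these extractors the suffix names the form of the index vector: the
 * _s variants read 32-bit offset elements (H1_4 adjusts the index for host
 * byte order), the _d variants read 64-bit elements; zsu/zss interpret the
 * low 32 bits as unsigned/signed offsets, while zd uses all 64 bits.
 */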
6523 
6524 static inline QEMU_ALWAYS_INLINE
6525 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6526                target_ulong base, uint32_t desc, uintptr_t retaddr,
6527                uint32_t mtedesc, int esize, int msize,
6528                zreg_off_fn *off_fn,
6529                sve_ldst1_host_fn *host_fn,
6530                sve_ldst1_tlb_fn *tlb_fn)
6531 {
6532     const int mmu_idx = arm_env_mmu_index(env);
6533     const intptr_t reg_max = simd_oprsz(desc);
6534     const int scale = simd_data(desc);
6535     ARMVectorReg scratch;
6536     intptr_t reg_off;
6537     SVEHostPage info, info2;
6538 
6539     memset(&scratch, 0, reg_max);
6540     reg_off = 0;
6541     do {
6542         uint64_t pg = vg[reg_off >> 6];
6543         do {
6544             if (likely(pg & 1)) {
6545                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6546                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6547 
6548                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6549                                mmu_idx, retaddr);
6550 
6551                 if (likely(in_page >= msize)) {
6552                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
6553                         cpu_check_watchpoint(env_cpu(env), addr, msize,
6554                                              info.attrs, BP_MEM_READ, retaddr);
6555                     }
6556                     if (mtedesc && info.tagged) {
6557                         mte_check(env, mtedesc, addr, retaddr);
6558                     }
6559                     if (unlikely(info.flags & TLB_MMIO)) {
6560                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
6561                     } else {
6562                         host_fn(&scratch, reg_off, info.host);
6563                     }
6564                 } else {
6565                     /* Element crosses the page boundary. */
6566                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6567                                    MMU_DATA_LOAD, mmu_idx, retaddr);
6568                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6569                         cpu_check_watchpoint(env_cpu(env), addr,
6570                                              msize, info.attrs,
6571                                              BP_MEM_READ, retaddr);
6572                     }
6573                     if (mtedesc && info.tagged) {
6574                         mte_check(env, mtedesc, addr, retaddr);
6575                     }
6576                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
6577                 }
6578             }
6579             reg_off += esize;
6580             pg >>= esize;
6581         } while (reg_off & 63);
6582     } while (reg_off < reg_max);
6583 
6584     /* Wait until all exceptions have been raised to write back.  */
6585     memcpy(vd, &scratch, reg_max);
6586 }
6587 
6588 static inline QEMU_ALWAYS_INLINE
6589 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6590                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6591                    int esize, int msize, zreg_off_fn *off_fn,
6592                    sve_ldst1_host_fn *host_fn,
6593                    sve_ldst1_tlb_fn *tlb_fn)
6594 {
6595     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6596     /* Remove mtedesc from the normal sve descriptor. */
6597     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6598 
6599     /*
6600      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6601      * offset base entirely over the address space hole to change the
6602      * pointer tag, or change the bit55 selector.  So we could examine
6603      * TBI + TCMA here, as we do for sve_ldN_r_mte().
6604      */
6605     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6606               esize, msize, off_fn, host_fn, tlb_fn);
6607 }
6608 
6609 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6610 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6611                                  void *vm, target_ulong base, uint32_t desc) \
6612 {                                                                            \
6613     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
6614               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6615 }                                                                            \
6616 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6617      void *vm, target_ulong base, uint32_t desc)                             \
6618 {                                                                            \
6619     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
6620                   off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6621 }
6622 
6623 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6624 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6625                                  void *vm, target_ulong base, uint32_t desc) \
6626 {                                                                            \
6627     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
6628               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6629 }                                                                            \
6630 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6631     void *vm, target_ulong base, uint32_t desc)                              \
6632 {                                                                            \
6633     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
6634                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6635 }
6636 
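/*
 * For the instantiations below, read the arguments off the macros above:
 * MEM names the memory access and extension exactly as for the contiguous
 * loads (e.g. bdu == bytes zero-extended into 64-bit elements, with _le/_be
 * variants once the memory element is wider than a byte), OFS selects one
 * of the off_* extractors for the index vector, and MSZ is the memory
 * element size, passed on as 1 << MSZ.
 */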
6637 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6638 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6639 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6640 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6641 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6642 
6643 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6644 DO_LD1_ZPZ_S(bss, zss, MO_8)
6645 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6646 DO_LD1_ZPZ_D(bds, zss, MO_8)
6647 DO_LD1_ZPZ_D(bds, zd, MO_8)
6648 
6649 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6650 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6651 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6652 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6653 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6654 
6655 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6656 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6657 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6658 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6659 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6660 
6661 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6662 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6663 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6664 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6665 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6666 
6667 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6668 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6669 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6670 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6671 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6672 
6673 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6674 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6675 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6676 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6677 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6678 
6679 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6680 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6681 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6682 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6683 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6684 
6685 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6686 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6687 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6688 
6689 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6690 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6691 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6692 
6693 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6694 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6695 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6696 
6697 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6698 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6699 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6700 
6701 #undef DO_LD1_ZPZ_S
6702 #undef DO_LD1_ZPZ_D
6703 
6704 /* First-fault loads with a vector index.  */
6705 
6706 /*
6707  * Common helpers for all gather first-faulting loads.
6708  */
6709 
6710 static inline QEMU_ALWAYS_INLINE
6711 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6712                  target_ulong base, uint32_t desc, uintptr_t retaddr,
6713                  uint32_t mtedesc, const int esz, const int msz,
6714                  zreg_off_fn *off_fn,
6715                  sve_ldst1_host_fn *host_fn,
6716                  sve_ldst1_tlb_fn *tlb_fn)
6717 {
6718     const int mmu_idx = arm_env_mmu_index(env);
6719     const intptr_t reg_max = simd_oprsz(desc);
6720     const int scale = simd_data(desc);
6721     const int esize = 1 << esz;
6722     const int msize = 1 << msz;
6723     intptr_t reg_off;
6724     SVEHostPage info;
6725     target_ulong addr, in_page;
6726     ARMVectorReg scratch;
6727 
6728     /* Skip to the first true predicate.  */
6729     reg_off = find_next_active(vg, 0, reg_max, esz);
6730     if (unlikely(reg_off >= reg_max)) {
6731         /* The entire predicate was false; no load occurs.  */
6732         memset(vd, 0, reg_max);
6733         return;
6734     }
6735 
6736     /* Protect against overlap between vd and vm. */
6737     if (unlikely(vd == vm)) {
6738         vm = memcpy(&scratch, vm, reg_max);
6739     }
6740 
6741     /*
6742      * Probe the first element, allowing faults.
6743      */
6744     addr = base + (off_fn(vm, reg_off) << scale);
6745     if (mtedesc) {
6746         mte_check(env, mtedesc, addr, retaddr);
6747     }
6748     tlb_fn(env, vd, reg_off, addr, retaddr);
6749 
6750     /* After any fault, zero the other elements. */
6751     swap_memzero(vd, reg_off);
6752     reg_off += esize;
6753     swap_memzero(vd + reg_off, reg_max - reg_off);
6754 
6755     /*
6756      * Probe the remaining elements, not allowing faults.
6757      */
6758     while (reg_off < reg_max) {
6759         uint64_t pg = vg[reg_off >> 6];
6760         do {
6761             if (likely((pg >> (reg_off & 63)) & 1)) {
6762                 addr = base + (off_fn(vm, reg_off) << scale);
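                /* Bytes remaining on this page, from addr to the page end. */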
6763                 in_page = -(addr | TARGET_PAGE_MASK);
6764 
6765                 if (unlikely(in_page < msize)) {
6766                     /* Stop if the element crosses a page boundary. */
6767                     goto fault;
6768                 }
6769 
6770                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6771                                mmu_idx, retaddr);
6772                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6773                     goto fault;
6774                 }
6775                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6776                     (cpu_watchpoint_address_matches
6777                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6778                     goto fault;
6779                 }
6780                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
6781                     goto fault;
6782                 }
6783 
6784                 host_fn(vd, reg_off, info.host);
6785             }
6786             reg_off += esize;
6787         } while (reg_off & 63);
6788     }
6789     return;
6790 
6791  fault:
6792     record_fault(env, reg_off, reg_max);
6793 }
6794 
6795 static inline QEMU_ALWAYS_INLINE
6796 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6797                      target_ulong base, uint32_t desc, uintptr_t retaddr,
6798                      const int esz, const int msz,
6799                      zreg_off_fn *off_fn,
6800                      sve_ldst1_host_fn *host_fn,
6801                      sve_ldst1_tlb_fn *tlb_fn)
6802 {
6803     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6804     /* Remove mtedesc from the normal sve descriptor. */
6805     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6806 
6807     /*
6808      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6809      * offset base entirely over the address space hole to change the
6810      * pointer tag, or change the bit55 selector.  So we could here
6811      * examine TBI + TCMA like we do for sve_ldN_r_mte().
6812      */
6813     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6814                 esz, msz, off_fn, host_fn, tlb_fn);
6815 }
6816 
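/*
 * Each use of these macros expands to a pair of helpers: the plain
 * first-fault gather, and an _mte variant that peels the MTE
 * descriptor off the top of desc before handing over to the common
 * code above.
 */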
6817 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
6818 void HELPER(sve_ldff##MEM##_##OFS)                                      \
6819     (CPUARMState *env, void *vd, void *vg,                              \
6820      void *vm, target_ulong base, uint32_t desc)                        \
6821 {                                                                       \
6822     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
6823                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6824 }                                                                       \
6825 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
6826     (CPUARMState *env, void *vd, void *vg,                              \
6827      void *vm, target_ulong base, uint32_t desc)                        \
6828 {                                                                       \
6829     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
6830                     off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6831 }
6832 
6833 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
6834 void HELPER(sve_ldff##MEM##_##OFS)                                      \
6835     (CPUARMState *env, void *vd, void *vg,                              \
6836      void *vm, target_ulong base, uint32_t desc)                        \
6837 {                                                                       \
6838     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
6839                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6840 }                                                                       \
6841 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
6842     (CPUARMState *env, void *vd, void *vg,                              \
6843      void *vm, target_ulong base, uint32_t desc)                        \
6844 {                                                                       \
6845     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
6846                     off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6847 }
6848 
6849 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6850 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6851 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6852 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6853 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6854 
6855 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6856 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6857 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6858 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6859 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6860 
6861 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6862 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6863 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6864 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6865 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6866 
6867 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6868 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6869 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6870 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6871 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6872 
6873 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6874 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6875 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6876 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6877 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6878 
6879 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6880 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6881 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6882 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6883 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6884 
6885 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
6886 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
6887 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6888 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6889 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6890 
6891 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
6892 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
6893 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6894 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6895 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6896 
6897 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6898 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6899 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6900 
6901 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6902 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6903 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6904 
6905 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6906 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6907 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6908 
6909 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6910 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6911 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6912 
6913 /* Stores with a vector index.  */
6914 
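/*
 * Scatter stores are done in two passes.  The first pass probes every
 * active element for writability, raising any MMU fault, watchpoint
 * hit or MTE check failure before a single byte has been stored, and
 * records the host address of elements that can be stored directly.
 * The second pass performs the stores, via the recorded host pointer
 * where available and the slow tlb_fn path otherwise (MMIO or
 * page-crossing elements).  Unlike the gather helpers above, esize
 * and msize here are byte counts rather than log2 values.
 */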
6915 static inline QEMU_ALWAYS_INLINE
6916 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6917                target_ulong base, uint32_t desc, uintptr_t retaddr,
6918                uint32_t mtedesc, int esize, int msize,
6919                zreg_off_fn *off_fn,
6920                sve_ldst1_host_fn *host_fn,
6921                sve_ldst1_tlb_fn *tlb_fn)
6922 {
6923     const int mmu_idx = arm_env_mmu_index(env);
6924     const intptr_t reg_max = simd_oprsz(desc);
6925     const int scale = simd_data(desc);
6926     void *host[ARM_MAX_VQ * 4];
6927     intptr_t reg_off, i;
6928     SVEHostPage info, info2;
6929 
6930     /*
6931      * Probe all of the elements for host addresses and flags.
6932      */
6933     i = reg_off = 0;
6934     do {
6935         uint64_t pg = vg[reg_off >> 6];
6936         do {
6937             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6938             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6939 
6940             host[i] = NULL;
6941             if (likely((pg >> (reg_off & 63)) & 1)) {
6942                 if (likely(in_page >= msize)) {
6943                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6944                                    mmu_idx, retaddr);
6945                     if (!(info.flags & TLB_MMIO)) {
6946                         host[i] = info.host;
6947                     }
6948                 } else {
6949                     /*
6950                      * Element crosses the page boundary.
6951                      * Probe both pages, but do not record the host address,
6952                      * so that we use the slow path.
6953                      */
6954                     sve_probe_page(&info, false, env, addr, 0,
6955                                    MMU_DATA_STORE, mmu_idx, retaddr);
6956                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6957                                    MMU_DATA_STORE, mmu_idx, retaddr);
6958                     info.flags |= info2.flags;
6959                 }
6960 
6961                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6962                     cpu_check_watchpoint(env_cpu(env), addr, msize,
6963                                          info.attrs, BP_MEM_WRITE, retaddr);
6964                 }
6965 
6966                 if (mtedesc && info.tagged) {
6967                     mte_check(env, mtedesc, addr, retaddr);
6968                 }
6969             }
6970             i += 1;
6971             reg_off += esize;
6972         } while (reg_off & 63);
6973     } while (reg_off < reg_max);
6974 
6975     /*
6976      * Now that we have recognized all exceptions except SyncExternal
6977      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6978      *
6979      * Note for the common case of an element in RAM, not crossing a page
6980      * boundary, we have stored the host address in host[].  This doubles
6981      * as a first-level check against the predicate, since only enabled
6982      * elements have non-null host addresses.
6983      */
6984     i = reg_off = 0;
6985     do {
6986         void *h = host[i];
6987         if (likely(h != NULL)) {
6988             host_fn(vd, reg_off, h);
6989         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6990             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6991             tlb_fn(env, vd, reg_off, addr, retaddr);
6992         }
6993         i += 1;
6994         reg_off += esize;
6995     } while (reg_off < reg_max);
6996 }
6997 
6998 static inline QEMU_ALWAYS_INLINE
6999 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7000                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7001                    int esize, int msize, zreg_off_fn *off_fn,
7002                    sve_ldst1_host_fn *host_fn,
7003                    sve_ldst1_tlb_fn *tlb_fn)
7004 {
7005     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7006     /* Remove mtedesc from the normal sve descriptor. */
7007     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7008 
7009     /*
7010      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7011      * offset base entirely over the address space hole to change the
7012      * pointer tag, or change the bit55 selector.  So we could here
7013      * examine TBI + TCMA like we do for sve_ldN_r_mte().
7014      */
7015     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7016               esize, msize, off_fn, host_fn, tlb_fn);
7017 }
7018 
7019 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7020 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7021                                  void *vm, target_ulong base, uint32_t desc) \
7022 {                                                                       \
7023     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7024               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7025 }                                                                       \
7026 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7027     void *vm, target_ulong base, uint32_t desc)                         \
7028 {                                                                       \
7029     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7030                   off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7031 }
7032 
7033 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7034 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7035                                  void *vm, target_ulong base, uint32_t desc) \
7036 {                                                                       \
7037     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7038               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7039 }                                                                       \
7040 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7041     void *vm, target_ulong base, uint32_t desc)                         \
7042 {                                                                       \
7043     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7044                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7045 }
7046 
7047 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7048 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7049 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7050 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7051 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7052 
7053 DO_ST1_ZPZ_S(bs, zss, MO_8)
7054 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7055 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7056 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7057 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7058 
7059 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7060 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7061 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7062 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7063 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7064 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7065 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7066 
7067 DO_ST1_ZPZ_D(bd, zss, MO_8)
7068 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7069 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7070 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7071 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7072 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7073 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7074 
7075 DO_ST1_ZPZ_D(bd, zd, MO_8)
7076 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7077 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7078 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7079 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7080 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7081 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7082 
7083 #undef DO_ST1_ZPZ_S
7084 #undef DO_ST1_ZPZ_D
7085 
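/*
 * SVE2 bitwise ternary operations.  These are element-size agnostic,
 * so are computed 64 bits at a time:
 *   EOR3:  d = n ^ m ^ k
 *   BCAX:  d = n ^ (m & ~k)
 *   BSL1N: d = (~n & k) | (m & ~k)
 *   BSL2N: d = (n & k) | (~m & ~k)
 *   NBSL:  d = ~((n & k) | (m & ~k))
 */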
7086 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7087 {
7088     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7089     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7090 
7091     for (i = 0; i < opr_sz; ++i) {
7092         d[i] = n[i] ^ m[i] ^ k[i];
7093     }
7094 }
7095 
7096 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7097 {
7098     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7099     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7100 
7101     for (i = 0; i < opr_sz; ++i) {
7102         d[i] = n[i] ^ (m[i] & ~k[i]);
7103     }
7104 }
7105 
7106 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7107 {
7108     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7109     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7110 
7111     for (i = 0; i < opr_sz; ++i) {
7112         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7113     }
7114 }
7115 
7116 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7117 {
7118     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7119     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7120 
7121     for (i = 0; i < opr_sz; ++i) {
7122         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7123     }
7124 }
7125 
7126 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7127 {
7128     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7129     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7130 
7131     for (i = 0; i < opr_sz; ++i) {
7132         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7133     }
7134 }
7135 
7136 /*
7137  * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7138  * See hasless(v,1) from
7139  *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7140  */
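/*
 * For example, with esz == MO_8: n is replicated into every byte and
 * XORed with m0 and m1, so any byte of m0 that equals n becomes a zero
 * byte of cmp0.  (cmp0 - 0x01...01) & ~cmp0 then sets the sign bit of
 * the lowest zero byte (borrows may set further sign bits above it)
 * and sets no sign bit at all when cmp0 has no zero byte, so
 * "(cmp0 | cmp1) & signs" is nonzero iff some element of m0 or m1
 * matched n.  do_histseg_cnt below pays for two extra operations to
 * get an exact per-byte result.
 */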
7141 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7142 {
7143     int bits = 8 << esz;
7144     uint64_t ones = dup_const(esz, 1);
7145     uint64_t signs = ones << (bits - 1);
7146     uint64_t cmp0, cmp1;
7147 
7148     cmp1 = dup_const(esz, n);
7149     cmp0 = cmp1 ^ m0;
7150     cmp1 = cmp1 ^ m1;
7151     cmp0 = (cmp0 - ones) & ~cmp0;
7152     cmp1 = (cmp1 - ones) & ~cmp1;
7153     return (cmp0 | cmp1) & signs;
7154 }
7155 
7156 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7157                                 uint32_t desc, int esz, bool nmatch)
7158 {
7159     uint16_t esz_mask = pred_esz_masks[esz];
7160     intptr_t opr_sz = simd_oprsz(desc);
7161     uint32_t flags = PREDTEST_INIT;
7162     intptr_t i, j, k;
7163 
7164     for (i = 0; i < opr_sz; i += 16) {
7165         uint64_t m0 = *(uint64_t *)(vm + i);
7166         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7167         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7168         uint16_t out = 0;
7169 
7170         for (j = 0; j < 16; j += 8) {
7171             uint64_t n = *(uint64_t *)(vn + i + j);
7172 
7173             for (k = 0; k < 8; k += 1 << esz) {
7174                 if (pg & (1 << (j + k))) {
7175                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
7176                     out |= (o ^ nmatch) << (j + k);
7177                 }
7178             }
7179         }
7180         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7181         flags = iter_predtest_fwd(out, pg, flags);
7182     }
7183     return flags;
7184 }
7185 
7186 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7187 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7188 {                                                                             \
7189     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7190 }
7191 
7192 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7193 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7194 
7195 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7196 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7197 
7198 #undef DO_PPZZ_MATCH
7199 
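/*
 * HISTCNT: for each active element i, count the active elements j <= i
 * for which m[j] == n[i]; inactive elements of the result are zeroed.
 * This is the straightforward O(n^2) formulation.
 */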
7200 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7201                             uint32_t desc)
7202 {
7203     ARMVectorReg scratch;
7204     intptr_t i, j;
7205     intptr_t opr_sz = simd_oprsz(desc);
7206     uint32_t *d = vd, *n = vn, *m = vm;
7207     uint8_t *pg = vg;
7208 
7209     if (d == n) {
7210         n = memcpy(&scratch, n, opr_sz);
7211         if (d == m) {
7212             m = n;
7213         }
7214     } else if (d == m) {
7215         m = memcpy(&scratch, m, opr_sz);
7216     }
7217 
7218     for (i = 0; i < opr_sz; i += 4) {
7219         uint64_t count = 0;
7220         uint8_t pred;
7221 
7222         pred = pg[H1(i >> 3)] >> (i & 7);
7223         if (pred & 1) {
7224             uint32_t nn = n[H4(i >> 2)];
7225 
7226             for (j = 0; j <= i; j += 4) {
7227                 pred = pg[H1(j >> 3)] >> (j & 7);
7228                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7229                     ++count;
7230                 }
7231             }
7232         }
7233         d[H4(i >> 2)] = count;
7234     }
7235 }
7236 
7237 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7238                             uint32_t desc)
7239 {
7240     ARMVectorReg scratch;
7241     intptr_t i, j;
7242     intptr_t opr_sz = simd_oprsz(desc);
7243     uint64_t *d = vd, *n = vn, *m = vm;
7244     uint8_t *pg = vg;
7245 
7246     if (d == n) {
7247         n = memcpy(&scratch, n, opr_sz);
7248         if (d == m) {
7249             m = n;
7250         }
7251     } else if (d == m) {
7252         m = memcpy(&scratch, m, opr_sz);
7253     }
7254 
7255     for (i = 0; i < opr_sz / 8; ++i) {
7256         uint64_t count = 0;
7257         if (pg[H1(i)] & 1) {
7258             uint64_t nn = n[i];
7259             for (j = 0; j <= i; ++j) {
7260                 if ((pg[H1(j)] & 1) && nn == m[j]) {
7261                     ++count;
7262                 }
7263             }
7264         }
7265         d[i] = count;
7266     }
7267 }
7268 
7269 /*
7270  * Returns the number of bytes in m0 and m1 that match n.
7271  * Unlike do_match2, we don't just need true/false; we need an exact count.
7272  * This requires two extra logical operations.
7273  */
7274 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7275 {
7276     const uint64_t mask = dup_const(MO_8, 0x7f);
7277     uint64_t cmp0, cmp1;
7278 
7279     cmp1 = dup_const(MO_8, n);
7280     cmp0 = cmp1 ^ m0;
7281     cmp1 = cmp1 ^ m1;
7282 
7283     /*
7284      * 1: clear msb of each byte to avoid carry to next byte (& mask)
7285      * 2: carry in to msb if byte != 0 (+ mask)
7286      * 3: set msb if cmp has msb set (| cmp)
7287      * 4: set ~msb to ignore them (| mask)
7288      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7289      * 5: invert, resulting in 0x80 if and only if byte == 0.
7290      */
7291     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7292     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7293 
7294     /*
7295      * Combine the two compares in a way that the bits do
7296      * not overlap, and so preserves the count of set bits.
7297      * If the host has an efficient instruction for ctpop,
7298      * then ctpop(x) + ctpop(y) has the same number of
7299      * operations as ctpop(x | (y >> 1)).  If the host does
7300      * not have an efficient ctpop, then we only want to
7301      * use it once.
7302      */
7303     return ctpop64(cmp0 | (cmp1 >> 1));
7304 }
7305 
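/*
 * HISTSEG: within each 128-bit segment, each byte of the result counts
 * how many of the 16 bytes of m in that segment equal the
 * corresponding byte of n.  A count is at most 16, so OR-ing it into
 * the output at its byte position cannot spill into the next byte.
 */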
7306 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7307 {
7308     intptr_t i, j;
7309     intptr_t opr_sz = simd_oprsz(desc);
7310 
7311     for (i = 0; i < opr_sz; i += 16) {
7312         uint64_t n0 = *(uint64_t *)(vn + i);
7313         uint64_t m0 = *(uint64_t *)(vm + i);
7314         uint64_t n1 = *(uint64_t *)(vn + i + 8);
7315         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7316         uint64_t out0 = 0;
7317         uint64_t out1 = 0;
7318 
7319         for (j = 0; j < 64; j += 8) {
7320             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7321             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7322             out0 |= cnt0 << j;
7323             out1 |= cnt1 << j;
7324         }
7325 
7326         *(uint64_t *)(vd + i) = out0;
7327         *(uint64_t *)(vd + i + 8) = out1;
7328     }
7329 }
7330 
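/*
 * XAR: exclusive-or, then rotate each element right by the immediate.
 * There is no per-lane rotate across a packed uint64_t, so the byte
 * and halfword forms use a pair of masked shifts: "mask" keeps the
 * bits produced by the right shift, ~mask those from the left shift.
 */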
7331 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7332 {
7333     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7334     int shr = simd_data(desc);
7335     int shl = 8 - shr;
7336     uint64_t mask = dup_const(MO_8, 0xff >> shr);
7337     uint64_t *d = vd, *n = vn, *m = vm;
7338 
7339     for (i = 0; i < opr_sz; ++i) {
7340         uint64_t t = n[i] ^ m[i];
7341         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7342     }
7343 }
7344 
7345 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7346 {
7347     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7348     int shr = simd_data(desc);
7349     int shl = 16 - shr;
7350     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7351     uint64_t *d = vd, *n = vn, *m = vm;
7352 
7353     for (i = 0; i < opr_sz; ++i) {
7354         uint64_t t = n[i] ^ m[i];
7355         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7356     }
7357 }
7358 
7359 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7360 {
7361     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7362     int shr = simd_data(desc);
7363     uint32_t *d = vd, *n = vn, *m = vm;
7364 
7365     for (i = 0; i < opr_sz; ++i) {
7366         d[i] = ror32(n[i] ^ m[i], shr);
7367     }
7368 }
7369 
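/*
 * FMMLA: each 128-bit segment holds a 2x2 matrix in element order
 * [ e0 e1 ; e2 e3 ].  The expansion below computes D = A + N * M^T
 * per segment: result row i, column j accumulates
 * n[i][0]*m[j][0] + n[i][1]*m[j][1] onto a[i][j].
 */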
7370 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7371                      void *status, uint32_t desc)
7372 {
7373     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7374 
7375     for (s = 0; s < opr_sz; ++s) {
7376         float32 *n = vn + s * sizeof(float32) * 4;
7377         float32 *m = vm + s * sizeof(float32) * 4;
7378         float32 *a = va + s * sizeof(float32) * 4;
7379         float32 *d = vd + s * sizeof(float32) * 4;
7380         float32 n00 = n[H4(0)], n01 = n[H4(1)];
7381         float32 n10 = n[H4(2)], n11 = n[H4(3)];
7382         float32 m00 = m[H4(0)], m01 = m[H4(1)];
7383         float32 m10 = m[H4(2)], m11 = m[H4(3)];
7384         float32 p0, p1;
7385 
7386         /* i = 0, j = 0 */
7387         p0 = float32_mul(n00, m00, status);
7388         p1 = float32_mul(n01, m01, status);
7389         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7390 
7391         /* i = 0, j = 1 */
7392         p0 = float32_mul(n00, m10, status);
7393         p1 = float32_mul(n01, m11, status);
7394         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7395 
7396         /* i = 1, j = 0 */
7397         p0 = float32_mul(n10, m00, status);
7398         p1 = float32_mul(n11, m01, status);
7399         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7400 
7401         /* i = 1, j = 1 */
7402         p0 = float32_mul(n10, m10, status);
7403         p1 = float32_mul(n11, m11, status);
7404         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7405     }
7406 }
7407 
7408 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7409                      void *status, uint32_t desc)
7410 {
7411     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7412 
7413     for (s = 0; s < opr_sz; ++s) {
7414         float64 *n = vn + s * sizeof(float64) * 4;
7415         float64 *m = vm + s * sizeof(float64) * 4;
7416         float64 *a = va + s * sizeof(float64) * 4;
7417         float64 *d = vd + s * sizeof(float64) * 4;
7418         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7419         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7420         float64 p0, p1;
7421 
7422         /* i = 0, j = 0 */
7423         p0 = float64_mul(n00, m00, status);
7424         p1 = float64_mul(n01, m01, status);
7425         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7426 
7427         /* i = 0, j = 1 */
7428         p0 = float64_mul(n00, m10, status);
7429         p1 = float64_mul(n01, m11, status);
7430         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7431 
7432         /* i = 1, j = 0 */
7433         p0 = float64_mul(n10, m00, status);
7434         p1 = float64_mul(n11, m01, status);
7435         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7436 
7437         /* i = 1, j = 1 */
7438         p0 = float64_mul(n10, m10, status);
7439         p1 = float64_mul(n11, m11, status);
7440         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7441     }
7442 }
7443 
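/*
 * FCVTNT: convert each active wide element of vn and write the
 * narrowed result into the most significant (odd-numbered) narrow
 * half of the same slot in vd; the even-numbered narrow elements of
 * vd, and all inactive slots, are left unchanged.
 */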
7444 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7445 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7446 {                                                                             \
7447     intptr_t i = simd_oprsz(desc);                                            \
7448     uint64_t *g = vg;                                                         \
7449     do {                                                                      \
7450         uint64_t pg = g[(i - 1) >> 6];                                        \
7451         do {                                                                  \
7452             i -= sizeof(TYPEW);                                               \
7453             if (likely((pg >> (i & 63)) & 1)) {                               \
7454                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7455                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7456             }                                                                 \
7457         } while (i & 63);                                                     \
7458     } while (i != 0);                                                         \
7459 }
7460 
7461 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7462 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7463 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7464 
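/*
 * FCVTLT is the widening counterpart: each active wide result element
 * is converted from the narrow value held in the top half of the same
 * slot of vn.
 */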
7465 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7466 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7467 {                                                                             \
7468     intptr_t i = simd_oprsz(desc);                                            \
7469     uint64_t *g = vg;                                                         \
7470     do {                                                                      \
7471         uint64_t pg = g[(i - 1) >> 6];                                        \
7472         do {                                                                  \
7473             i -= sizeof(TYPEW);                                               \
7474             if (likely((pg >> (i & 63)) & 1)) {                               \
7475                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7476                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7477             }                                                                 \
7478         } while (i & 63);                                                     \
7479     } while (i != 0);                                                         \
7480 }
7481 
7482 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7483 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7484 
7485 #undef DO_FCVTLT
7486 #undef DO_FCVTNT
7487