xref: /openbmc/qemu/target/arm/tcg/sve_helper.c (revision d2dfe0b5)
1 /*
2  * ARM SVE Operations
3  *
4  * Copyright (c) 2018 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg.h"
28 #include "vec_internal.h"
29 #include "sve_ldst_internal.h"
30 #include "hw/core/tcg-cpu-ops.h"
31 
32 
/*
 * Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/*
 * This is an iterative function, called for each Pd and Pg word
 * moving forward.
 *
 * @d:     current 64-bit word of the predicate under test
 * @g:     matching 64-bit word of the governing predicate
 * @flags: value returned for the previous word, or PREDTEST_INIT
 *         for the first word
 *
 * Returns the updated flags.  Bit 2 is internal bookkeeping recording
 * that a governing bit has already been seen.  A word with no governing
 * bits leaves @flags untouched.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            /*
             * (g & -g) isolates the lowest set bit of G.  Widen the
             * comparison result to uint32_t before shifting so that we
             * never shift a signed int into its sign bit.
             */
            flags |= (uint32_t)((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
64 
/*
 * Backward-walking counterpart of the PredTest iteration: called once
 * per Pd/Pg word pair, starting from the last word and moving toward
 * the first.  @flags carries the state between calls; bit 2 marks that
 * a governing bit has already been encountered.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    /* Words without any governing bit contribute nothing.  */
    if (!likely(g)) {
        return flags;
    }

    if ((flags & 4) == 0) {
        /*
         * First governed word seen while walking backward, which holds
         * the architecturally last element: C comes from here.
         * Adding 3 sets bit 2 and removes the C bit that
         * PREDTEST_INIT started with.
         */
        flags += 3;
        if ((d & pow2floor(g)) == 0) {
            flags |= 1;
        }
    }

    /* Z accumulates across every word.  */
    if (d & g) {
        flags |= 2;
    }

    /*
     * N comes from the lowest governed bit of the lowest governed word,
     * so each call overwrites the value left by the previous one.
     */
    return deposit32(flags, 31, 1, (d & (g & -g)) != 0);
}
86 
87 /* The same for a single word predicate.  */
88 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
89 {
90     return iter_predtest_fwd(d, g, PREDTEST_INIT);
91 }
92 
93 /* The same for a multi-word predicate.  */
94 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
95 {
96     uint32_t flags = PREDTEST_INIT;
97     uint64_t *d = vd, *g = vg;
98     uintptr_t i = 0;
99 
100     do {
101         flags = iter_predtest_fwd(d[i], g[i], flags);
102     } while (++i < words);
103 
104     return flags;
105 }
106 
/*
 * Expand one predicate byte into a 64-bit mask for 32-bit elements:
 * bit 0 selects the low 32 bits, bit 4 selects the high 32 bits.
 * All other bits of @byte are ignored.
 */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    uint64_t mask = 0;

    if (byte & 0x01) {
        mask |= 0x00000000ffffffffull;
    }
    if (byte & 0x10) {
        mask |= 0xffffffff00000000ull;
    }
    return mask;
}
117 
118 #define LOGICAL_PPPP(NAME, FUNC) \
119 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
120 {                                                                         \
121     uintptr_t opr_sz = simd_oprsz(desc);                                  \
122     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
123     uintptr_t i;                                                          \
124     for (i = 0; i < opr_sz / 8; ++i) {                                    \
125         d[i] = FUNC(n[i], m[i], g[i]);                                    \
126     }                                                                     \
127 }
128 
129 #define DO_AND(N, M, G)  (((N) & (M)) & (G))
130 #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
131 #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
132 #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
133 #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
134 #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
135 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
136 #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
137 
138 LOGICAL_PPPP(sve_and_pppp, DO_AND)
139 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
140 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
141 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
142 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
143 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
144 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
145 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
146 
147 #undef DO_AND
148 #undef DO_BIC
149 #undef DO_EOR
150 #undef DO_ORR
151 #undef DO_ORN
152 #undef DO_NOR
153 #undef DO_NAND
154 #undef DO_SEL
155 #undef LOGICAL_PPPP
156 
/*
 * Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file:
 * H adjusts the byte offset of each element, and must match sizeof(TYPE).
 *
 * ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    intptr_t off = 0;                                                   \
    while (off < total) {                                               \
        /* 16 predicate bits govern the next 16 vector bytes.  */       \
        uint16_t pred = *(uint16_t *)(vg + H1_2(off >> 3));             \
        do {                                                            \
            /* Inactive elements leave the destination unchanged.  */   \
            if (pred & 1) {                                             \
                TYPE lhs = *(TYPE *)(vn + H(off));                      \
                TYPE rhs = *(TYPE *)(vm + H(off));                      \
                *(TYPE *)(vd + H(off)) = OP(lhs, rhs);                  \
            }                                                           \
            off += sizeof(TYPE);                                        \
            pred >>= sizeof(TYPE);                                      \
        } while (off & 15);                                             \
    }                                                                   \
}
181 
/*
 * Predicated three-operand expander specialized for 64-bit elements.
 * Each element is governed by bit 0 of its own predicate byte, so the
 * vectors can be indexed directly as arrays of TYPE.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    TYPE *dst = vd, *src_n = vn, *src_m = vm;                   \
    uint8_t *pred = vg;                                         \
    intptr_t elt, count = simd_oprsz(desc) / 8;                 \
    for (elt = 0; elt < count; elt++) {                         \
        if (!(pred[H1(elt)] & 1)) {                             \
            continue;                                           \
        }                                                       \
        TYPE lhs = src_n[elt];                                  \
        TYPE rhs = src_m[elt];                                  \
        dst[elt] = OP(lhs, rhs);                                \
    }                                                           \
}
196 
/*
 * Element-wise operators for the predicated expanders.  Every argument
 * is parenthesized so that the macros remain correct when handed an
 * arbitrary expression rather than a plain identifier.
 */
#define DO_AND(N, M)  ((N) & (M))
#define DO_EOR(N, M)  ((N) ^ (M))
#define DO_ORR(N, M)  ((N) | (M))
#define DO_BIC(N, M)  ((N) & ~(M))
#define DO_ADD(N, M)  ((N) + (M))
#define DO_SUB(N, M)  ((N) - (M))
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  ((N) * (M))


/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) \
    (unlikely((M) == 0) ? 0 : unlikely((M) == -1) ? -(N) : (N) / (M))
#define DO_UDIV(N, M) (unlikely((M) == 0) ? 0 : (N) / (M))
218 
/*
 * Predicated integer helpers.  Each operation is instantiated for
 * byte, halfword and word elements through DO_ZPZZ, with the index
 * macro matching the element size (H1, H1_2, H1_4), and for doubleword
 * elements through DO_ZPZZ_D.
 */

/* Bitwise AND.  */
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

/* Bitwise OR.  */
DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

/* Bitwise exclusive OR.  */
DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

/* Bit clear: N & ~M.  */
DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

/* Addition, wrapping at the element size.  */
DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

/* Subtraction, wrapping at the element size.  */
DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

/* Signed maximum: the signed TYPE makes DO_MAX compare as signed.  */
DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

/* Unsigned maximum.  */
DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

/* Signed minimum.  */
DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)

/* Unsigned minimum.  */
DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

/* Signed absolute difference.  */
DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)

/* Unsigned absolute difference.  */
DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
278 
/*
 * High half of an element-sized multiply.
 *
 * The parameter type is at least twice as wide as the element, so the
 * caller's signed or unsigned element arrives correctly sign- or
 * zero-extended and the same helper serves both.  The multiplication
 * itself is carried out in the matching unsigned type: the low bits of
 * the product are identical either way, and unsigned arithmetic cannot
 * overflow (0xffff * 0xffff does not fit in int32_t, nor does
 * 0xffffffff * 0xffffffff fit in int64_t).  The return type truncates
 * the shifted product to the high half of the double-width result.
 */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return ((uint32_t)n * (uint32_t)m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return ((uint32_t)n * (uint32_t)m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return ((uint64_t)n * (uint64_t)m) >> 32;
}
295 
/*
 * High 64 bits of a 64x64 multiply.  The work is delegated to muls64()
 * and mulu64(), which are expected to deliver the signed and unsigned
 * 128-bit product as separate low and high halves; only the high half
 * is returned.
 */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t low_half, high_half;

    muls64(&low_half, &high_half, n, m);
    return high_half;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t low_half, high_half;

    mulu64(&low_half, &high_half, n, m);
    return high_half;
}
309 
/* Predicated multiply, keeping the low half of the product.  */
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

/*
 * Predicated signed multiply returning the high half.  The signed TYPE
 * makes the operands sign-extend into the wider do_mulh_* parameters.
 */
DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

/* Predicated unsigned multiply returning the high half.  */
DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

/* Predicated signed divide; only word and doubleword forms exist here.  */
DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

/* Predicated unsigned divide.  */
DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
330 
331 /* Note that all bits of the shift are significant
332    and not modulo the element size.  */
333 #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
334 #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
335 #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
336 
337 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
338 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
339 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
340 
341 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
342 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
343 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
344 
345 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
346 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
347 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
348 
349 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
350 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
351 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
352 
/*
 * Signed add-and-accumulate-long pairwise: split @n into its two
 * half-width signed halves and add both to the accumulator @m.
 *
 * The accumulation wraps at the element size.  For the 32- and 64-bit
 * forms it is therefore performed in the unsigned type, which gives the
 * same bit pattern without relying on signed overflow wrapping.  The
 * 16-bit form is computed in int, where it cannot overflow.
 */
static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t lo = n, hi = n >> 8;
    return m + lo + hi;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t lo = n, hi = n >> 16;
    return (uint32_t)m + lo + hi;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t lo = n, hi = n >> 32;
    return (uint64_t)m + lo + hi;
}
370 
/*
 * Predicated signed pairwise add-and-accumulate.  There is no byte
 * form: each result element is built from two half-width inputs.
 */
DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
374 
/*
 * Unsigned add-and-accumulate-long pairwise: add the low and high
 * halves of @n, each zero-extended, to the accumulator @m.  The sum
 * wraps at the element size.
 */
static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    return m + (n & 0xff) + (n >> 8);
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    return m + (n & 0xffff) + (n >> 16);
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    return m + (n & 0xffffffffu) + (n >> 32);
}
392 
/*
 * Predicated unsigned pairwise add-and-accumulate.  There is no byte
 * form: each result element is built from two half-width inputs.
 */
DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
396 
/*
 * Signed rounding shift left, built on the shared do_sqrshl_* helpers.
 * The fourth argument (third for _d) is 'true' here and in the other
 * rounding variants, so it is presumably the rounding flag.  The final
 * NULL is the saturation pointer; passing NULL presumably disables
 * saturation -- confirm against the helper definitions.
 */
#define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
406 
/*
 * Unsigned rounding shift left, built on the shared do_uqrshl_*
 * helpers with a NULL saturation pointer.  The data is unsigned but the
 * shift count is cast back to a signed value of the element width for
 * the byte and halfword forms, because DO_ZPZZ loads both operands with
 * the same unsigned TYPE.
 */
#define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
416 
417 /*
418  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
419  * We pass in a pointer to a dummy saturation field to trigger
420  * the saturating arithmetic but discard the information about
421  * whether it has occurred.
422  */
423 #define do_sqshl_b(n, m) \
424    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
425 #define do_sqshl_h(n, m) \
426    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
427 #define do_sqshl_s(n, m) \
428    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
429 #define do_sqshl_d(n, m) \
430    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
431 
432 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
433 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
434 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
435 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
436 
437 #define do_uqshl_b(n, m) \
438    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
439 #define do_uqshl_h(n, m) \
440    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
441 #define do_uqshl_s(n, m) \
442    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
443 #define do_uqshl_d(n, m) \
444    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
445 
446 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
447 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
448 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
449 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
450 
451 #define do_sqrshl_b(n, m) \
452    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
453 #define do_sqrshl_h(n, m) \
454    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
455 #define do_sqrshl_s(n, m) \
456    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
457 #define do_sqrshl_d(n, m) \
458    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
459 
460 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
461 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
462 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
463 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
464 
465 #undef do_sqrshl_d
466 
467 #define do_uqrshl_b(n, m) \
468    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
469 #define do_uqrshl_h(n, m) \
470    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
471 #define do_uqrshl_s(n, m) \
472    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
473 #define do_uqrshl_d(n, m) \
474    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
475 
476 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
477 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
478 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
479 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
480 
481 #undef do_uqrshl_d
482 
/*
 * Halving add: (n + m) / 2 without losing the carry.
 * For 8/16/32-bit elements the sum is formed in int64_t, which is wide
 * enough for both signed and unsigned inputs.  For 64-bit elements no
 * wider type is used: each operand is halved first and the carry out of
 * the two discarded low bits (n & m & 1) is added back.
 */
#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))

/* Signed halving add.  */
DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

/* Unsigned halving add.  */
DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
495 
/*
 * Rounding halving add: (n + m + 1) / 2.
 * The 64-bit form halves each operand first and adds one back whenever
 * either discarded low bit was set ((n | m) & 1).
 */
#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

/* Signed rounding halving add.  */
DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

/* Unsigned rounding halving add.  */
DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
508 
/*
 * Halving subtract: (n - m) / 2.
 * The 64-bit form halves each operand first and subtracts the borrow
 * that arises when the low bit of n is clear and that of m is set
 * (~n & m & 1).
 */
#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

/* Signed halving subtract.  */
DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

/* Unsigned halving subtract.  */
DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
521 
/*
 * Clamp @val to the inclusive range [@min, @max].
 * The upper bound is checked first.  The result is returned as int32_t,
 * so an upper bound of UINT32_MAX comes back as its 32-bit pattern.
 */
static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    if (val >= max) {
        return max;
    }
    if (val <= min) {
        return min;
    }
    return val;
}
526 
/*
 * Signed saturating add.  The narrow forms compute the exact sum in
 * int64_t and clamp it to the element range.
 */
#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)

/*
 * 64-bit signed saturating add.  There is no wider type to hold the
 * exact sum, so the wrapped sum is formed in uint64_t (where wrapping
 * is well defined) and overflow is detected from the sign bits.
 */
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    uint64_t un = n, um = m;
    uint64_t ur = un + um;

    /*
     * Overflow happened iff the operands share a sign and the result's
     * sign differs from it.
     */
    if (((ur ^ un) & ~(un ^ um)) >> 63) {
        /* A negative wrapped result means the true sum was too large.  */
        return (ur >> 63) ? INT64_MAX : INT64_MIN;
    }
    return (int64_t)ur;
}
540 
/* Predicated signed saturating add, one helper per element size.  */
DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
545 
/*
 * Unsigned saturating add.  The narrow forms compute the exact sum in
 * int64_t and clamp it to [0, element maximum].
 */
#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

/* 64-bit unsigned saturating add: clamp to UINT64_MAX on carry out.  */
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    /* The sum carries out exactly when m exceeds the headroom above n.  */
    if (m > UINT64_MAX - n) {
        return UINT64_MAX;
    }
    return n + m;
}
555 
/* Predicated unsigned saturating add, one helper per element size.  */
DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
560 
/*
 * Signed saturating subtract.  The narrow forms compute the exact
 * difference in int64_t and clamp it to the element range.
 */
#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

/*
 * 64-bit signed saturating subtract.  The wrapped difference is formed
 * in uint64_t (where wrapping is well defined) and overflow is detected
 * from the sign bits.
 */
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    uint64_t un = n, um = m;
    uint64_t ur = un - um;

    /*
     * Overflow happened iff the operands differ in sign and the
     * result's sign differs from that of the minuend.
     */
    if (((ur ^ un) & (un ^ um)) >> 63) {
        /* A negative wrapped result means the true value was too large.  */
        return (ur >> 63) ? INT64_MAX : INT64_MIN;
    }
    return (int64_t)ur;
}
574 
/* Predicated signed saturating subtract, one helper per element size.  */
DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
579 
/*
 * Unsigned saturating subtract.  The narrow forms compute the exact
 * difference in int64_t and clamp it to [0, element maximum].
 */
#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

/* 64-bit unsigned saturating subtract: clamp to zero on borrow.  */
static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    if (n <= m) {
        return 0;
    }
    return n - m;
}
588 
/* Predicated unsigned saturating subtract, one helper per element size.  */
DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
593 
/*
 * Signed saturating add of an unsigned value: n is reinterpreted as
 * signed, m stays unsigned, and the sum is clamped to the signed
 * element range.  DO_ZPZZ loads both operands with an unsigned TYPE,
 * hence the explicit signed cast on n.
 */
#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

/*
 * 64-bit form of the above.  The sum can only exceed INT64_MAX, never
 * fall below INT64_MIN, because m is non-negative.
 */
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /*
         * Note that m - abs(n) cannot underflow.  A wrapped result above
         * INT64_MAX is either a genuinely negative sum or a very large
         * positive one; it is the latter exactly when m > abs(n).
         * abs(n) is formed by unsigned negation so that n == INT64_MIN
         * is handled without signed overflow.
         */
        if (r > INT64_MAX && m > -(uint64_t)n) {
            return INT64_MAX;
        }
    } else if (r < m || r > INT64_MAX) {
        /* Both inputs are non-negative: carry out or sign-bit overflow.  */
        return INT64_MAX;
    }
    return r;
}
623 
/*
 * Predicated signed saturating add of an unsigned value.  The elements
 * are loaded as unsigned; the operator macros cast n back to signed.
 */
DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
628 
/*
 * Unsigned saturating add of a signed value: n stays unsigned, m is
 * reinterpreted as signed, and the sum is clamped to the unsigned
 * element range.  DO_ZPZZ loads both operands with an unsigned TYPE,
 * hence the explicit signed cast on m.
 */
#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

/* 64-bit form of the above.  */
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        /*
         * Subtracting abs(m): clamp to zero when it exceeds n.
         * abs(m) is formed by unsigned negation so that m == INT64_MIN
         * is handled without signed overflow.
         */
        return n < -(uint64_t)m ? 0 : r;
    }
    /* Adding a non-negative value: clamp on carry out.  */
    return r < n ? UINT64_MAX : r;
}
645 
/*
 * Predicated unsigned saturating add of a signed value.  The elements
 * are loaded as unsigned; the operator macros cast m back to signed.
 */
DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

/* The element-wise expanders are not used past this point.  */
#undef DO_ZPZZ
#undef DO_ZPZZ_D
653 
/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements come from VN {I, I+1}.
 * If the slot I is odd, the elements come from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output,
 * since the destination may alias either source.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    const intptr_t total = simd_oprsz(desc);                    \
    intptr_t off = 0;                                           \
    while (off < total) {                                       \
        /* 16 predicate bits govern the next 16 vector bytes. */ \
        uint16_t pred = *(uint16_t *)(vg + H1_2(off >> 3));     \
        do {                                                    \
            const intptr_t even = off;                          \
            const intptr_t odd = off + sizeof(TYPE);            \
            TYPE n_even = *(TYPE *)(vn + H(even));              \
            TYPE m_even = *(TYPE *)(vm + H(even));              \
            TYPE n_odd = *(TYPE *)(vn + H(odd));                \
            TYPE m_odd = *(TYPE *)(vm + H(odd));                \
            if (pred & 1) {                                     \
                *(TYPE *)(vd + H(even)) = OP(n_even, n_odd);    \
            }                                                   \
            if ((pred >> sizeof(TYPE)) & 1) {                   \
                *(TYPE *)(vd + H(odd)) = OP(m_even, m_odd);     \
            }                                                   \
            off += 2 * sizeof(TYPE);                            \
            pred >>= 2 * sizeof(TYPE);                          \
        } while (off & 15);                                     \
    }                                                           \
}
682 
/*
 * Pairwise expander specialized for 64-bit elements.  Each element is
 * governed by bit 0 of its own predicate byte.  Both pairs are loaded
 * before either result is stored.
 */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    TYPE *dst = vd, *src_n = vn, *src_m = vm;                   \
    uint8_t *pred = vg;                                         \
    const intptr_t count = simd_oprsz(desc) / 8;                \
    intptr_t even = 0;                                          \
    while (even < count) {                                      \
        const intptr_t odd = even + 1;                          \
        TYPE n_even = src_n[even], n_odd = src_n[odd];          \
        TYPE m_even = src_m[even], m_odd = src_m[odd];          \
        if (pred[H1(even)] & 1) {                               \
            dst[even] = OP(n_even, n_odd);                      \
        }                                                       \
        if (pred[H1(odd)] & 1) {                                \
            dst[odd] = OP(m_even, m_odd);                       \
        }                                                       \
        even += 2;                                              \
    }                                                           \
}
701 
/*
 * Predicated pairwise integer helpers, one per element size.  The
 * operators are the same element-wise macros used by the non-pairwise
 * helpers; signedness comes from TYPE.
 */

/* Pairwise add.  */
DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

/* Pairwise unsigned maximum.  */
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

/* Pairwise unsigned minimum.  */
DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

/* Pairwise signed maximum.  */
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

/* Pairwise signed minimum.  */
DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

/* The integer pairwise expanders are not used past this point.  */
#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
729 
/*
 * Floating-point pairwise expander.  Even result slots combine an
 * adjacent pair taken from VN, odd result slots the same pair position
 * taken from VM.  All four inputs are loaded before any result is
 * stored.  @status is handed through to OP unchanged, and the even
 * slot's operation is always issued before the odd slot's.
 */
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    intptr_t off = 0;                                                   \
    while (off < total) {                                               \
        /* 16 predicate bits govern the next 16 vector bytes.  */       \
        uint16_t pred = *(uint16_t *)(vg + H1_2(off >> 3));             \
        do {                                                            \
            const intptr_t even = off;                                  \
            const intptr_t odd = off + sizeof(TYPE);                    \
            TYPE n_even = *(TYPE *)(vn + H(even));                      \
            TYPE m_even = *(TYPE *)(vm + H(even));                      \
            TYPE n_odd = *(TYPE *)(vn + H(odd));                        \
            TYPE m_odd = *(TYPE *)(vm + H(odd));                        \
            if (pred & 1) {                                             \
                *(TYPE *)(vd + H(even)) = OP(n_even, n_odd, status);    \
            }                                                           \
            if ((pred >> sizeof(TYPE)) & 1) {                           \
                *(TYPE *)(vd + H(odd)) = OP(m_even, m_odd, status);     \
            }                                                           \
            off += 2 * sizeof(TYPE);                                    \
            pred >>= 2 * sizeof(TYPE);                                  \
        } while (off & 15);                                             \
    }                                                                   \
}
753 
/*
 * Predicated pairwise floating-point helpers for half, single and
 * double precision.  Each operation is the softfloat routine of the
 * same name, called with the status pointer supplied to the helper.
 */

/* Pairwise add.  */
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

/* Pairwise maximum number (the maxnum variants).  */
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

/* Pairwise minimum number (the minnum variants).  */
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

/* Pairwise maximum.  */
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

/* Pairwise minimum.  */
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

/* The floating-point pairwise expander is not used past this point.  */
#undef DO_ZPZZ_PAIR_FP
775 
/*
 * Predicated three-operand expander with a "wide" third operand.
 *
 * The vector is walked in 8-byte columns.  For each column one predicate
 * byte is fetched from VG and one TYPEW value from VM; that single wide
 * value is then combined, via OP, with every TYPE-sized element of VN in
 * the column whose predicate bit is set.  Elements whose predicate bit is
 * clear leave the destination untouched.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    intptr_t off = 0;                                                   \
    while (off < total) {                                               \
        /* One predicate byte covers one 8-byte column. */              \
        uint8_t pred = *(uint8_t *)(vg + H1(off >> 3));                 \
        /* Latch the wide operand before any element is stored. */      \
        TYPEW wide = *(TYPEW *)(vm + off);                              \
        do {                                                            \
            if (pred & 1) {                                             \
                TYPE *dst = (TYPE *)(vd + H(off));                      \
                TYPE elt = *(TYPE *)(vn + H(off));                      \
                *dst = OP(elt, wide);                                   \
            }                                                           \
            /* Advance one element; its predicate bit moves to bit 0. */ \
            off += sizeof(TYPE);                                        \
            pred >>= sizeof(TYPE);                                      \
        } while (off & 7);                                              \
    }                                                                   \
}
796 
/*
 * Predicated shifts of byte, halfword and word elements by the 64-bit
 * value taken from the same 8-byte column of the third operand.  The
 * asr forms use a signed element type, the lsr/lsl forms an unsigned one.
 */
DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
810 
/*
 * General predicated two-operand expander: D[e] = OP(N[e]) for every
 * active element.
 *
 * The vector is processed 16 bytes at a time; each 16-byte chunk is
 * governed by a 16-bit predicate word read from VG.  Inactive elements
 * of the destination are left unmodified.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    const intptr_t total = simd_oprsz(desc);                    \
    intptr_t off = 0;                                           \
    while (off < total) {                                       \
        uint16_t pred = *(uint16_t *)(vg + H1_2(off >> 3));     \
        do {                                                    \
            if (pred & 1) {                                     \
                TYPE src = *(TYPE *)(vn + H(off));              \
                *(TYPE *)(vd + H(off)) = OP(src);               \
            }                                                   \
            /* Next element; shift its predicate bit to bit 0. */ \
            off += sizeof(TYPE);                                \
            pred >>= sizeof(TYPE);                              \
        } while (off & 15);                                     \
    }                                                           \
}
828 
/*
 * Predicated two-operand expander specialised for 64-bit elements.
 * Element e is governed by bit 0 of predicate byte e, so the vectors can
 * be indexed directly as arrays of TYPE.
 */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    const intptr_t count = simd_oprsz(desc) / 8;                \
    const uint8_t *pred = vg;                                   \
    TYPE *dst = vd;                                             \
    TYPE *src = vn;                                             \
    intptr_t e = 0;                                             \
    while (e < count) {                                         \
        if (pred[H1(e)] & 1) {                                  \
            TYPE val = src[e];                                  \
            dst[e] = OP(val);                                   \
        }                                                       \
        e++;                                                    \
    }                                                           \
}
843 
/*
 * Byte and halfword forms reuse the 32-bit primitive and subtract 24 or
 * 16 from its result -- presumably the bits contributed by widening the
 * element to 32 bits (confirm against clrsb32/clz32).
 */
#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

/* Predicated cls helpers, signed element types. */
DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

/* Predicated clz helpers, unsigned element types. */
DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

/* Predicated cnt helpers: a ctpop primitive exists for every width. */
DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
864 
/* Logical NOT: 1 when the element is zero, otherwise 0. */
#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

/*
 * All-ones of N's type shifted right by one is a mask of every bit except
 * the top one; ANDing with it clears the top (sign) bit of the raw
 * floating-point encoding held in an unsigned integer.
 */
#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

/* XOR with the complement of the same mask: toggles only the top bit. */
#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

/* Bitwise complement. */
#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
890 
/*
 * Extension is expressed as a cast to the narrow source type; storing
 * the result back into the wider element type then performs the sign
 * (signed cast) or zero (unsigned cast) extension.
 */
#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

/* Sign-extending helpers; the _b/_h/_w in the name is the source width. */
DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

/* Zero-extending counterparts. */
DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
911 
/* Non-saturating absolute value; instantiated on signed element types. */
#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

/* Non-saturating negation; instantiated on unsigned element types. */
#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

/* revb: apply the matching-width bswap to each active element. */
DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

/* revh: apply hswap32/hswap64 to each active element. */
DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

/* revw: apply wswap64 to each active 64-bit element. */
DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
934 
935 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
936 {
937     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
938     uint64_t *d = vd, *n = vn;
939     uint8_t *pg = vg;
940 
941     for (i = 0; i < opr_sz; i += 2) {
942         if (pg[H1(i)] & 1) {
943             uint64_t n0 = n[i + 0];
944             uint64_t n1 = n[i + 1];
945             d[i + 0] = n1;
946             d[i + 1] = n0;
947         }
948     }
949 }
950 
/* rbit: apply the matching-width revbit primitive to each active element. */
DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

/*
 * Saturating absolute value.  min_ is the value of X's type with only the
 * top bit set; an input equal to it yields -min_ - 1 instead of -x_.
 * Non-negative inputs are returned unchanged.
 */
#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

/*
 * Saturating negation: as above, the top-bit-only value maps to
 * -min_ - 1 and every other input to -x_.  Instantiated below on
 * unsigned element types.
 */
#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

/* Word-only helpers that call existing helper_*_u32 functions per element. */
DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
976 
/*
 * Unpredicated three-operand expander with a "wide" third operand.
 * For each 8-byte column one TYPEW value is read from VM and combined,
 * via OP, with every TYPE-sized element of VN in that column.
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    const intptr_t total = simd_oprsz(desc);                   \
    intptr_t off = 0;                                          \
    while (off < total) {                                      \
        /* Latch the wide operand before storing to the column. */ \
        TYPEW wide = *(TYPEW *)(vm + off);                     \
        do {                                                   \
            TYPE *dst = (TYPE *)(vd + H(off));                 \
            TYPE elt = *(TYPE *)(vn + H(off));                 \
            *dst = OP(elt, wide);                              \
            off += sizeof(TYPE);                               \
        } while (off & 7);                                     \
    }                                                          \
}
992 
/*
 * Unpredicated shifts of byte, halfword and word elements by the 64-bit
 * value from the same 8-byte column of the third operand.
 */
DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

/* Retire the per-element operation macros and the two-operand expanders. */
#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
1018 
/*
 * Unpredicated widening three-operand expander.
 *
 * Each TYPEW-sized column of the destination is computed from one
 * TYPEN-sized half of the matching column of each input.  Bit 0 of the
 * descriptor data selects the half used from VN and bit 1 the half used
 * from VM: 0 picks the half at offset 0, 1 the half at offset
 * sizeof(TYPEN).  The narrow values are converted to TYPEW before OP.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    const int n_half = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    const int m_half = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    intptr_t off;                                                       \
    for (off = 0; off < total; off += sizeof(TYPEW)) {                  \
        TYPEN *n_src = (TYPEN *)(vn + HN(off + n_half));                \
        TYPEN *m_src = (TYPEN *)(vm + HN(off + m_half));                \
        TYPEW lhs = *n_src;                                             \
        TYPEW rhs = *m_src;                                             \
        *(TYPEW *)(vd + HW(off)) = OP(lhs, rhs);                        \
    }                                                                   \
}
1035 
/*
 * Widening add, subtract, absolute-difference and multiply helpers built
 * on DO_ZZZ_TB.  Signed variants use signed wide/narrow types, unsigned
 * variants unsigned ones; the suffix names the destination element size.
 */
DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1067 
/*
 * Saturating doubling multiply producing a 16-bit result.
 * The expander hands this function operands widened from 8 bits, so
 * the product itself fits; only the doubling needs to saturate, which is
 * done by adding the product to itself with DO_SQADD_H.
 */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t product = n * m;

    return DO_SQADD_H(product, product);
}
1074 
/*
 * Saturating doubling multiply producing a 32-bit result: the product
 * is doubled by a saturating add of the product to itself.
 */
static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t product = n * m;

    return DO_SQADD_S(product, product);
}
1080 
/*
 * Saturating doubling multiply producing a 64-bit result: the product
 * is doubled by a saturating add of the product to itself.
 */
static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t product = n * m;

    return do_sqadd_d(product, product);
}
1086 
/* Widening saturating doubling multiply, one helper per result size. */
DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB
1092 
/*
 * Unpredicated "wide" three-operand expander: the first input is read at
 * full TYPEW width, the second is one TYPEN-sized half of the matching
 * column of VM, converted to TYPEW.  Bit 0 of the descriptor data
 * selects which half of VM is used.
 */
#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    const intptr_t total = simd_oprsz(desc);                   \
    const int m_half = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    intptr_t off;                                              \
    for (off = 0; off < total; off += sizeof(TYPEW)) {         \
        TYPEW *dst = (TYPEW *)(vd + HW(off));                  \
        TYPEW lhs = *(TYPEW *)(vn + HW(off));                  \
        TYPEW rhs = *(TYPEN *)(vm + HN(off + m_half));         \
        *dst = OP(lhs, rhs);                                   \
    }                                                          \
}
1104 
/*
 * Wide add/subtract helpers: a full-width element of the first input is
 * combined with a widened half-width element of the second.
 */
DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB
1122 
/*
 * Unpredicated same-width expander working on element pairs.
 *
 * The vector is walked two elements at a time.  Bit 0 of the descriptor
 * data picks which element of the pair is read from VN and written in
 * VD; bit 1 picks which element of the pair is read from VM.  The other
 * destination element of each pair is not written.
 */
#define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    const intptr_t n_slot = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    const intptr_t m_slot = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    intptr_t off;                                                       \
    for (off = 0; off < total; off += 2 * sizeof(TYPE)) {               \
        TYPE lhs = *(TYPE *)(vn + H(off + n_slot));                     \
        TYPE rhs = *(TYPE *)(vm + H(off + m_slot));                     \
        TYPE *dst = (TYPE *)(vd + H(off + n_slot));                     \
        *dst = OP(lhs, rhs);                                            \
    }                                                                   \
}
1135 
/* Interleaving exclusive-OR helpers, one per element size. */
DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB
1142 
/*
 * Unpredicated widening expander with accumulate:
 *     D = OP(widen(N.half), widen(M.half)) + A
 * Both inputs use the same half of each TYPEW column; the descriptor
 * data, multiplied by sizeof(TYPEN), gives that half's byte offset.
 */
#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    const intptr_t total = simd_oprsz(desc);                    \
    const intptr_t half = simd_data(desc) * sizeof(TYPEN);      \
    intptr_t off;                                               \
    for (off = 0; off < total; off += sizeof(TYPEW)) {          \
        TYPEW lhs = *(TYPEN *)(vn + HN(off + half));            \
        TYPEW rhs = *(TYPEN *)(vm + HN(off + half));            \
        TYPEW acc = *(TYPEW *)(va + HW(off));                   \
        TYPEW *dst = (TYPEW *)(vd + HW(off));                   \
        *dst = OP(lhs, rhs) + acc;                              \
    }                                                           \
}
1155 
/* Widening absolute-difference and accumulate. */
DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

/* Widening multiply and accumulate. */
DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

/*
 * Negated product: since the expander adds the accumulator to OP's
 * result, this yields A - N * M.
 */
#define DO_NMUL(N, M)  -(N * M)

/* Widening multiply and subtract from the accumulator. */
DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC
1183 
/*
 * Narrowing expander, "bottom" form.  Each TYPE-sized element of VN is
 * passed through OP, the result is masked to its low half
 * (sizeof(TYPE) * 4 bits) and the whole TYPE-sized destination element
 * is overwritten, so the upper half of each destination element ends
 * up zero.
 */
#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    const intptr_t total = simd_oprsz(desc);                 \
    intptr_t off;                                            \
    for (off = 0; off < total; off += sizeof(TYPE)) {        \
        TYPE wide = *(TYPE *)(vn + off);                     \
        TYPE narrowed = OP(wide) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
        *(TYPE *)(vd + off) = narrowed;                      \
    }                                                        \
}
1194 
/*
 * Narrowing expander, "top" form.  Each TYPE-sized element of VN is
 * passed through OP and the result stored as a TYPEN at byte offset
 * H(sizeof(TYPEN)) inside the matching destination element.  Only that
 * TYPEN-sized half is written; the other half of each destination
 * element keeps its previous contents.
 */
#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    const intptr_t top = H(sizeof(TYPEN));                              \
    intptr_t off;                                                       \
    for (off = 0; off < total; off += sizeof(TYPE)) {                   \
        TYPE wide = *(TYPE *)(vn + off);                                \
        TYPEN *dst = (TYPEN *)(vd + off + top);                         \
        *dst = OP(wide);                                                \
    }                                                                   \
}
1204 
/*
 * Signed saturating narrow: do_sat_bhs is called with the limits of the
 * signed type half as wide as the source element.
 */
#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

/*
 * Unsigned saturating narrow: do_sat_bhs is called with the range
 * [0, max of the unsigned type half as wide as the source element].
 */
#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

/*
 * Signed source saturated to the unsigned range: the element is loaded
 * with a signed type but narrowed with the unsigned limits above.
 */
DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT
1239 
1240 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1241 {
1242     intptr_t i, opr_sz = simd_oprsz(desc);
1243     int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1244     uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1245     uint32_t *a = va, *n = vn;
1246     uint64_t *d = vd, *m = vm;
1247 
1248     for (i = 0; i < opr_sz / 8; ++i) {
1249         uint32_t e1 = a[2 * i + H4(0)];
1250         uint32_t e2 = n[2 * i + sel] ^ inv;
1251         uint64_t c = extract64(m[i], 32, 1);
1252         /* Compute and store the entire 33-bit result at once. */
1253         d[i] = c + e1 + e2;
1254     }
1255 }
1256 
1257 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1258 {
1259     intptr_t i, opr_sz = simd_oprsz(desc);
1260     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1261     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1262     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1263 
1264     for (i = 0; i < opr_sz / 8; i += 2) {
1265         Int128 e1 = int128_make64(a[i]);
1266         Int128 e2 = int128_make64(n[i + sel] ^ inv);
1267         Int128 c = int128_make64(m[i + 1] & 1);
1268         Int128 r = int128_add(int128_add(e1, e2), c);
1269         d[i + 0] = int128_getlo(r);
1270         d[i + 1] = int128_gethi(r);
1271     }
1272 }
1273 
/*
 * Widening saturating doubling multiply with accumulate:
 *     D = SUM_OP(A, DMUL_OP(widen(N.half), widen(M.half)))
 * Bit 0 of the descriptor data selects the half of each TYPEW column
 * read from VN, bit 1 the half read from VM.
 */
#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    const int n_half = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    const int m_half = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    intptr_t off;                                                       \
    for (off = 0; off < total; off += sizeof(TYPEW)) {                  \
        TYPEW lhs = *(TYPEN *)(vn + HN(off + n_half));                  \
        TYPEW rhs = *(TYPEN *)(vm + HN(off + m_half));                  \
        TYPEW acc = *(TYPEW *)(va + HW(off));                           \
        TYPEW *dst = (TYPEW *)(vd + HW(off));                           \
        *dst = SUM_OP(acc, DMUL_OP(lhs, rhs));                          \
    }                                                                   \
}
1287 
/* Accumulating forms: the doubled product is added with saturation. */
DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqadd_d)

/* Subtracting forms: the doubled product is subtracted with saturation. */
DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL
1303 
/*
 * Complex multiply-accumulate expander.
 *
 * Elements are processed in (even, odd) pairs.  The descriptor data is a
 * rotation in the range 0..3:
 *   - bit 0 selects which element of the pair is taken from N, and which
 *     element of M feeds the even result (the other element of M feeds
 *     the odd result);
 *   - the even result passes "subtract" to OP for rotations 1 and 2;
 *   - the odd result passes "subtract" to OP for rotations 2 and 3.
 * OP receives (n, m, accumulator, subtract).
 */
#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    const intptr_t elems = simd_oprsz(desc) / sizeof(TYPE);     \
    const int rot = simd_data(desc);                            \
    const int pick = rot & 1;                                   \
    const int other = pick ^ 1;                                 \
    const bool neg_even = (rot == 1 || rot == 2);               \
    const bool neg_odd = (rot >= 2);                            \
    TYPE *dst = vd, *src_n = vn, *src_m = vm, *acc = va;        \
    intptr_t e;                                                 \
    for (e = 0; e < elems; e += 2) {                            \
        /* Read all inputs of the pair before storing to it. */ \
        TYPE n_sel = src_n[H(e + pick)];                        \
        TYPE m_same = src_m[H(e + pick)];                       \
        TYPE m_cross = src_m[H(e + other)];                     \
        dst[H(e)] = OP(n_sel, m_same, acc[H(e)], neg_even);     \
        dst[H(e + 1)] = OP(n_sel, m_cross, acc[H(e + 1)], neg_odd); \
    }                                                           \
}
1321 
/* Plain multiply-accumulate: A + N * M, or A - N * M when S is true. */
#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))

DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)

/*
 * Adapters giving the do_sqrdmlah_* functions the (N, M, A, S) shape the
 * expander expects.  The _h and _s functions take an extra uint32_t
 * pointer; a throwaway local is supplied and its value is ignored.
 * NOTE(review): the meaning of the constant 'true' argument is defined
 * by do_sqrdmlah_*, which is not visible here.
 */
#define DO_SQRDMLAH_B(N, M, A, S) \
    do_sqrdmlah_b(N, M, A, S, true)
#define DO_SQRDMLAH_H(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A, S) \
    do_sqrdmlah_d(N, M, A, S, true)

DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1342 
/*
 * Indexed complex multiply-accumulate expander.
 *
 * The vector is processed in 16-byte segments.  The descriptor data
 * holds the rotation in bits [1:0] and the index in bits [3:2]; the
 * index is doubled because it counts (even, odd) element pairs.  Within
 * each segment the indexed pair of M is read once and combined with
 * every pair of N:
 *   - bit 0 of the rotation selects which element of each N pair is
 *     used, and which element of the M pair feeds the even result;
 *   - the even result passes "subtract" to OP for rotations 1 and 2,
 *     the odd result for rotations 2 and 3.
 *
 * All element accesses, including the destination, go through the
 * H parameter so that the index adjustment matches the element size of
 * the instantiation.
 */
#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
{                                                                           \
    intptr_t i, j, oprsz = simd_oprsz(desc);                                \
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
    bool sub_r = rot == 1 || rot == 2;                                      \
    bool sub_i = rot >= 2;                                                  \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
        /* Latch the indexed pair before any store to this segment. */      \
        TYPE elt2_a = m[H(i + idx + sel_a)];                                \
        TYPE elt2_b = m[H(i + idx + sel_b)];                                \
        for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
            TYPE elt1_a = n[H(i + j + sel_a)];                              \
            d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);           \
            d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);   \
        }                                                                   \
    }                                                                       \
}
1363 
/* Indexed forms exist only for halfword and word elements. */
DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)

DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)

/* Retire the complex multiply-accumulate expanders and their adapters. */
#undef DO_CMLA
#undef DO_CMLA_FUNC
#undef DO_CMLA_IDX_FUNC
#undef DO_SQRDMLAH_B
#undef DO_SQRDMLAH_H
#undef DO_SQRDMLAH_S
#undef DO_SQRDMLAH_D
1377 
/*
 * Complex dot product step for one 32-bit accumulator.
 *
 * N and M each bundle four signed 8-bit values, i.e. two (even, odd)
 * pairs in bits [15:0] and [31:16].  For each pair the accumulator gains
 *     n.even * m[sel_a] + n.odd * m[sel_b] * sub_i
 * where sel_a/sel_b (0 or 1) pick the even or odd element of the M pair
 * and sub_i is the multiplier applied to the second product.
 */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    const int shift_a = 8 * sel_a;
    const int shift_b = 8 * sel_b;
    int pair;

    for (pair = 0; pair < 2; pair++) {
        const int base = 16 * pair;
        int32_t n_even = (int8_t)(n >> base);
        int32_t n_odd = (int8_t)(n >> (base + 8));
        int32_t m_first = (int8_t)(m >> (base + shift_a));
        int32_t m_second = (int8_t)(m >> (base + shift_b));

        a += n_even * m_first + n_odd * m_second * sub_i;
    }
    return a;
}
1392 
/*
 * Complex dot product step for one 64-bit accumulator.
 *
 * N and M each bundle four signed 16-bit values, i.e. two (even, odd)
 * pairs in bits [31:0] and [63:32].  For each pair the accumulator gains
 *     n.even * m[sel_a] + n.odd * m[sel_b] * sub_i
 * where sel_a/sel_b (0 or 1) pick the even or odd element of the M pair
 * and sub_i is the multiplier applied to the second product.
 */
static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    const int shift_a = 16 * sel_a;
    const int shift_b = 16 * sel_b;
    int pair;

    for (pair = 0; pair < 2; pair++) {
        const int base = 32 * pair;
        int64_t n_even = (int16_t)(n >> (base + 0));
        int64_t n_odd = (int16_t)(n >> (base + 16));
        int64_t m_first = (int16_t)(m >> (base + shift_a));
        int64_t m_second = (int16_t)(m >> (base + shift_b));

        a += n_even * m_first + n_odd * m_second * sub_i;
    }
    return a;
}
1406 
1407 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1408                               void *va, uint32_t desc)
1409 {
1410     int opr_sz = simd_oprsz(desc);
1411     int rot = simd_data(desc);
1412     int sel_a = rot & 1;
1413     int sel_b = sel_a ^ 1;
1414     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1415     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1416 
1417     for (int e = 0; e < opr_sz / 4; e++) {
1418         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1419     }
1420 }
1421 
1422 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1423                               void *va, uint32_t desc)
1424 {
1425     int opr_sz = simd_oprsz(desc);
1426     int rot = simd_data(desc);
1427     int sel_a = rot & 1;
1428     int sel_b = sel_a ^ 1;
1429     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1430     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1431 
1432     for (int e = 0; e < opr_sz / 8; e++) {
1433         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1434     }
1435 }
1436 
1437 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1438                              void *va, uint32_t desc)
1439 {
1440     int opr_sz = simd_oprsz(desc);
1441     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1442     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1443     int sel_a = rot & 1;
1444     int sel_b = sel_a ^ 1;
1445     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1446     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1447 
1448     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1449         uint32_t seg_m = m[seg + idx];
1450         for (int e = 0; e < 4; e++) {
1451             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1452                                    sel_a, sel_b, sub_i);
1453         }
1454     }
1455 }
1456 
1457 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1458                              void *va, uint32_t desc)
1459 {
1460     int seg, opr_sz = simd_oprsz(desc);
1461     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1462     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1463     int sel_a = rot & 1;
1464     int sel_b = sel_a ^ 1;
1465     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1466     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1467 
1468     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1469         uint64_t seg_m = m[seg + idx];
1470         for (int e = 0; e < 2; e++) {
1471             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1472                                    sel_a, sel_b, sub_i);
1473         }
1474     }
1475 }
1476 
/*
 * Expander for indexed helpers of the form d = OP(n, m[idx], a).
 *
 * The vector is walked in 16-byte segments of TYPE elements.  The m
 * pointer is advanced by H(idx) once up front, so m[base] is the selected
 * element of the segment that starts at element 'base'.  That element is
 * read once per segment, before any element of the segment is written.
 */
#define DO_ZZXZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    const intptr_t per_seg = 16 / sizeof(TYPE);                         \
    const intptr_t total = simd_oprsz(desc) / sizeof(TYPE);             \
    const intptr_t idx = simd_data(desc);                               \
    TYPE *d = vd, *n = vn, *a = va;                                     \
    TYPE *m = (TYPE *)vm + H(idx);                                      \
    intptr_t base, k;                                                   \
    for (base = 0; base < total; base += per_seg) {                     \
        TYPE mm = m[base];                                              \
        for (k = base; k < base + per_seg; k++) {                       \
            d[k] = OP(n[k], mm, a[k]);                                  \
        }                                                               \
    }                                                                   \
}
1490 
/*
 * Per-element operations for the indexed SQRDMLAH helpers.  The 16- and
 * 32-bit wrappers hand do_sqrdmlah_h/do_sqrdmlah_s the address of a local
 * word that goes out of scope with the statement expression, so whatever
 * the callee stores there is dropped.  The 64-bit form takes no such
 * pointer.
 */
#define DO_SQRDMLAH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)

DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)

/*
 * Same wrappers for the indexed SQRDMLSH helpers.  The only difference
 * from the SQRDMLAH set is the first boolean argument, which is true
 * here and false above.
 */
#define DO_SQRDMLSH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)

DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)

#undef DO_ZZXZ
1512 
/*
 * Expander for indexed helpers that combine narrow (TYPEN) inputs with a
 * wide (TYPEW) accumulator: d = OP(n, m[idx], a), evaluated in TYPEW.
 *
 * Descriptor data layout: bit 0 selects which TYPEN slot of each TYPEW
 * element of vn is read (byte offset 'sel'); bits [3:1] select the TYPEN
 * element of vm used for a whole 16-byte segment (byte offset 'idx').
 * The vm element is loaded once per segment, before the segment's
 * results are stored.
 *
 * OP receives plain identifiers (nn, mm, aa) because some OP macros paste
 * their arguments without parentheses.
 */
#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    const intptr_t oprsz = simd_oprsz(desc);                              \
    const intptr_t sel =                                                  \
        extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);              \
    const intptr_t idx =                                                  \
        extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN);          \
    intptr_t seg, off;                                                    \
    for (seg = 0; seg < oprsz; seg += 16) {                               \
        TYPEW mm = *(TYPEN *)(vm + HN(seg + idx));                        \
        for (off = seg; off < seg + 16; off += sizeof(TYPEW)) {           \
            TYPEW nn = *(TYPEN *)(vn + HN(off + sel));                    \
            TYPEW aa = *(TYPEW *)(va + HW(off));                          \
            *(TYPEW *)(vd + HW(off)) = OP(nn, mm, aa);                    \
        }                                                                 \
    }                                                                     \
}
1528 
/*
 * Accumulator plus product.  The arguments are pasted without
 * parentheses, so callers must pass simple operands.
 */
#define DO_MLA(N, M, A)  (A + N * M)

/* Indexed multiply-add helpers, signed and unsigned, 32- and 64-bit results. */
DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)

/*
 * Accumulator minus product.  Same caveat as DO_MLA: arguments are
 * pasted without parentheses.
 */
#define DO_MLS(N, M, A)  (A - N * M)

/* Indexed multiply-subtract helpers, signed and unsigned, 32- and 64-bit results. */
DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)

/* Add do_sqdmull_*(N, M) to A through DO_SQADD_S / do_sqadd_d. */
#define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)

/* Subtract do_sqdmull_*(N, M) from A through DO_SQSUB_S / do_sqsub_d. */
#define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZZXW
1558 
/*
 * Expander for indexed helpers that produce a wide (TYPEW) result from
 * two narrow (TYPEN) inputs: d = OP(n, m[idx]), evaluated in TYPEW.
 *
 * Descriptor data layout: bit 0 selects which TYPEN slot of each TYPEW
 * element of vn is read (byte offset 'sel'); bits [3:1] select the TYPEN
 * element of vm used for a whole 16-byte segment (byte offset 'idx').
 * The vm element is loaded once per segment, before the segment's
 * results are stored.
 */
#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
{                                                                         \
    const intptr_t oprsz = simd_oprsz(desc);                              \
    const intptr_t sel =                                                  \
        extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);              \
    const intptr_t idx =                                                  \
        extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN);          \
    intptr_t seg, off;                                                    \
    for (seg = 0; seg < oprsz; seg += 16) {                               \
        TYPEW mm = *(TYPEN *)(vm + HN(seg + idx));                        \
        for (off = seg; off < seg + 16; off += sizeof(TYPEW)) {           \
            TYPEW nn = *(TYPEN *)(vn + HN(off + sel));                    \
            *(TYPEW *)(vd + HW(off)) = OP(nn, mm);                        \
        }                                                                 \
    }                                                                     \
}
1573 
/* Indexed helpers whose element operation is do_sqdmull_s / do_sqdmull_d. */
DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

/* Indexed multiply of signed narrow elements, result held in the wide type. */
DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

/* Indexed multiply of unsigned narrow elements, result held in the wide type. */
DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#undef DO_ZZX
1584 
/*
 * Expander for the bit-permute helpers.  For every TYPE element, OP is
 * called with the element of vn, the element of vm and the element width
 * in bits, and its result is stored to the same position in vd.
 */
#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    const intptr_t end = simd_oprsz(desc);                     \
    intptr_t off = 0;                                          \
    while (off < end) {                                        \
        TYPE *dst = (TYPE *)(vd + off);                        \
        TYPE nn = *(TYPE *)(vn + off);                         \
        TYPE mm = *(TYPE *)(vm + off);                         \
        *dst = OP(nn, mm, sizeof(TYPE) * 8);                   \
        off += sizeof(TYPE);                                   \
    }                                                          \
}
1595 
/*
 * Gather the bits of 'data' selected by 'mask' into the low end of the
 * result.  Only bit positions 0..n-1 are examined; selected bits keep
 * their relative order and all remaining result bits are zero.
 */
static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t packed = 0;
    int out_pos = 0;

    for (int src_pos = 0; src_pos < n; src_pos++) {
        uint64_t probe = (uint64_t)1 << src_pos;

        if (!(mask & probe)) {
            continue;
        }
        if (data & probe) {
            packed |= (uint64_t)1 << out_pos;
        }
        out_pos++;
    }
    return packed;
}
1609 
/* Bit-extract helpers for 8-, 16-, 32- and 64-bit elements. */
DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1614 
/*
 * Scatter the low bits of 'data' into the positions selected by 'mask'.
 * Bit positions 0..n-1 of the mask are visited in ascending order; each
 * set mask bit consumes the next unused bit of 'data'.  Result bits at
 * unselected positions are zero.
 */
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
{
    uint64_t spread = 0;
    uint64_t pending = data;

    for (int dst_pos = 0; dst_pos < n; dst_pos++) {
        uint64_t slot = (uint64_t)1 << dst_pos;

        if (!(mask & slot)) {
            continue;
        }
        if (pending & 1) {
            spread |= slot;
        }
        pending >>= 1;
    }
    return spread;
}
1628 
/* Bit-deposit helpers for 8-, 16-, 32- and 64-bit elements. */
DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1633 
/*
 * Partition the low n bits of 'data' according to 'mask'.
 *
 * Bits of 'data' whose mask bit is set are packed, in order, into the
 * low end of the result; bits whose mask bit is clear are packed, in
 * order, immediately above them.
 */
static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
{
    uint64_t resm = 0, resu = 0;
    int db, rbm = 0, rbu = 0;

    for (db = 0; db < n; ++db) {
        uint64_t val = (data >> db) & 1;
        if ((mask >> db) & 1) {
            resm |= val << rbm++;
        } else {
            resu |= val << rbu++;
        }
    }

    /*
     * When all 64 examined bits are selected, rbm is 64 and there are no
     * unselected bits to append.  Shifting a 64-bit value by 64 is
     * undefined, so return the selected group directly.
     */
    if (rbm >= 64) {
        return resm;
    }
    return resm | (resu << rbm);
}
1650 
/* Bit-group helpers for 8-, 16-, 32- and 64-bit elements. */
DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)

#undef DO_BITPERM
1657 
/*
 * Expander for the complex-add helpers.
 *
 * Elements are consumed in adjacent pairs: the one at byte offset i is
 * named "_r" and the one at i + sizeof(TYPE) is named "_i".  Each pair of
 * vn is combined with the swapped pair of vm:
 *
 *   simd_data(desc) != 0:  d_r = ADD_OP(n_r, m_i);  d_i = SUB_OP(n_i, m_r)
 *   simd_data(desc) == 0:  d_r = SUB_OP(n_r, m_i);  d_i = ADD_OP(n_i, m_r)
 *
 * All four inputs of a pair are loaded before either result is stored.
 */
#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int sub_r = simd_data(desc);                                \
    if (sub_r) {                                                \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = ADD_OP(acc_r, el2_i);                       \
            acc_i = SUB_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    } else {                                                    \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = SUB_OP(acc_r, el2_i);                       \
            acc_i = ADD_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    }                                                           \
}
1687 
/* Complex add built on DO_ADD / DO_SUB. */
DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)

/* Complex add built on the DO_SQADD_* / DO_SQSUB_* (and do_sqadd_d / do_sqsub_d) operations. */
DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)

#undef DO_CADD
1699 
/*
 * Expander for the widening shift-left-by-immediate helpers.
 *
 * simd_data(desc) packs two fields: bit 0 selects which TYPEN slot of
 * each TYPEW element of vn is read, and the remaining bits are the shift
 * count.  The narrow element is converted to TYPEW and then shifted left.
 */
#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
{                                                              \
    const intptr_t end = simd_oprsz(desc);                     \
    const intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
    const int shift = simd_data(desc) >> 1;                    \
    intptr_t off;                                              \
    for (off = 0; off < end; off += sizeof(TYPEW)) {           \
        TYPEW wide = *(TYPEN *)(vn + HN(off + sel));           \
        *(TYPEW *)(vd + HW(off)) = wide << shift;              \
    }                                                          \
}
1711 
/* Signed element types: the narrow value is sign-extended by the conversion to TYPEW. */
DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)

/* Unsigned element types: the narrow value is zero-extended by the conversion to TYPEW. */
DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)

#undef DO_ZZI_SHLL
1721 
/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 *
 * The predicate holds one bit per vector byte and is consumed 16 bits
 * at a time: each pass of the outer loop loads the uint16_t that covers
 * the next 16 bytes of VN, and the inner loop tests bit 0 for the
 * current element before shifting PG down by sizeof(TYPEELT) bits.
 * Only elements whose predicate bit is set are folded into the result,
 * which starts at INIT.
 */
/* ??? If we were to vectorize this by hand, the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc);                 \
    TYPERED ret = INIT;                                    \
    for (i = 0; i < opr_sz; ) {                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
                ret = OP(ret, nn);                         \
            }                                              \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
        } while (i & 15);                                  \
    }                                                      \
    return (TYPERET)ret;                                   \
}
1748 
/*
 * Predicated reduction expander specialized for 64-bit elements.
 * Element i is governed by bit 0 of predicate byte H1(i); only elements
 * whose bit is set are folded into the result, which starts at INIT.
 */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    const intptr_t count = simd_oprsz(desc) / 8;           \
    TYPEE *n = vn;                                         \
    uint8_t *pg = vg;                                      \
    TYPER ret = INIT;                                      \
    intptr_t i = 0;                                        \
    while (i < count) {                                    \
        if ((pg[H1(i)] & 1) != 0) {                        \
            TYPEE nn = n[i];                               \
            ret = OP(ret, nn);                             \
        }                                                  \
        i++;                                               \
    }                                                      \
    return ret;                                            \
}
1764 
/* Bitwise OR reduction; the accumulator starts at 0. */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

/* Bitwise exclusive-OR reduction; the accumulator starts at 0. */
DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

/* Bitwise AND reduction; the accumulator starts with all bits set. */
DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

/*
 * Signed add reduction: elements are read as signed and summed into a
 * 64-bit accumulator.  No 64-bit-element variant is generated here.
 */
DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

/* Unsigned add reduction into a 64-bit accumulator. */
DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

/*
 * Signed maximum reduction; the accumulator starts at the type's minimum.
 * The 8/16/32-bit results are returned through the unsigned type of the
 * same width.
 */
DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

/* Unsigned maximum reduction; the accumulator starts at 0. */
DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

/*
 * Signed minimum reduction; the accumulator starts at the type's maximum.
 * The 8/16/32-bit results are returned through the unsigned type of the
 * same width.
 */
DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

/* Unsigned minimum reduction; the accumulator starts with all bits set. */
DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D
1811 
/*
 * Unpredicated expander: one vector source, one scalar operand, one
 * vector destination.  The 64-bit scalar is converted to TYPE once and
 * then combined with every element of vn as OP(element, scalar).
 */
#define DO_ZZI(NAME, TYPE, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
{                                                                    \
    const intptr_t count = simd_oprsz(desc) / sizeof(TYPE);          \
    TYPE *d = vd, *n = vn;                                           \
    TYPE s = s64;                                                    \
    intptr_t i = 0;                                                  \
    while (i < count) {                                              \
        d[i] = OP(n[i], s);                                          \
        i++;                                                         \
    }                                                                \
}
1822 
/*
 * Reversed subtract: second operand minus first.  Through DO_ZZI this
 * yields scalar - element.
 */
#define DO_SUBR(X, Y)   (Y - X)

DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

/* Signed maximum against the scalar. */
DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

/* Signed minimum against the scalar. */
DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

/* Unsigned maximum against the scalar. */
DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

/* Unsigned minimum against the scalar. */
DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

/* Retire the element-operation macros listed here. */
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
#undef DO_SUBR
1867 
1868 /* Similar to the ARM LastActiveElement pseudocode function, except the
1869    result is multiplied by the element size.  This includes the not found
1870    indication; e.g. not found for esz=3 is -8.  */
1871 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1872 {
1873     uint64_t mask = pred_esz_masks[esz];
1874     intptr_t i = words;
1875 
1876     do {
1877         uint64_t this_g = g[--i] & mask;
1878         if (this_g) {
1879             return i * 64 + (63 - clz64(this_g));
1880         }
1881     } while (i > 0);
1882     return (intptr_t)-1 << esz;
1883 }
1884 
1885 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1886 {
1887     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1888     uint32_t flags = PREDTEST_INIT;
1889     uint64_t *d = vd, *g = vg;
1890     intptr_t i = 0;
1891 
1892     do {
1893         uint64_t this_d = d[i];
1894         uint64_t this_g = g[i];
1895 
1896         if (this_g) {
1897             if (!(flags & 4)) {
1898                 /* Set in D the first bit of G.  */
1899                 this_d |= this_g & -this_g;
1900                 d[i] = this_d;
1901             }
1902             flags = iter_predtest_fwd(this_d, this_g, flags);
1903         }
1904     } while (++i < words);
1905 
1906     return flags;
1907 }
1908 
/*
 * Replace vd with a predicate that has at most one bit set: the first
 * bit of vg (restricted to the element-size mask) that lies after the
 * last set element of the incoming vd.  If there is no such bit, vd
 * becomes all zeros.  Returns the predicate-test flags of the new vd
 * under the masked vg.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    /*
     * Bit position one element past the last set element of vd.  When vd
     * has no set element, last_active_element returns -(1 << esz), which
     * makes the search start at bit 0.
     */
    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit.  */
    if (next < words * 64) {
        uint64_t mask = -1;

        /*
         * If the start position is inside a word, ignore the bits below
         * it in that first word and round the position down to the word
         * boundary.
         */
        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                /* Found: convert word base plus lowest set bit to a bit position.  */
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    /*
     * Rewrite every word of vd.  Only the word containing 'next' gets a
     * bit; if the search ran off the end, next / 64 matches no word and
     * all words are written as zero.
     */
    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
1952 
1953 /*
1954  * Copy Zn into Zd, and store zero into inactive elements.
1955  * If inv, store zeros into the active elements.
1956  */
1957 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1958 {
1959     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1960     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1961     uint64_t *d = vd, *n = vn;
1962     uint8_t *pg = vg;
1963 
1964     for (i = 0; i < opr_sz; i += 1) {
1965         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1966     }
1967 }
1968 
1969 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1970 {
1971     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1972     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1973     uint64_t *d = vd, *n = vn;
1974     uint8_t *pg = vg;
1975 
1976     for (i = 0; i < opr_sz; i += 1) {
1977         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1978     }
1979 }
1980 
1981 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1982 {
1983     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1984     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1985     uint64_t *d = vd, *n = vn;
1986     uint8_t *pg = vg;
1987 
1988     for (i = 0; i < opr_sz; i += 1) {
1989         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1990     }
1991 }
1992 
1993 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1994 {
1995     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1996     uint64_t *d = vd, *n = vn;
1997     uint8_t *pg = vg;
1998     uint8_t inv = simd_data(desc);
1999 
2000     for (i = 0; i < opr_sz; i += 1) {
2001         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2002     }
2003 }
2004 
/* Three-operand expander, immediate operand, controlled by a predicate.
 *
 * The immediate comes from simd_data(desc), converted to TYPE.  The
 * predicate holds one bit per vector byte and is consumed 16 bits at a
 * time: each pass of the outer loop loads the uint16_t that covers the
 * next 16 bytes, and the inner loop tests bit 0 for the current element
 * before shifting PG down by sizeof(TYPE) bits.  Elements whose
 * predicate bit is clear are not written, so the destination keeps its
 * previous contents there.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE imm = simd_data(desc);                                 \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
2023 
/*
 * Predicated immediate-operand expander specialized for 64-bit operands.
 * Element i is governed by bit 0 of predicate byte H1(i); elements whose
 * bit is clear are not written.
 */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    const intptr_t count = simd_oprsz(desc) / 8;                \
    TYPE *d = vd, *n = vn;                                      \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pg = vg;                                           \
    intptr_t i = 0;                                             \
    while (i < count) {                                         \
        if ((pg[H1(i)] & 1) != 0) {                             \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn, imm);                                 \
        }                                                       \
        i++;                                                    \
    }                                                           \
}
2039 
/*
 * Plain shifts.  Arguments are parenthesized so that compound
 * expressions (e.g. a conditional) expand as a single operand.
 */
#define DO_SHR(N, M)  ((N) >> (M))
#define DO_SHL(N, M)  ((N) << (M))

/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.  N is evaluated more than once, so
   it must be free of side effects.  */
#define DO_ASRD(N, M) \
    (((N) + ((N) < 0 ? ((__typeof(N))1 << (M)) - 1 : 0)) >> (M))
2047 
/*
 * Unsigned rounding shift right: x >> sh, plus the last bit shifted out.
 * A shift of exactly 64 leaves only the rounding contribution of bit 63;
 * larger shifts give 0.  The common path evaluates x >> (sh - 1), so it
 * is only meaningful for sh >= 1.
 */
static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        uint64_t round_bit = (x >> (sh - 1)) & 1;

        return (x >> sh) + round_bit;
    }
    return sh == 64 ? x >> 63 : 0;
}
2058 
/*
 * Signed rounding shift right: x >> sh, plus the last bit shifted out.
 * Shifts of 64 or more give 0.  The common path evaluates x >> (sh - 1),
 * so it is only meaningful for sh >= 1.
 */
static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        int64_t round_bit = (x >> (sh - 1)) & 1;

        return (x >> sh) + round_bit;
    }
    /* Rounding the sign bit always produces 0. */
    return 0;
}
2068 
/* Predicated right shift by immediate on signed element types. */
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

/* Predicated right shift by immediate on unsigned element types. */
DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

/* Predicated left shift by immediate. */
DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

/* Predicated arithmetic shift right for division (see DO_ASRD). */
DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

/* SVE2 bitwise shift by immediate */
DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)

DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)

/* Signed rounding shift right by immediate, via do_srshr. */
DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)

/* Unsigned rounding shift right by immediate, via do_urshr. */
DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)

/*
 * Two-argument wrappers around do_suqrshl_bhs / do_suqrshl_d.  Each one
 * supplies the element width (8/16/32) where needed, passes false for
 * the boolean argument, and hands over the address of a local word that
 * goes out of scope with the statement expression, so whatever the
 * callee stores there is dropped.  The byte and halfword forms cast the
 * shift operand to the signed element type first.
 */
#define do_suqrshl_b(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_suqrshl_h(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_suqrshl_s(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
#define do_suqrshl_d(n, m) \
   ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })

DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)

#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
2127 
/*
 * Expander for narrowing shifts that overwrite the whole wide element:
 * OP(wide element, shift) is truncated to TYPEN and the truncated value
 * is stored back as a full TYPEW at the same offset.  The shift count is
 * simd_data(desc).
 */
#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    const intptr_t end = simd_oprsz(desc);                   \
    int shift = simd_data(desc);                             \
    intptr_t off;                                            \
    for (off = 0; off < end; off += sizeof(TYPEW)) {         \
        TYPEW *dst = (TYPEW *)(vd + off);                    \
        TYPEW nn = *(TYPEW *)(vn + off);                     \
        *dst = (TYPEN)OP(nn, shift);                         \
    }                                                        \
}
2138 
/*
 * Expander for narrowing shifts that write only one narrow slot:
 * OP(wide element, shift) is stored as a TYPEN at byte offset
 * sizeof(TYPEN) inside each wide element of vd; the other narrow slot of
 * the destination is not written.  The shift count is simd_data(desc).
 */
#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    const intptr_t end = simd_oprsz(desc);                        \
    int shift = simd_data(desc);                                  \
    intptr_t off = 0;                                             \
    while (off < end) {                                           \
        TYPEW nn = *(TYPEW *)(vn + HW(off));                      \
        TYPEN *dst = (TYPEN *)(vd + HN(off + sizeof(TYPEN)));     \
        *dst = OP(nn, shift);                                     \
        off += sizeof(TYPEW);                                     \
    }                                                             \
}
2149 
/* Plain narrowing right shift: DO_SHRNB and DO_SHRNT forms. */
DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)

DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)

/* Rounding narrowing right shift, via do_urshr. */
DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)

DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)

/*
 * Signed input, shifted right as int64_t, then passed to do_sat_bhs
 * with the bounds [0, unsigned maximum of the narrow type].  The 64-bit
 * form caps the shift count at 63.
 */
#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
#define DO_SQSHRUN_D(x, sh) \
    do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)

DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)

DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)

/* As above, but the shift is the rounding do_srshr. */
#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)

DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)

DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)

/*
 * Signed input, plain right shift, then passed to do_sat_bhs with the
 * signed bounds of the narrow type.
 */
#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)

DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)

/* As above, but the shift is the rounding do_srshr. */
#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2205 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2206 
2207 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2208 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2209 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2210 
2211 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2212 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2213 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2214 
2215 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2216 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2217 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2218 
2219 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2220 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2221 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2222 
2223 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2224 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2225 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2226 
2227 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2228 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2229 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2230 
2231 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2232 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2233 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2234 
2235 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2236 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2237 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2238 
2239 #undef DO_SHRNB
2240 #undef DO_SHRNT
2241 
/*
 * Expand a narrowing two-input helper that writes whole wide elements.
 * For each TYPEW element, OP(n, m, SHIFT) is truncated to TYPEN and then
 * stored back as a TYPEW, so the bytes of the element above the narrow
 * result become zero.  No host-endian adjustment is needed because loads
 * and stores are full TYPEW accesses at the same offset.
 */
#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
{                                                                           \
    const intptr_t total = simd_oprsz(desc);                                \
    intptr_t ofs = 0;                                                       \
    while (ofs < total) {                                                   \
        TYPEW lhs = *(TYPEW *)(vn + ofs);                                   \
        TYPEW rhs = *(TYPEW *)(vm + ofs);                                   \
        TYPEN narrow = (TYPEN)OP(lhs, rhs, SHIFT);                          \
        *(TYPEW *)(vd + ofs) = narrow;                                      \
        ofs += sizeof(TYPEW);                                               \
    }                                                                       \
}
2252 
/*
 * Expand a narrowing two-input helper that writes only part of VD.
 * For each TYPEW element, OP(n, m, SHIFT) is stored as a TYPEN at byte
 * offset HN(i + sizeof(TYPEN)); the other bytes of VD are not written.
 * HW and HN are applied to the byte offsets of the wide loads and the
 * narrow store respectively.
 */
#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
{                                                                           \
    const intptr_t total = simd_oprsz(desc);                                \
    intptr_t ofs = 0;                                                       \
    while (ofs < total) {                                                   \
        TYPEW lhs = *(TYPEW *)(vn + HW(ofs));                               \
        TYPEW rhs = *(TYPEW *)(vm + HW(ofs));                               \
        TYPEN *slot = (TYPEN *)(vd + HN(ofs + sizeof(TYPEN)));              \
        *slot = OP(lhs, rhs, SHIFT);                                        \
        ofs += sizeof(TYPEW);                                               \
    }                                                                       \
}
2263 
2264 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2265 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2266 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2267 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2268 
2269 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2270 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2271 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2272 
2273 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2274 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2275 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2276 
2277 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2278 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2279 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2280 
2281 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2282 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2283 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2284 
2285 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2286 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2287 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2288 
2289 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2290 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2291 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2292 
2293 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2294 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2295 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2296 
2297 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2298 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2299 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2300 
2301 #undef DO_RSUBHN
2302 #undef DO_SUBHN
2303 #undef DO_RADDHN
2304 #undef DO_ADDHN
2305 
2306 #undef DO_BINOPNB
2307 
/*
 * Fully general four-operand expander, controlled by a predicate.
 * Predicate bits are fetched 16 at a time (one bit per vector byte), and
 * the lowest bit belonging to each TYPE element decides whether that
 * element is processed.  Active elements receive OP(a, n, m); inactive
 * elements of VD are left unmodified.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    const intptr_t total = simd_oprsz(desc);                  \
    intptr_t ofs = 0;                                         \
    while (ofs < total) {                                     \
        uint16_t bits = *(uint16_t *)(vg + H1_2(ofs >> 3));   \
        do {                                                  \
            if (bits & 1) {                                   \
                TYPE lhs = *(TYPE *)(vn + H(ofs));            \
                TYPE rhs = *(TYPE *)(vm + H(ofs));            \
                TYPE acc = *(TYPE *)(va + H(ofs));            \
                *(TYPE *)(vd + H(ofs)) = OP(acc, lhs, rhs);   \
            }                                                 \
            ofs += sizeof(TYPE);                              \
            bits >>= sizeof(TYPE);                            \
        } while (ofs & 15);                                   \
    }                                                         \
}
2328 
/*
 * Four-operand predicated expander specialized for 64-bit elements.
 * Each element is governed by bit 0 of its own predicate byte, so the
 * vectors can be walked as plain TYPE arrays.  Inactive elements of VD
 * are left unmodified.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    const intptr_t nelem = simd_oprsz(desc) / 8;              \
    TYPE *dst = vd, *addend = va, *src1 = vn, *src2 = vm;     \
    uint8_t *pred = vg;                                       \
    intptr_t e;                                               \
    for (e = 0; e < nelem; e++) {                             \
        TYPE acc, lhs, rhs;                                   \
        if (!(pred[H1(e)] & 1)) {                             \
            continue;                                         \
        }                                                     \
        acc = addend[e];                                      \
        lhs = src1[e];                                        \
        rhs = src2[e];                                        \
        dst[e] = OP(acc, lhs, rhs);                           \
    }                                                         \
}
2344 
2345 #define DO_MLA(A, N, M)  (A + N * M)
2346 #define DO_MLS(A, N, M)  (A - N * M)
2347 
2348 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2349 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2350 
2351 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2352 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2353 
2354 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2355 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2356 
2357 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2358 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2359 
2360 #undef DO_MLA
2361 #undef DO_MLS
2362 #undef DO_ZPZZZ
2363 #undef DO_ZPZZZ_D
2364 
2365 void HELPER(sve_index_b)(void *vd, uint32_t start,
2366                          uint32_t incr, uint32_t desc)
2367 {
2368     intptr_t i, opr_sz = simd_oprsz(desc);
2369     uint8_t *d = vd;
2370     for (i = 0; i < opr_sz; i += 1) {
2371         d[H1(i)] = start + i * incr;
2372     }
2373 }
2374 
2375 void HELPER(sve_index_h)(void *vd, uint32_t start,
2376                          uint32_t incr, uint32_t desc)
2377 {
2378     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2379     uint16_t *d = vd;
2380     for (i = 0; i < opr_sz; i += 1) {
2381         d[H2(i)] = start + i * incr;
2382     }
2383 }
2384 
2385 void HELPER(sve_index_s)(void *vd, uint32_t start,
2386                          uint32_t incr, uint32_t desc)
2387 {
2388     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2389     uint32_t *d = vd;
2390     for (i = 0; i < opr_sz; i += 1) {
2391         d[H4(i)] = start + i * incr;
2392     }
2393 }
2394 
2395 void HELPER(sve_index_d)(void *vd, uint64_t start,
2396                          uint64_t incr, uint32_t desc)
2397 {
2398     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2399     uint64_t *d = vd;
2400     for (i = 0; i < opr_sz; i += 1) {
2401         d[i] = start + i * incr;
2402     }
2403 }
2404 
2405 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2406 {
2407     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2408     uint32_t sh = simd_data(desc);
2409     uint32_t *d = vd, *n = vn, *m = vm;
2410     for (i = 0; i < opr_sz; i += 1) {
2411         d[i] = n[i] + (m[i] << sh);
2412     }
2413 }
2414 
2415 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2416 {
2417     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2418     uint64_t sh = simd_data(desc);
2419     uint64_t *d = vd, *n = vn, *m = vm;
2420     for (i = 0; i < opr_sz; i += 1) {
2421         d[i] = n[i] + (m[i] << sh);
2422     }
2423 }
2424 
2425 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2426 {
2427     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2428     uint64_t sh = simd_data(desc);
2429     uint64_t *d = vd, *n = vn, *m = vm;
2430     for (i = 0; i < opr_sz; i += 1) {
2431         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2432     }
2433 }
2434 
2435 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2436 {
2437     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2438     uint64_t sh = simd_data(desc);
2439     uint64_t *d = vd, *n = vn, *m = vm;
2440     for (i = 0; i < opr_sz; i += 1) {
2441         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2442     }
2443 }
2444 
2445 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2446 {
2447     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2448     static const uint16_t coeff[] = {
2449         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2450         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2451         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2452         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2453     };
2454     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2455     uint16_t *d = vd, *n = vn;
2456 
2457     for (i = 0; i < opr_sz; i++) {
2458         uint16_t nn = n[i];
2459         intptr_t idx = extract32(nn, 0, 5);
2460         uint16_t exp = extract32(nn, 5, 5);
2461         d[i] = coeff[idx] | (exp << 10);
2462     }
2463 }
2464 
2465 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2466 {
2467     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2468     static const uint32_t coeff[] = {
2469         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2470         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2471         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2472         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2473         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2474         0x1ef532, 0x20b051, 0x227043, 0x243516,
2475         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2476         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2477         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2478         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2479         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2480         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2481         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2482         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2483         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2484         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2485     };
2486     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2487     uint32_t *d = vd, *n = vn;
2488 
2489     for (i = 0; i < opr_sz; i++) {
2490         uint32_t nn = n[i];
2491         intptr_t idx = extract32(nn, 0, 6);
2492         uint32_t exp = extract32(nn, 6, 8);
2493         d[i] = coeff[idx] | (exp << 23);
2494     }
2495 }
2496 
2497 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2498 {
2499     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2500     static const uint64_t coeff[] = {
2501         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2502         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2503         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2504         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2505         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2506         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2507         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2508         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2509         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2510         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2511         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2512         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2513         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2514         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2515         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2516         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2517         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2518         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2519         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2520         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2521         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2522         0xFA7C1819E90D8ull,
2523     };
2524     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2525     uint64_t *d = vd, *n = vn;
2526 
2527     for (i = 0; i < opr_sz; i++) {
2528         uint64_t nn = n[i];
2529         intptr_t idx = extract32(nn, 0, 6);
2530         uint64_t exp = extract32(nn, 6, 11);
2531         d[i] = coeff[idx] | (exp << 52);
2532     }
2533 }
2534 
2535 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2536 {
2537     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2538     uint16_t *d = vd, *n = vn, *m = vm;
2539     for (i = 0; i < opr_sz; i += 1) {
2540         uint16_t nn = n[i];
2541         uint16_t mm = m[i];
2542         if (mm & 1) {
2543             nn = float16_one;
2544         }
2545         d[i] = nn ^ (mm & 2) << 14;
2546     }
2547 }
2548 
2549 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2550 {
2551     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2552     uint32_t *d = vd, *n = vn, *m = vm;
2553     for (i = 0; i < opr_sz; i += 1) {
2554         uint32_t nn = n[i];
2555         uint32_t mm = m[i];
2556         if (mm & 1) {
2557             nn = float32_one;
2558         }
2559         d[i] = nn ^ (mm & 2) << 30;
2560     }
2561 }
2562 
2563 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2564 {
2565     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2566     uint64_t *d = vd, *n = vn, *m = vm;
2567     for (i = 0; i < opr_sz; i += 1) {
2568         uint64_t nn = n[i];
2569         uint64_t mm = m[i];
2570         if (mm & 1) {
2571             nn = float64_one;
2572         }
2573         d[i] = nn ^ (mm & 2) << 62;
2574     }
2575 }
2576 
2577 /*
2578  * Signed saturating addition with scalar operand.
2579  */
2580 
2581 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2582 {
2583     intptr_t i, oprsz = simd_oprsz(desc);
2584 
2585     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2586         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2587     }
2588 }
2589 
2590 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2591 {
2592     intptr_t i, oprsz = simd_oprsz(desc);
2593 
2594     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2595         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2596     }
2597 }
2598 
2599 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2600 {
2601     intptr_t i, oprsz = simd_oprsz(desc);
2602 
2603     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2604         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2605     }
2606 }
2607 
2608 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2609 {
2610     intptr_t i, oprsz = simd_oprsz(desc);
2611 
2612     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2613         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2614     }
2615 }
2616 
2617 /*
2618  * Unsigned saturating addition with scalar operand.
2619  */
2620 
2621 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2622 {
2623     intptr_t i, oprsz = simd_oprsz(desc);
2624 
2625     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2626         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2627     }
2628 }
2629 
2630 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2631 {
2632     intptr_t i, oprsz = simd_oprsz(desc);
2633 
2634     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2635         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2636     }
2637 }
2638 
2639 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2640 {
2641     intptr_t i, oprsz = simd_oprsz(desc);
2642 
2643     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2644         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2645     }
2646 }
2647 
2648 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2649 {
2650     intptr_t i, oprsz = simd_oprsz(desc);
2651 
2652     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2653         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2654     }
2655 }
2656 
2657 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2658 {
2659     intptr_t i, oprsz = simd_oprsz(desc);
2660 
2661     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2662         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2663     }
2664 }
2665 
2666 /* Two operand predicated copy immediate with merge.  All valid immediates
2667  * can fit within 17 signed bits in the simd_data field.
2668  */
2669 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2670                          uint64_t mm, uint32_t desc)
2671 {
2672     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2673     uint64_t *d = vd, *n = vn;
2674     uint8_t *pg = vg;
2675 
2676     mm = dup_const(MO_8, mm);
2677     for (i = 0; i < opr_sz; i += 1) {
2678         uint64_t nn = n[i];
2679         uint64_t pp = expand_pred_b(pg[H1(i)]);
2680         d[i] = (mm & pp) | (nn & ~pp);
2681     }
2682 }
2683 
2684 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2685                          uint64_t mm, uint32_t desc)
2686 {
2687     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2688     uint64_t *d = vd, *n = vn;
2689     uint8_t *pg = vg;
2690 
2691     mm = dup_const(MO_16, mm);
2692     for (i = 0; i < opr_sz; i += 1) {
2693         uint64_t nn = n[i];
2694         uint64_t pp = expand_pred_h(pg[H1(i)]);
2695         d[i] = (mm & pp) | (nn & ~pp);
2696     }
2697 }
2698 
2699 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2700                          uint64_t mm, uint32_t desc)
2701 {
2702     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2703     uint64_t *d = vd, *n = vn;
2704     uint8_t *pg = vg;
2705 
2706     mm = dup_const(MO_32, mm);
2707     for (i = 0; i < opr_sz; i += 1) {
2708         uint64_t nn = n[i];
2709         uint64_t pp = expand_pred_s(pg[H1(i)]);
2710         d[i] = (mm & pp) | (nn & ~pp);
2711     }
2712 }
2713 
2714 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2715                          uint64_t mm, uint32_t desc)
2716 {
2717     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2718     uint64_t *d = vd, *n = vn;
2719     uint8_t *pg = vg;
2720 
2721     for (i = 0; i < opr_sz; i += 1) {
2722         uint64_t nn = n[i];
2723         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2724     }
2725 }
2726 
2727 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2728 {
2729     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2730     uint64_t *d = vd;
2731     uint8_t *pg = vg;
2732 
2733     val = dup_const(MO_8, val);
2734     for (i = 0; i < opr_sz; i += 1) {
2735         d[i] = val & expand_pred_b(pg[H1(i)]);
2736     }
2737 }
2738 
2739 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2740 {
2741     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2742     uint64_t *d = vd;
2743     uint8_t *pg = vg;
2744 
2745     val = dup_const(MO_16, val);
2746     for (i = 0; i < opr_sz; i += 1) {
2747         d[i] = val & expand_pred_h(pg[H1(i)]);
2748     }
2749 }
2750 
2751 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2752 {
2753     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2754     uint64_t *d = vd;
2755     uint8_t *pg = vg;
2756 
2757     val = dup_const(MO_32, val);
2758     for (i = 0; i < opr_sz; i += 1) {
2759         d[i] = val & expand_pred_s(pg[H1(i)]);
2760     }
2761 }
2762 
2763 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2764 {
2765     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2766     uint64_t *d = vd;
2767     uint8_t *pg = vg;
2768 
2769     for (i = 0; i < opr_sz; i += 1) {
2770         d[i] = (pg[H1(i)] & 1 ? val : 0);
2771     }
2772 }
2773 
/*
 * memmove for vector data.  Big-endian hosts need to frob the byte
 * indices; if destination, source and length are all 8-byte aligned
 * (or the host is little-endian), a plain memmove suffices.  Otherwise
 * the copy proceeds in the largest unit (4, 2 or 1 bytes) permitted by
 * the combined alignment, with each address passed through the matching
 * H1_* macro.  The direction is chosen so overlapping regions are safe.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    const uintptr_t dst = (uintptr_t)vd;
    const uintptr_t src = (uintptr_t)vs;
    uintptr_t misalign = (dst | src | n) & 7;
    bool ascending;
    size_t k;

#if !HOST_BIG_ENDIAN
    misalign = 0;
#endif
    if (misalign == 0) {
        memmove(vd, vs, n);
        return;
    }

    /* Copy upward unless the destination starts inside the source.  */
    ascending = dst < src || dst >= src + n;

    switch (misalign) {
    case 4:
        if (ascending) {
            for (k = 0; k < n; k += 4) {
                *(uint32_t *)H1_4(dst + k) = *(uint32_t *)H1_4(src + k);
            }
        } else {
            k = n;
            while (k != 0) {
                k -= 4;
                *(uint32_t *)H1_4(dst + k) = *(uint32_t *)H1_4(src + k);
            }
        }
        break;

    case 2:
    case 6:
        if (ascending) {
            for (k = 0; k < n; k += 2) {
                *(uint16_t *)H1_2(dst + k) = *(uint16_t *)H1_2(src + k);
            }
        } else {
            k = n;
            while (k != 0) {
                k -= 2;
                *(uint16_t *)H1_2(dst + k) = *(uint16_t *)H1_2(src + k);
            }
        }
        break;

    default:
        if (ascending) {
            for (k = 0; k < n; k++) {
                *(uint8_t *)H1(dst + k) = *(uint8_t *)H1(src + k);
            }
        } else {
            k = n;
            while (k != 0) {
                k -= 1;
                *(uint8_t *)H1(dst + k) = *(uint8_t *)H1(src + k);
            }
        }
        break;
    }
}
2833 
/*
 * memset-to-zero for vector data, with the same big-endian index
 * frobbing as swap_memmove: a plain memset when address and length are
 * 8-byte aligned (or the host is little-endian), otherwise stores of
 * 4, 2 or 1 bytes through the matching H1_* macro.
 */
static void swap_memzero(void *vd, size_t n)
{
    const uintptr_t dst = (uintptr_t)vd;
    uintptr_t misalign = (dst | n) & 7;
    size_t k;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

#if !HOST_BIG_ENDIAN
    misalign = 0;
#endif
    if (misalign == 0) {
        memset(vd, 0, n);
        return;
    }

    switch (misalign) {
    case 4:
        for (k = 0; k < n; k += 4) {
            *(uint32_t *)H1_4(dst + k) = 0;
        }
        break;

    case 2:
    case 6:
        for (k = 0; k < n; k += 2) {
            *(uint16_t *)H1_2(dst + k) = 0;
        }
        break;

    default:
        for (k = 0; k < n; k++) {
            *(uint8_t *)H1(dst + k) = 0;
        }
        break;
    }
}
2874 
2875 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2876 {
2877     intptr_t opr_sz = simd_oprsz(desc);
2878     size_t n_ofs = simd_data(desc);
2879     size_t n_siz = opr_sz - n_ofs;
2880 
2881     if (vd != vm) {
2882         swap_memmove(vd, vn + n_ofs, n_siz);
2883         swap_memmove(vd + n_siz, vm, n_ofs);
2884     } else if (vd != vn) {
2885         swap_memmove(vd + n_siz, vd, n_ofs);
2886         swap_memmove(vd, vn + n_ofs, n_siz);
2887     } else {
2888         /* vd == vn == vm.  Need temp space.  */
2889         ARMVectorReg tmp;
2890         swap_memmove(&tmp, vm, n_ofs);
2891         swap_memmove(vd, vd + n_ofs, n_siz);
2892         memcpy(vd + n_siz, &tmp, n_ofs);
2893     }
2894 }
2895 
2896 #define DO_INSR(NAME, TYPE, H) \
2897 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2898 {                                                                  \
2899     intptr_t opr_sz = simd_oprsz(desc);                            \
2900     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2901     *(TYPE *)(vd + H(0)) = val;                                    \
2902 }
2903 
2904 DO_INSR(sve_insr_b, uint8_t, H1)
2905 DO_INSR(sve_insr_h, uint16_t, H1_2)
2906 DO_INSR(sve_insr_s, uint32_t, H1_4)
2907 DO_INSR(sve_insr_d, uint64_t, H1_8)
2908 
2909 #undef DO_INSR
2910 
2911 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2912 {
2913     intptr_t i, j, opr_sz = simd_oprsz(desc);
2914     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2915         uint64_t f = *(uint64_t *)(vn + i);
2916         uint64_t b = *(uint64_t *)(vn + j);
2917         *(uint64_t *)(vd + i) = bswap64(b);
2918         *(uint64_t *)(vd + j) = bswap64(f);
2919     }
2920 }
2921 
2922 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2923 {
2924     intptr_t i, j, opr_sz = simd_oprsz(desc);
2925     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2926         uint64_t f = *(uint64_t *)(vn + i);
2927         uint64_t b = *(uint64_t *)(vn + j);
2928         *(uint64_t *)(vd + i) = hswap64(b);
2929         *(uint64_t *)(vd + j) = hswap64(f);
2930     }
2931 }
2932 
2933 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2934 {
2935     intptr_t i, j, opr_sz = simd_oprsz(desc);
2936     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2937         uint64_t f = *(uint64_t *)(vn + i);
2938         uint64_t b = *(uint64_t *)(vn + j);
2939         *(uint64_t *)(vd + i) = rol64(b, 32);
2940         *(uint64_t *)(vd + j) = rol64(f, 32);
2941     }
2942 }
2943 
2944 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2945 {
2946     intptr_t i, j, opr_sz = simd_oprsz(desc);
2947     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2948         uint64_t f = *(uint64_t *)(vn + i);
2949         uint64_t b = *(uint64_t *)(vn + j);
2950         *(uint64_t *)(vd + i) = b;
2951         *(uint64_t *)(vd + j) = f;
2952     }
2953 }
2954 
2955 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2956 
2957 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2958                            bool is_tbx, tb_impl_fn *fn)
2959 {
2960     ARMVectorReg scratch;
2961     uintptr_t oprsz = simd_oprsz(desc);
2962 
2963     if (unlikely(vd == vn)) {
2964         vn = memcpy(&scratch, vn, oprsz);
2965     }
2966 
2967     fn(vd, vn, NULL, vm, oprsz, is_tbx);
2968 }
2969 
2970 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2971                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2972 {
2973     ARMVectorReg scratch;
2974     uintptr_t oprsz = simd_oprsz(desc);
2975 
2976     if (unlikely(vd == vn0)) {
2977         vn0 = memcpy(&scratch, vn0, oprsz);
2978         if (vd == vn1) {
2979             vn1 = vn0;
2980         }
2981     } else if (unlikely(vd == vn1)) {
2982         vn1 = memcpy(&scratch, vn1, oprsz);
2983     }
2984 
2985     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2986 }
2987 
2988 #define DO_TB(SUFF, TYPE, H)                                            \
2989 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
2990                                 void *vm, uintptr_t oprsz, bool is_tbx) \
2991 {                                                                       \
2992     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
2993     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
2994     for (i = 0; i < nelem; ++i) {                                       \
2995         TYPE index = indexes[H1(i)], val = 0;                           \
2996         if (index < nelem) {                                            \
2997             val = tbl0[H(index)];                                       \
2998         } else {                                                        \
2999             index -= nelem;                                             \
3000             if (tbl1 && index < nelem) {                                \
3001                 val = tbl1[H(index)];                                   \
3002             } else if (is_tbx) {                                        \
3003                 continue;                                               \
3004             }                                                           \
3005         }                                                               \
3006         d[H(i)] = val;                                                  \
3007     }                                                                   \
3008 }                                                                       \
3009 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3010 {                                                                       \
3011     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3012 }                                                                       \
3013 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3014                              void *vm, uint32_t desc)                   \
3015 {                                                                       \
3016     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3017 }                                                                       \
3018 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3019 {                                                                       \
3020     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3021 }
3022 
3023 DO_TB(b, uint8_t, H1)
3024 DO_TB(h, uint16_t, H2)
3025 DO_TB(s, uint32_t, H4)
3026 DO_TB(d, uint64_t, H8)
3027 
3028 #undef DO_TB
3029 
3030 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3031 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3032 {                                                              \
3033     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3034     TYPED *d = vd;                                             \
3035     TYPES *n = vn;                                             \
3036     ARMVectorReg tmp;                                          \
3037     if (unlikely(vn - vd < opr_sz)) {                          \
3038         n = memcpy(&tmp, n, opr_sz / 2);                       \
3039     }                                                          \
3040     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3041         d[HD(i)] = n[HS(i)];                                   \
3042     }                                                          \
3043 }
3044 
3045 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3046 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3047 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3048 
3049 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3050 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3051 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3052 
3053 #undef DO_UNPK
3054 
/*
 * Entry N selects the even-numbered units of 2**N bits within a 64-bit
 * word.  These are the bits included in the even numbered predicates of
 * width esz; expand_bits/compress_bits reuse the table, which is why the
 * pattern is carried on as far as 16-bit units.
 */
static const uint64_t even_bit_esz_masks[5] = {
    [0] = 0x5555555555555555ull,    /*  1-bit units */
    [1] = 0x3333333333333333ull,    /*  2-bit units */
    [2] = 0x0f0f0f0f0f0f0f0full,    /*  4-bit units */
    [3] = 0x00ff00ff00ff00ffull,    /*  8-bit units */
    [4] = 0x0000ffff0000ffffull,    /* 16-bit units */
};
3066 
3067 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3068  * For N==0, this corresponds to the operation that in qemu/bitops.h
3069  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3070  * section 7-2 Shuffling Bits.
3071  */
3072 static uint64_t expand_bits(uint64_t x, int n)
3073 {
3074     int i;
3075 
3076     x &= 0xffffffffu;
3077     for (i = 4; i >= n; i--) {
3078         int sh = 1 << i;
3079         x = ((x << sh) | x) & even_bit_esz_masks[i];
3080     }
3081     return x;
3082 }
3083 
3084 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3085  * For N==0, this corresponds to the operation that in qemu/bitops.h
3086  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3087  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3088  */
3089 static uint64_t compress_bits(uint64_t x, int n)
3090 {
3091     int i;
3092 
3093     for (i = n; i <= 4; i++) {
3094         int sh = 1 << i;
3095         x &= even_bit_esz_masks[i];
3096         x = (x >> sh) | x;
3097     }
3098     return x & 0xffffffffu;
3099 }
3100 
3101 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3102 {
3103     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3104     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3105     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3106     int esize = 1 << esz;
3107     uint64_t *d = vd;
3108     intptr_t i;
3109 
3110     if (oprsz <= 8) {
3111         uint64_t nn = *(uint64_t *)vn;
3112         uint64_t mm = *(uint64_t *)vm;
3113         int half = 4 * oprsz;
3114 
3115         nn = extract64(nn, high * half, half);
3116         mm = extract64(mm, high * half, half);
3117         nn = expand_bits(nn, esz);
3118         mm = expand_bits(mm, esz);
3119         d[0] = nn | (mm << esize);
3120     } else {
3121         ARMPredicateReg tmp;
3122 
3123         /* We produce output faster than we consume input.
3124            Therefore we must be mindful of possible overlap.  */
3125         if (vd == vn) {
3126             vn = memcpy(&tmp, vn, oprsz);
3127             if (vd == vm) {
3128                 vm = vn;
3129             }
3130         } else if (vd == vm) {
3131             vm = memcpy(&tmp, vm, oprsz);
3132         }
3133         if (high) {
3134             high = oprsz >> 1;
3135         }
3136 
3137         if ((oprsz & 7) == 0) {
3138             uint32_t *n = vn, *m = vm;
3139             high >>= 2;
3140 
3141             for (i = 0; i < oprsz / 8; i++) {
3142                 uint64_t nn = n[H4(high + i)];
3143                 uint64_t mm = m[H4(high + i)];
3144 
3145                 nn = expand_bits(nn, esz);
3146                 mm = expand_bits(mm, esz);
3147                 d[i] = nn | (mm << esize);
3148             }
3149         } else {
3150             uint8_t *n = vn, *m = vm;
3151             uint16_t *d16 = vd;
3152 
3153             for (i = 0; i < oprsz / 2; i++) {
3154                 uint16_t nn = n[H1(high + i)];
3155                 uint16_t mm = m[H1(high + i)];
3156 
3157                 nn = expand_bits(nn, esz);
3158                 mm = expand_bits(mm, esz);
3159                 d16[H2(i)] = nn | (mm << esize);
3160             }
3161         }
3162     }
3163 }
3164 
3165 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3166 {
3167     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3168     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3169     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3170     uint64_t *d = vd, *n = vn, *m = vm;
3171     uint64_t l, h;
3172     intptr_t i;
3173 
3174     if (oprsz <= 8) {
3175         l = compress_bits(n[0] >> odd, esz);
3176         h = compress_bits(m[0] >> odd, esz);
3177         d[0] = l | (h << (4 * oprsz));
3178     } else {
3179         ARMPredicateReg tmp_m;
3180         intptr_t oprsz_16 = oprsz / 16;
3181 
3182         if ((vm - vd) < (uintptr_t)oprsz) {
3183             m = memcpy(&tmp_m, vm, oprsz);
3184         }
3185 
3186         for (i = 0; i < oprsz_16; i++) {
3187             l = n[2 * i + 0];
3188             h = n[2 * i + 1];
3189             l = compress_bits(l >> odd, esz);
3190             h = compress_bits(h >> odd, esz);
3191             d[i] = l | (h << 32);
3192         }
3193 
3194         /*
3195          * For VL which is not a multiple of 512, the results from M do not
3196          * align nicely with the uint64_t for D.  Put the aligned results
3197          * from M into TMP_M and then copy it into place afterward.
3198          */
3199         if (oprsz & 15) {
3200             int final_shift = (oprsz & 15) * 2;
3201 
3202             l = n[2 * i + 0];
3203             h = n[2 * i + 1];
3204             l = compress_bits(l >> odd, esz);
3205             h = compress_bits(h >> odd, esz);
3206             d[i] = l | (h << final_shift);
3207 
3208             for (i = 0; i < oprsz_16; i++) {
3209                 l = m[2 * i + 0];
3210                 h = m[2 * i + 1];
3211                 l = compress_bits(l >> odd, esz);
3212                 h = compress_bits(h >> odd, esz);
3213                 tmp_m.p[i] = l | (h << 32);
3214             }
3215             l = m[2 * i + 0];
3216             h = m[2 * i + 1];
3217             l = compress_bits(l >> odd, esz);
3218             h = compress_bits(h >> odd, esz);
3219             tmp_m.p[i] = l | (h << final_shift);
3220 
3221             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3222         } else {
3223             for (i = 0; i < oprsz_16; i++) {
3224                 l = m[2 * i + 0];
3225                 h = m[2 * i + 1];
3226                 l = compress_bits(l >> odd, esz);
3227                 h = compress_bits(h >> odd, esz);
3228                 d[oprsz_16 + i] = l | (h << 32);
3229             }
3230         }
3231     }
3232 }
3233 
3234 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3235 {
3236     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3237     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3238     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3239     uint64_t *d = vd, *n = vn, *m = vm;
3240     uint64_t mask;
3241     int shr, shl;
3242     intptr_t i;
3243 
3244     shl = 1 << esz;
3245     shr = 0;
3246     mask = even_bit_esz_masks[esz];
3247     if (odd) {
3248         mask <<= shl;
3249         shr = shl;
3250         shl = 0;
3251     }
3252 
3253     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3254         uint64_t nn = (n[i] & mask) >> shr;
3255         uint64_t mm = (m[i] & mask) << shl;
3256         d[i] = nn + mm;
3257     }
3258 }
3259 
3260 /* Reverse units of 2**N bits.  */
3261 static uint64_t reverse_bits_64(uint64_t x, int n)
3262 {
3263     int i, sh;
3264 
3265     x = bswap64(x);
3266     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3267         uint64_t mask = even_bit_esz_masks[i];
3268         x = ((x & mask) << sh) | ((x >> sh) & mask);
3269     }
3270     return x;
3271 }
3272 
/*
 * Reverse the order of the units of 2**N bits within the byte X:
 * swap nibbles, then bit pairs, then single bits, stopping once the
 * unit size 2**N has been reached.  For N >= 3 the byte is returned
 * unchanged.
 */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int level = 2;
    int sh = 4;

    while (level >= n) {
        uint8_t keep = mask[level];

        x = ((x & keep) << sh) | ((x >> sh) & keep);
        level--;
        sh >>= 1;
    }
    return x;
}
3283 
3284 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3285 {
3286     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3287     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3288     intptr_t i, oprsz_2 = oprsz / 2;
3289 
3290     if (oprsz <= 8) {
3291         uint64_t l = *(uint64_t *)vn;
3292         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3293         *(uint64_t *)vd = l;
3294     } else if ((oprsz & 15) == 0) {
3295         for (i = 0; i < oprsz_2; i += 8) {
3296             intptr_t ih = oprsz - 8 - i;
3297             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3298             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3299             *(uint64_t *)(vd + i) = h;
3300             *(uint64_t *)(vd + ih) = l;
3301         }
3302     } else {
3303         for (i = 0; i < oprsz_2; i += 1) {
3304             intptr_t il = H1(i);
3305             intptr_t ih = H1(oprsz - 1 - i);
3306             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3307             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3308             *(uint8_t *)(vd + il) = h;
3309             *(uint8_t *)(vd + ih) = l;
3310         }
3311     }
3312 }
3313 
3314 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3315 {
3316     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3317     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3318     uint64_t *d = vd;
3319     intptr_t i;
3320 
3321     if (oprsz <= 8) {
3322         uint64_t nn = *(uint64_t *)vn;
3323         int half = 4 * oprsz;
3324 
3325         nn = extract64(nn, high * half, half);
3326         nn = expand_bits(nn, 0);
3327         d[0] = nn;
3328     } else {
3329         ARMPredicateReg tmp_n;
3330 
3331         /* We produce output faster than we consume input.
3332            Therefore we must be mindful of possible overlap.  */
3333         if ((vn - vd) < (uintptr_t)oprsz) {
3334             vn = memcpy(&tmp_n, vn, oprsz);
3335         }
3336         if (high) {
3337             high = oprsz >> 1;
3338         }
3339 
3340         if ((oprsz & 7) == 0) {
3341             uint32_t *n = vn;
3342             high >>= 2;
3343 
3344             for (i = 0; i < oprsz / 8; i++) {
3345                 uint64_t nn = n[H4(high + i)];
3346                 d[i] = expand_bits(nn, 0);
3347             }
3348         } else {
3349             uint16_t *d16 = vd;
3350             uint8_t *n = vn;
3351 
3352             for (i = 0; i < oprsz / 2; i++) {
3353                 uint16_t nn = n[H1(high + i)];
3354                 d16[H2(i)] = expand_bits(nn, 0);
3355             }
3356         }
3357     }
3358 }
3359 
3360 #define DO_ZIP(NAME, TYPE, H) \
3361 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3362 {                                                                    \
3363     intptr_t oprsz = simd_oprsz(desc);                               \
3364     intptr_t odd_ofs = simd_data(desc);                              \
3365     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3366     ARMVectorReg tmp_n, tmp_m;                                       \
3367     /* We produce output faster than we consume input.               \
3368        Therefore we must be mindful of possible overlap.  */         \
3369     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3370         vn = memcpy(&tmp_n, vn, oprsz);                              \
3371     }                                                                \
3372     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3373         vm = memcpy(&tmp_m, vm, oprsz);                              \
3374     }                                                                \
3375     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3376         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3377         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3378             *(TYPE *)(vm + odd_ofs + H(i));                          \
3379     }                                                                \
3380     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3381         memset(vd + oprsz - 16, 0, 16);                              \
3382     }                                                                \
3383 }
3384 
3385 DO_ZIP(sve_zip_b, uint8_t, H1)
3386 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3387 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3388 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3389 DO_ZIP(sve2_zip_q, Int128, )
3390 
3391 #define DO_UZP(NAME, TYPE, H) \
3392 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3393 {                                                                      \
3394     intptr_t oprsz = simd_oprsz(desc);                                 \
3395     intptr_t odd_ofs = simd_data(desc);                                \
3396     intptr_t i, p;                                                     \
3397     ARMVectorReg tmp_m;                                                \
3398     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3399         vm = memcpy(&tmp_m, vm, oprsz);                                \
3400     }                                                                  \
3401     i = 0, p = odd_ofs;                                                \
3402     do {                                                               \
3403         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3404         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3405     } while (p < oprsz);                                               \
3406     p -= oprsz;                                                        \
3407     do {                                                               \
3408         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3409         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3410     } while (p < oprsz);                                               \
3411     tcg_debug_assert(i == oprsz);                                      \
3412 }
3413 
3414 DO_UZP(sve_uzp_b, uint8_t, H1)
3415 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3416 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3417 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3418 DO_UZP(sve2_uzp_q, Int128, )
3419 
3420 #define DO_TRN(NAME, TYPE, H) \
3421 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3422 {                                                                      \
3423     intptr_t oprsz = simd_oprsz(desc);                                 \
3424     intptr_t odd_ofs = simd_data(desc);                                \
3425     intptr_t i;                                                        \
3426     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3427         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3428         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3429         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3430         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3431     }                                                                  \
3432     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3433         memset(vd + oprsz - 16, 0, 16);                                \
3434     }                                                                  \
3435 }
3436 
3437 DO_TRN(sve_trn_b, uint8_t, H1)
3438 DO_TRN(sve_trn_h, uint16_t, H1_2)
3439 DO_TRN(sve_trn_s, uint32_t, H1_4)
3440 DO_TRN(sve_trn_d, uint64_t, H1_8)
3441 DO_TRN(sve2_trn_q, Int128, )
3442 
3443 #undef DO_ZIP
3444 #undef DO_UZP
3445 #undef DO_TRN
3446 
3447 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3448 {
3449     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3450     uint32_t *d = vd, *n = vn;
3451     uint8_t *pg = vg;
3452 
3453     for (i = j = 0; i < opr_sz; i++) {
3454         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3455             d[H4(j)] = n[H4(i)];
3456             j++;
3457         }
3458     }
3459     for (; j < opr_sz; j++) {
3460         d[H4(j)] = 0;
3461     }
3462 }
3463 
3464 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3465 {
3466     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3467     uint64_t *d = vd, *n = vn;
3468     uint8_t *pg = vg;
3469 
3470     for (i = j = 0; i < opr_sz; i++) {
3471         if (pg[H1(i)] & 1) {
3472             d[j] = n[i];
3473             j++;
3474         }
3475     }
3476     for (; j < opr_sz; j++) {
3477         d[j] = 0;
3478     }
3479 }
3480 
3481 /* Similar to the ARM LastActiveElement pseudocode function, except the
3482  * result is multiplied by the element size.  This includes the not found
3483  * indication; e.g. not found for esz=3 is -8.
3484  */
3485 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3486 {
3487     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3488     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3489 
3490     return last_active_element(vg, words, esz);
3491 }
3492 
3493 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3494 {
3495     intptr_t opr_sz = simd_oprsz(desc) / 8;
3496     int esz = simd_data(desc);
3497     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3498     intptr_t i, first_i, last_i;
3499     ARMVectorReg tmp;
3500 
3501     first_i = last_i = 0;
3502     first_g = last_g = 0;
3503 
3504     /* Find the extent of the active elements within VG.  */
3505     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3506         pg = *(uint64_t *)(vg + i) & mask;
3507         if (pg) {
3508             if (last_g == 0) {
3509                 last_g = pg;
3510                 last_i = i;
3511             }
3512             first_g = pg;
3513             first_i = i;
3514         }
3515     }
3516 
3517     len = 0;
3518     if (first_g != 0) {
3519         first_i = first_i * 8 + ctz64(first_g);
3520         last_i = last_i * 8 + 63 - clz64(last_g);
3521         len = last_i - first_i + (1 << esz);
3522         if (vd == vm) {
3523             vm = memcpy(&tmp, vm, opr_sz * 8);
3524         }
3525         swap_memmove(vd, vn + first_i, len);
3526     }
3527     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3528 }
3529 
3530 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3531                             void *vg, uint32_t desc)
3532 {
3533     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3534     uint64_t *d = vd, *n = vn, *m = vm;
3535     uint8_t *pg = vg;
3536 
3537     for (i = 0; i < opr_sz; i += 1) {
3538         uint64_t nn = n[i], mm = m[i];
3539         uint64_t pp = expand_pred_b(pg[H1(i)]);
3540         d[i] = (nn & pp) | (mm & ~pp);
3541     }
3542 }
3543 
3544 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3545                             void *vg, uint32_t desc)
3546 {
3547     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3548     uint64_t *d = vd, *n = vn, *m = vm;
3549     uint8_t *pg = vg;
3550 
3551     for (i = 0; i < opr_sz; i += 1) {
3552         uint64_t nn = n[i], mm = m[i];
3553         uint64_t pp = expand_pred_h(pg[H1(i)]);
3554         d[i] = (nn & pp) | (mm & ~pp);
3555     }
3556 }
3557 
3558 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3559                             void *vg, uint32_t desc)
3560 {
3561     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3562     uint64_t *d = vd, *n = vn, *m = vm;
3563     uint8_t *pg = vg;
3564 
3565     for (i = 0; i < opr_sz; i += 1) {
3566         uint64_t nn = n[i], mm = m[i];
3567         uint64_t pp = expand_pred_s(pg[H1(i)]);
3568         d[i] = (nn & pp) | (mm & ~pp);
3569     }
3570 }
3571 
3572 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3573                             void *vg, uint32_t desc)
3574 {
3575     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3576     uint64_t *d = vd, *n = vn, *m = vm;
3577     uint8_t *pg = vg;
3578 
3579     for (i = 0; i < opr_sz; i += 1) {
3580         uint64_t nn = n[i], mm = m[i];
3581         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3582     }
3583 }
3584 
3585 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3586                             void *vg, uint32_t desc)
3587 {
3588     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3589     Int128 *d = vd, *n = vn, *m = vm;
3590     uint16_t *pg = vg;
3591 
3592     for (i = 0; i < opr_sz; i += 1) {
3593         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3594     }
3595 }
3596 
3597 /* Two operand comparison controlled by a predicate.
3598  * ??? It is very tempting to want to be able to expand this inline
3599  * with x86 instructions, e.g.
3600  *
3601  *    vcmpeqw    zm, zn, %ymm0
3602  *    vpmovmskb  %ymm0, %eax
3603  *    and        $0x5555, %eax
3604  *    and        pg, %eax
3605  *
3606  * or even aarch64, e.g.
3607  *
3608  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3609  *    cmeq       v0.8h, zn, zm
3610  *    and        v0.8h, v0.8h, mask
3611  *    addv       h0, v0.8h
3612  *    and        v0.8b, pg
3613  *
3614  * However, coming up with an abstraction that allows vector inputs and
3615  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3616  * scalar outputs, is tricky.
3617  */
3618 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3619 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3620 {                                                                            \
3621     intptr_t opr_sz = simd_oprsz(desc);                                      \
3622     uint32_t flags = PREDTEST_INIT;                                          \
3623     intptr_t i = opr_sz;                                                     \
3624     do {                                                                     \
3625         uint64_t out = 0, pg;                                                \
3626         do {                                                                 \
3627             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3628             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3629             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3630             out |= nn OP mm;                                                 \
3631         } while (i & 63);                                                    \
3632         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3633         out &= pg;                                                           \
3634         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3635         flags = iter_predtest_bwd(out, pg, flags);                           \
3636     } while (i > 0);                                                         \
3637     return flags;                                                            \
3638 }
3639 
3640 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3641     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3642 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3643     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3644 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3645     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3646 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3647     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3648 
3649 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3650 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3651 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3652 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3653 
3654 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3655 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3656 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3657 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3658 
3659 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3660 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3661 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3662 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3663 
3664 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3665 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3666 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3667 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3668 
3669 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3670 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3671 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3672 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3673 
3674 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3675 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3676 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3677 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3678 
3679 #undef DO_CMP_PPZZ_B
3680 #undef DO_CMP_PPZZ_H
3681 #undef DO_CMP_PPZZ_S
3682 #undef DO_CMP_PPZZ_D
3683 #undef DO_CMP_PPZZ
3684 
3685 /* Similar, but the second source is "wide".  */
3686 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3687 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3688 {                                                                            \
3689     intptr_t opr_sz = simd_oprsz(desc);                                      \
3690     uint32_t flags = PREDTEST_INIT;                                          \
3691     intptr_t i = opr_sz;                                                     \
3692     do {                                                                     \
3693         uint64_t out = 0, pg;                                                \
3694         do {                                                                 \
3695             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3696             do {                                                             \
3697                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3698                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3699                 out |= nn OP mm;                                             \
3700             } while (i & 7);                                                 \
3701         } while (i & 63);                                                    \
3702         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3703         out &= pg;                                                           \
3704         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3705         flags = iter_predtest_bwd(out, pg, flags);                           \
3706     } while (i > 0);                                                         \
3707     return flags;                                                            \
3708 }
3709 
/*
 * Per-element-size front ends for DO_CMP_PPZW.  Each one supplies the
 * host-endian index macro for the element size together with a 64-bit
 * predicate mask whose set bits are spaced one element apart: every bit
 * for bytes, every 2nd bit for halfwords, every 4th bit for words.
 * NOTE(review): DO_CMP_PPZW is defined above this point; TYPEW is
 * presumably the type used for the wide second operand -- confirm
 * against the macro body.
 */
#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

/* Equal: signed narrow TYPE, unsigned 64-bit TYPEW.  */
DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

/* Not equal: same operand types as the equality helpers.  */
DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

/* Signed greater-than: both TYPE and TYPEW are signed.  */
DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)

/* Signed greater-than-or-equal.  */
DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)

/* Unsigned greater-than: both TYPE and TYPEW are unsigned.  */
DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

/* Unsigned greater-than-or-equal.  */
DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

/* Signed less-than.  */
DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)

/* Signed less-than-or-equal.  */
DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)

/* Unsigned less-than.  */
DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

/* Unsigned less-than-or-equal.  */
DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

/* The expander macros are local to this group of helpers.  */
#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
3761 
/*
 * Predicated compare of every element of VN against an immediate.
 *
 * The immediate comes from simd_data(desc) and is converted to TYPE.
 * The vector is walked from its highest byte offset down to zero.  For
 * each span of up to 64 bytes one 64-bit predicate word is assembled:
 * before each comparison the word is shifted left by sizeof(TYPE) and
 * the 0/1 comparison result is OR-ed into bit 0, so the result for the
 * element at byte offset K ends up in bit K % 64.  The word is then
 * masked with the governing predicate (itself restricted by MASK to the
 * bits used by this element size), stored into VD, and folded into the
 * running flags with iter_predtest_bwd().  The final flags are returned.
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
{                                                                    \
    const TYPE imm = simd_data(desc);                                \
    intptr_t ofs = simd_oprsz(desc);                                 \
    uint32_t flags = PREDTEST_INIT;                                  \
    do {                                                             \
        uint64_t word = 0;                                           \
        uint64_t guard;                                              \
        do {                                                         \
            TYPE elt;                                                \
            ofs -= sizeof(TYPE);                                     \
            word <<= sizeof(TYPE);                                   \
            elt = *(TYPE *)(vn + H(ofs));                            \
            word |= (elt OP imm);                                    \
        } while (ofs & 63);                                          \
        guard = *(uint64_t *)(vg + (ofs >> 3)) & MASK;               \
        word &= guard;                                               \
        *(uint64_t *)(vd + (ofs >> 3)) = word;                       \
        flags = iter_predtest_bwd(word, guard, flags);               \
    } while (ofs > 0);                                               \
    return flags;                                                    \
}
3784 
/*
 * Per-element-size front ends for DO_CMP_PPZI.  Each one supplies the
 * host-endian index macro for the element size and the predicate mask
 * that keeps one bit per element: every bit for bytes, every 2nd bit
 * for halfwords, every 4th for words, every 8th for doublewords.
 */
#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

/* Equal: element and immediate are both converted to unsigned TYPE.  */
DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

/* Not equal.  */
DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

/* Signed greater-than.  */
DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

/* Signed greater-than-or-equal.  */
DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

/* Unsigned greater-than.  */
DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

/* Unsigned greater-than-or-equal.  */
DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

/* Signed less-than.  */
DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

/* Signed less-than-or-equal.  */
DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

/* Unsigned less-than.  */
DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

/* Unsigned less-than-or-equal.  */
DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

/* The expander macros are local to this group of helpers.  */
#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
3849 
/*
 * Similar to the ARM LastActive pseudocode function.
 *
 * Scan the governing predicate VG from its highest 64-bit word down to
 * word 0.  At the first non-zero word, reduce it with pow2floor()
 * (presumably yielding its most significant set bit) and report whether
 * VD has that bit set.  Returns false when every word of VG is zero.
 * OPRSZ is the predicate size in bytes and is rounded up to a whole
 * number of 64-bit words.
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t ofs = QEMU_ALIGN_UP(oprsz, 8);

    while (ofs > 0) {
        uint64_t guard;

        ofs -= 8;
        guard = *(uint64_t *)(vg + ofs);
        if (guard != 0) {
            uint64_t last = pow2floor(guard);

            return (*(uint64_t *)(vd + ofs) & last) != 0;
        }
    }
    return false;
}
3863 
/*
 * Compute one 64-bit word of a break mask.
 *
 * @retb:  receives the mask for this word.
 * @n:     source predicate word.
 * @g:     governing predicate word.
 * @brk:   true if a break was already found in an earlier word.
 * @after: true to include the breaking element in the mask, false to
 *         stop just before it.
 *
 * If a break was already found the mask is zero.  If no active element
 * of N is set (G & N == 0) the mask is G itself and no break is reported.
 * Otherwise the mask covers every bit below the first set bit of G & N,
 * plus that bit itself when AFTER is true.
 *
 * Returns true once a break has been found, in this word or earlier.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    const uint64_t active = g & n;   /* guard true, pred true */
    uint64_t first, below;

    if (brk) {
        /* Everything after an earlier break is inactive.  */
        *retb = 0;
        return true;
    }
    if (active == 0) {
        /* For all G, no N are set; break not found.  */
        *retb = g;
        return false;
    }

    first = active & -active;        /* lowest set bit of G & N */
    below = first - 1;               /* all bits beneath it */
    *retb = after ? (first | below) : below;
    return true;
}
3893 
/*
 * Compute a zeroing BRK over a whole predicate.
 *
 * Walks the DIV_ROUND_UP(oprsz, 8) 64-bit words of N and G in ascending
 * order, carrying the "break found" state from word to word, and writes
 * the break mask ANDed with the governing predicate into D.  Bits of D
 * outside G are therefore cleared.
 */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    const intptr_t words = DIV_ROUND_UP(oprsz, 8);
    bool found = false;
    intptr_t w;

    for (w = 0; w < words; w++) {
        uint64_t guard = g[w];
        uint64_t mask;

        found = compute_brk(&mask, n[w], guard, found, after);
        d[w] = mask & guard;
    }
}
3908 
3909 /* Likewise, but also compute flags.  */
3910 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3911                                intptr_t oprsz, bool after)
3912 {
3913     uint32_t flags = PREDTEST_INIT;
3914     bool brk = false;
3915     intptr_t i;
3916 
3917     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3918         uint64_t this_b, this_d, this_g = g[i];
3919 
3920         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3921         d[i] = this_d = this_b & this_g;
3922         flags = iter_predtest_fwd(this_d, this_g, flags);
3923     }
3924     return flags;
3925 }
3926 
/*
 * Compute a merging BRK over a whole predicate.
 *
 * Walks the DIV_ROUND_UP(oprsz, 8) 64-bit words in ascending order,
 * carrying the "break found" state across words.  Bits of D selected by
 * the governing predicate G receive the break mask; bits of D outside G
 * keep their previous value.
 */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    const intptr_t words = DIV_ROUND_UP(oprsz, 8);
    bool found = false;
    intptr_t w;

    for (w = 0; w < words; w++) {
        uint64_t guard = g[w];
        uint64_t mask, kept;

        found = compute_brk(&mask, n[w], guard, found, after);
        kept = d[w] & ~guard;            /* inactive bits are preserved */
        d[w] = (mask & guard) | kept;
    }
}
3941 
3942 /* Likewise, but also compute flags.  */
3943 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3944                                intptr_t oprsz, bool after)
3945 {
3946     uint32_t flags = PREDTEST_INIT;
3947     bool brk = false;
3948     intptr_t i;
3949 
3950     for (i = 0; i < oprsz / 8; ++i) {
3951         uint64_t this_b, this_d = d[i], this_g = g[i];
3952 
3953         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3954         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3955         flags = iter_predtest_fwd(this_d, this_g, flags);
3956     }
3957     return flags;
3958 }
3959 
3960 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3961 {
3962     /* It is quicker to zero the whole predicate than loop on OPRSZ.
3963      * The compiler should turn this into 4 64-bit integer stores.
3964      */
3965     memset(d, 0, sizeof(ARMPredicateReg));
3966     return PREDTEST_INIT;
3967 }
3968 
3969 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3970                        uint32_t pred_desc)
3971 {
3972     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3973     if (last_active_pred(vn, vg, oprsz)) {
3974         compute_brk_z(vd, vm, vg, oprsz, true);
3975     } else {
3976         do_zero(vd, oprsz);
3977     }
3978 }
3979 
3980 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3981                             uint32_t pred_desc)
3982 {
3983     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3984     if (last_active_pred(vn, vg, oprsz)) {
3985         return compute_brks_z(vd, vm, vg, oprsz, true);
3986     } else {
3987         return do_zero(vd, oprsz);
3988     }
3989 }
3990 
3991 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3992                        uint32_t pred_desc)
3993 {
3994     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3995     if (last_active_pred(vn, vg, oprsz)) {
3996         compute_brk_z(vd, vm, vg, oprsz, false);
3997     } else {
3998         do_zero(vd, oprsz);
3999     }
4000 }
4001 
4002 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4003                             uint32_t pred_desc)
4004 {
4005     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4006     if (last_active_pred(vn, vg, oprsz)) {
4007         return compute_brks_z(vd, vm, vg, oprsz, false);
4008     } else {
4009         return do_zero(vd, oprsz);
4010     }
4011 }
4012 
4013 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4014 {
4015     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4016     compute_brk_z(vd, vn, vg, oprsz, true);
4017 }
4018 
4019 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4020 {
4021     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4022     return compute_brks_z(vd, vn, vg, oprsz, true);
4023 }
4024 
4025 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4026 {
4027     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4028     compute_brk_z(vd, vn, vg, oprsz, false);
4029 }
4030 
4031 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4032 {
4033     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4034     return compute_brks_z(vd, vn, vg, oprsz, false);
4035 }
4036 
4037 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4038 {
4039     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4040     compute_brk_m(vd, vn, vg, oprsz, true);
4041 }
4042 
4043 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4044 {
4045     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4046     return compute_brks_m(vd, vn, vg, oprsz, true);
4047 }
4048 
4049 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4050 {
4051     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4052     compute_brk_m(vd, vn, vg, oprsz, false);
4053 }
4054 
4055 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4056 {
4057     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4058     return compute_brks_m(vd, vn, vg, oprsz, false);
4059 }
4060 
4061 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4062 {
4063     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4064     if (!last_active_pred(vn, vg, oprsz)) {
4065         do_zero(vd, oprsz);
4066     }
4067 }
4068 
4069 /* As if PredTest(Ones(PL), D, esz).  */
4070 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4071                               uint64_t esz_mask)
4072 {
4073     uint32_t flags = PREDTEST_INIT;
4074     intptr_t i;
4075 
4076     for (i = 0; i < oprsz / 8; i++) {
4077         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4078     }
4079     if (oprsz & 7) {
4080         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4081         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4082     }
4083     return flags;
4084 }
4085 
4086 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4087 {
4088     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4089     if (last_active_pred(vn, vg, oprsz)) {
4090         return predtest_ones(vd, oprsz, -1);
4091     } else {
4092         return do_zero(vd, oprsz);
4093     }
4094 }
4095 
4096 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4097 {
4098     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4099     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4100     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4101     intptr_t i;
4102 
4103     for (i = 0; i < words; ++i) {
4104         uint64_t t = n[i] & g[i] & mask;
4105         sum += ctpop64(t);
4106     }
4107     return sum;
4108 }
4109 
4110 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4111 {
4112     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4113     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4114     uint64_t esz_mask = pred_esz_masks[esz];
4115     ARMPredicateReg *d = vd;
4116     uint32_t flags;
4117     intptr_t i;
4118 
4119     /* Begin with a zero predicate register.  */
4120     flags = do_zero(d, oprsz);
4121     if (count == 0) {
4122         return flags;
4123     }
4124 
4125     /* Set all of the requested bits.  */
4126     for (i = 0; i < count / 64; ++i) {
4127         d->p[i] = esz_mask;
4128     }
4129     if (count & 63) {
4130         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4131     }
4132 
4133     return predtest_ones(d, oprsz, esz_mask);
4134 }
4135 
4136 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4137 {
4138     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4139     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4140     uint64_t esz_mask = pred_esz_masks[esz];
4141     ARMPredicateReg *d = vd;
4142     intptr_t i, invcount, oprbits;
4143     uint64_t bits;
4144 
4145     if (count == 0) {
4146         return do_zero(d, oprsz);
4147     }
4148 
4149     oprbits = oprsz * 8;
4150     tcg_debug_assert(count <= oprbits);
4151 
4152     bits = esz_mask;
4153     if (oprbits & 63) {
4154         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4155     }
4156 
4157     invcount = oprbits - count;
4158     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4159         d->p[i] = bits;
4160         bits = esz_mask;
4161     }
4162 
4163     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4164 
4165     while (--i >= 0) {
4166         d->p[i] = 0;
4167     }
4168 
4169     return predtest_ones(d, oprsz, esz_mask);
4170 }
4171 
/*
 * Recursive pairwise reduction of a predicated vector;
 * c.f. the ARM ARM function ReducePredicated.
 *
 * The helper first copies the vector into the DATA temporary, replacing
 * every inactive element, and every element between OPRSZ and MAXSZ
 * (taken from simd_data(desc)), with IDENT.  NAME##_reduce then splits
 * the N elements in half, reduces each half recursively and combines
 * the two results with TYPE##_##FUNC.
 *
 * It would be possible to do without the DATA temporary, but handling
 * the predicate register up front is much simpler.  The recursion is
 * bounded to depth 7 (128 fp16 elements), so a more complex
 * non-recursive form has little to offer.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                     \
    uintptr_t half;                                                   \
    TYPE lo, hi;                                                      \
    if (n == 1) {                                                     \
        return data[0];                                               \
    }                                                                 \
    half = n / 2;                                                     \
    lo = NAME##_reduce(data, status, half);                           \
    hi = NAME##_reduce(data + half, status, half);                    \
    return TYPE##_##FUNC(lo, hi, status);                             \
}                                                                     \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
{                                                                     \
    uintptr_t oprsz = simd_oprsz(desc);                               \
    uintptr_t maxsz = simd_data(desc);                                \
    uintptr_t i = 0;                                                  \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
    while (i < oprsz) {                                               \
        /* One 16-bit predicate chunk governs 16 bytes of vector.  */ \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
        do {                                                          \
            TYPE nn = *(TYPE *)(vn + H(i));                           \
            data[i / sizeof(TYPE)] = (pg & 1) ? nn : IDENT;           \
            i += sizeof(TYPE);                                        \
            pg >>= sizeof(TYPE);                                      \
        } while (i & 15);                                             \
    }                                                                 \
    while (i < maxsz) {                                               \
        data[i / sizeof(TYPE)] = IDENT;                               \
        i += sizeof(TYPE);                                            \
    }                                                                 \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
}
4209 
/* Sum reduction; inactive and padding elements are replaced by zero.  */
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)

/*
 * minnum reduction.
 * Identity is floatN_default_nan, without the function call.
 */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)

/* maxnum reduction; same identity constants as the minnum reduction.  */
DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)

/* min reduction; the identity is floatN_infinity.  */
DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)

/* max reduction; the identity is floatN_infinity passed through chs.  */
DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))

/* The expander is local to the reductions above.  */
#undef DO_REDUCE
4232 
4233 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4234                              void *status, uint32_t desc)
4235 {
4236     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4237     float16 result = nn;
4238 
4239     do {
4240         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4241         do {
4242             if (pg & 1) {
4243                 float16 mm = *(float16 *)(vm + H1_2(i));
4244                 result = float16_add(result, mm, status);
4245             }
4246             i += sizeof(float16), pg >>= sizeof(float16);
4247         } while (i & 15);
4248     } while (i < opr_sz);
4249 
4250     return result;
4251 }
4252 
4253 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4254                              void *status, uint32_t desc)
4255 {
4256     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4257     float32 result = nn;
4258 
4259     do {
4260         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4261         do {
4262             if (pg & 1) {
4263                 float32 mm = *(float32 *)(vm + H1_2(i));
4264                 result = float32_add(result, mm, status);
4265             }
4266             i += sizeof(float32), pg >>= sizeof(float32);
4267         } while (i & 15);
4268     } while (i < opr_sz);
4269 
4270     return result;
4271 }
4272 
4273 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4274                              void *status, uint32_t desc)
4275 {
4276     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4277     uint64_t *m = vm;
4278     uint8_t *pg = vg;
4279 
4280     for (i = 0; i < opr_sz; i++) {
4281         if (pg[H1(i)] & 1) {
4282             nn = float64_add(nn, m[i], status);
4283         }
4284     }
4285 
4286     return nn;
4287 }
4288 
/*
 * Fully general three-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 *
 * The vector is walked from its highest byte offset down to zero.  One
 * 64-bit predicate word governs each 64-byte span; the element at byte
 * offset I is active when bit I % 64 of that word is set.  For active
 * elements VD[I] = OP(VN[I], VM[I], status); inactive elements of VD
 * are not written.
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    const uint64_t *guard = vg;                                 \
    intptr_t ofs = simd_oprsz(desc);                            \
    do {                                                        \
        const uint64_t pg = guard[(ofs - 1) >> 6];              \
        do {                                                    \
            ofs -= sizeof(TYPE);                                \
            if (likely((pg >> (ofs & 63)) & 1)) {               \
                const intptr_t at = H(ofs);                     \
                const TYPE nn = *(TYPE *)(vn + at);             \
                const TYPE mm = *(TYPE *)(vm + at);             \
                *(TYPE *)(vd + at) = OP(nn, mm, status);        \
            }                                                   \
        } while (ofs & 63);                                     \
    } while (ofs != 0);                                         \
}
4310 
/*
 * Predicated vector-by-vector FP arithmetic.  Elements are moved as
 * unsigned integers of the element width and handed to the softfloat
 * routine named in the last argument.
 */

/* Addition.  */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)

/* Subtraction.  */
DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)

/* Multiplication.  */
DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)

/* Division.  */
DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)

/* Minimum, via floatN_min.  */
DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)

/* Maximum, via floatN_max.  */
DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)

/* Minimum, via floatN_minnum.  */
DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)

/* Maximum, via floatN_maxnum.  */
DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4342 
4343 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4344 {
4345     return float16_abs(float16_sub(a, b, s));
4346 }
4347 
4348 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4349 {
4350     return float32_abs(float32_sub(a, b, s));
4351 }
4352 
4353 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4354 {
4355     return float64_abs(float64_sub(a, b, s));
4356 }
4357 
/* Predicated absolute difference, built on the abd_* wrappers.  */
DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4361 
4362 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4363 {
4364     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4365     return float64_scalbn(a, b_int, s);
4366 }
4367 
/*
 * Predicated scalbn.  TYPE is signed here, so the second operand of
 * each element pair is read as a signed integer of the element width.
 */
DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)

/* Predicated FMULX, reusing the existing AdvSIMD/VFP mulx helpers.  */
DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)

/* The three-operand expander is not used past this point.  */
#undef DO_ZPZZ_FP
4377 
/*
 * Three-operand expander with one scalar operand, controlled by a
 * predicate, with the extra float_status parameter.
 *
 * SCALAR is converted to TYPE once, up front.  The vector is walked
 * from its highest byte offset down to zero.  One 64-bit predicate word
 * governs each 64-byte span; the element at byte offset I is active
 * when bit I % 64 of that word is set.  For active elements
 * VD[I] = OP(VN[I], scalar, status); inactive elements of VD are not
 * written.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                    \
{                                                                 \
    const uint64_t *guard = vg;                                   \
    const TYPE mm = scalar;                                       \
    intptr_t ofs = simd_oprsz(desc);                              \
    do {                                                          \
        const uint64_t pg = guard[(ofs - 1) >> 6];                \
        do {                                                      \
            ofs -= sizeof(TYPE);                                  \
            if (likely((pg >> (ofs & 63)) & 1)) {                 \
                const intptr_t at = H(ofs);                       \
                const TYPE nn = *(TYPE *)(vn + at);               \
                *(TYPE *)(vd + at) = OP(nn, mm, status);          \
            }                                                     \
        } while (ofs & 63);                                       \
    } while (ofs != 0);                                           \
}
4399 
/* Predicated vector-by-scalar addition.  */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)

/* Predicated vector-by-scalar subtraction: element minus scalar.  */
DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)

/* Predicated vector-by-scalar multiplication.  */
DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4411 
4412 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4413 {
4414     return float16_sub(b, a, s);
4415 }
4416 
4417 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4418 {
4419     return float32_sub(b, a, s);
4420 }
4421 
4422 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4423 {
4424     return float64_sub(b, a, s);
4425 }
4426 
/* Predicated reversed subtraction: scalar minus element.  */
DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)

/* Predicated vector-by-scalar maximum, via floatN_maxnum.  */
DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)

/* Predicated vector-by-scalar minimum, via floatN_minnum.  */
DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)

/* Predicated vector-by-scalar maximum, via floatN_max.  */
DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)

/* Predicated vector-by-scalar minimum, via floatN_min.  */
DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4446 
/*
 * Fully general two-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 *
 * The vector is walked from its highest byte offset down to zero.  One
 * 64-bit predicate word governs each 64-byte span; the element at byte
 * offset I is active when bit I % 64 of that word is set.  For active
 * elements VD[I] = OP(VN[I], status); inactive elements of VD are not
 * written.  TYPE is the container loaded from VN and stored to VD.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                     \
    const uint64_t *guard = vg;                                       \
    intptr_t ofs = simd_oprsz(desc);                                  \
    do {                                                              \
        const uint64_t pg = guard[(ofs - 1) >> 6];                    \
        do {                                                          \
            ofs -= sizeof(TYPE);                                      \
            if (likely((pg >> (ofs & 63)) & 1)) {                     \
                const intptr_t at = H(ofs);                           \
                const TYPE nn = *(TYPE *)(vn + at);                   \
                *(TYPE *)(vd + at) = OP(nn, status);                  \
            }                                                         \
        } while (ofs & 63);                                           \
    } while (ofs != 0);                                               \
}
4466 
4467 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4468  * FZ16.  When converting from fp16, this affects flushing input denormals;
4469  * when converting to fp16, this affects flushing output denormals.
4470  */
4471 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4472 {
4473     bool save = get_flush_inputs_to_zero(fpst);
4474     float32 ret;
4475 
4476     set_flush_inputs_to_zero(false, fpst);
4477     ret = float16_to_float32(f, true, fpst);
4478     set_flush_inputs_to_zero(save, fpst);
4479     return ret;
4480 }
4481 
4482 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4483 {
4484     bool save = get_flush_inputs_to_zero(fpst);
4485     float64 ret;
4486 
4487     set_flush_inputs_to_zero(false, fpst);
4488     ret = float16_to_float64(f, true, fpst);
4489     set_flush_inputs_to_zero(save, fpst);
4490     return ret;
4491 }
4492 
4493 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4494 {
4495     bool save = get_flush_to_zero(fpst);
4496     float16 ret;
4497 
4498     set_flush_to_zero(false, fpst);
4499     ret = float32_to_float16(f, true, fpst);
4500     set_flush_to_zero(save, fpst);
4501     return ret;
4502 }
4503 
4504 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4505 {
4506     bool save = get_flush_to_zero(fpst);
4507     float16 ret;
4508 
4509     set_flush_to_zero(false, fpst);
4510     ret = float64_to_float16(f, true, fpst);
4511     set_flush_to_zero(save, fpst);
4512     return ret;
4513 }
4514 
4515 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4516 {
4517     if (float16_is_any_nan(f)) {
4518         float_raise(float_flag_invalid, s);
4519         return 0;
4520     }
4521     return float16_to_int16_round_to_zero(f, s);
4522 }
4523 
4524 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4525 {
4526     if (float16_is_any_nan(f)) {
4527         float_raise(float_flag_invalid, s);
4528         return 0;
4529     }
4530     return float16_to_int64_round_to_zero(f, s);
4531 }
4532 
4533 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4534 {
4535     if (float32_is_any_nan(f)) {
4536         float_raise(float_flag_invalid, s);
4537         return 0;
4538     }
4539     return float32_to_int64_round_to_zero(f, s);
4540 }
4541 
4542 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4543 {
4544     if (float64_is_any_nan(f)) {
4545         float_raise(float_flag_invalid, s);
4546         return 0;
4547     }
4548     return float64_to_int64_round_to_zero(f, s);
4549 }
4550 
4551 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4552 {
4553     if (float16_is_any_nan(f)) {
4554         float_raise(float_flag_invalid, s);
4555         return 0;
4556     }
4557     return float16_to_uint16_round_to_zero(f, s);
4558 }
4559 
4560 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4561 {
4562     if (float16_is_any_nan(f)) {
4563         float_raise(float_flag_invalid, s);
4564         return 0;
4565     }
4566     return float16_to_uint64_round_to_zero(f, s);
4567 }
4568 
4569 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4570 {
4571     if (float32_is_any_nan(f)) {
4572         float_raise(float_flag_invalid, s);
4573         return 0;
4574     }
4575     return float32_to_uint64_round_to_zero(f, s);
4576 }
4577 
4578 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4579 {
4580     if (float64_is_any_nan(f)) {
4581         float_raise(float_flag_invalid, s);
4582         return 0;
4583     }
4584     return float64_to_uint64_round_to_zero(f, s);
4585 }
4586 
/*
 * Instantiate the predicated one-operand floating-point helpers.
 * Each DO_ZPZ_FP line names the helper to generate, the storage type of
 * one vector element, the host-endian offset adjustment macro matching
 * that element size, and the per-element function to apply.
 * NOTE(review): DO_ZPZ_FP is defined earlier in the file; the argument
 * roles above are read off these invocations - confirm against the macro.
 */

/* Conversions between floating-point precisions (suffix: <to><from>). */
4587 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4588 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4589 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4590 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4591 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4592 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4593 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4594 
/*
 * Floating-point to signed integer.  The vfp_*_rtz functions are the local
 * round-toward-zero wrappers that return 0 for NaN; the helper_vfp_tosiz*
 * functions are defined outside this file section.
 */
4595 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4596 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4597 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4598 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4599 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4600 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4601 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4602 
/* Floating-point to unsigned integer; same layout as the signed group. */
4603 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4604 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4605 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4606 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4607 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4608 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4609 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4610 
/* Round to integral, delegating to the shared rint helpers. */
4611 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4612 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4613 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4614 
/* Round to integral via the softfloat round_to_int functions. */
4615 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4616 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4617 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4618 
/* FRECPX, delegating to the shared helper_frecpx_* functions. */
4619 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4620 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4621 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4622 
/* Square root. */
4623 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4624 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4625 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4626 
/* Signed integer to floating-point. */
4627 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4628 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4629 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4630 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4631 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4632 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4633 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4634 
/* Unsigned integer to floating-point. */
4635 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4636 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4637 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4638 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4639 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4640 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4641 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4642 
4643 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4644 {
4645     /* Extract frac to the top of the uint32_t. */
4646     uint32_t frac = (uint32_t)a << (16 + 6);
4647     int16_t exp = extract32(a, 10, 5);
4648 
4649     if (unlikely(exp == 0)) {
4650         if (frac != 0) {
4651             if (!get_flush_inputs_to_zero(s)) {
4652                 /* denormal: bias - fractional_zeros */
4653                 return -15 - clz32(frac);
4654             }
4655             /* flush to zero */
4656             float_raise(float_flag_input_denormal, s);
4657         }
4658     } else if (unlikely(exp == 0x1f)) {
4659         if (frac == 0) {
4660             return INT16_MAX; /* infinity */
4661         }
4662     } else {
4663         /* normal: exp - bias */
4664         return exp - 15;
4665     }
4666     /* nan or zero */
4667     float_raise(float_flag_invalid, s);
4668     return INT16_MIN;
4669 }
4670 
4671 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4672 {
4673     /* Extract frac to the top of the uint32_t. */
4674     uint32_t frac = a << 9;
4675     int32_t exp = extract32(a, 23, 8);
4676 
4677     if (unlikely(exp == 0)) {
4678         if (frac != 0) {
4679             if (!get_flush_inputs_to_zero(s)) {
4680                 /* denormal: bias - fractional_zeros */
4681                 return -127 - clz32(frac);
4682             }
4683             /* flush to zero */
4684             float_raise(float_flag_input_denormal, s);
4685         }
4686     } else if (unlikely(exp == 0xff)) {
4687         if (frac == 0) {
4688             return INT32_MAX; /* infinity */
4689         }
4690     } else {
4691         /* normal: exp - bias */
4692         return exp - 127;
4693     }
4694     /* nan or zero */
4695     float_raise(float_flag_invalid, s);
4696     return INT32_MIN;
4697 }
4698 
4699 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4700 {
4701     /* Extract frac to the top of the uint64_t. */
4702     uint64_t frac = a << 12;
4703     int64_t exp = extract64(a, 52, 11);
4704 
4705     if (unlikely(exp == 0)) {
4706         if (frac != 0) {
4707             if (!get_flush_inputs_to_zero(s)) {
4708                 /* denormal: bias - fractional_zeros */
4709                 return -1023 - clz64(frac);
4710             }
4711             /* flush to zero */
4712             float_raise(float_flag_input_denormal, s);
4713         }
4714     } else if (unlikely(exp == 0x7ff)) {
4715         if (frac == 0) {
4716             return INT64_MAX; /* infinity */
4717         }
4718     } else {
4719         /* normal: exp - bias */
4720         return exp - 1023;
4721     }
4722     /* nan or zero */
4723     float_raise(float_flag_invalid, s);
4724     return INT64_MIN;
4725 }
4726 
/*
 * Predicated FLOGB helpers: apply the do_floatNN_logb_as_int functions
 * element by element.  The integer result occupies the same element slot
 * as the floating-point input.
 */
4727 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4728 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4729 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4730 
/* No further users of the one-operand expander below this point. */
4731 #undef DO_ZPZ_FP
4732 
4733 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4734                             float_status *status, uint32_t desc,
4735                             uint16_t neg1, uint16_t neg3)
4736 {
4737     intptr_t i = simd_oprsz(desc);
4738     uint64_t *g = vg;
4739 
4740     do {
4741         uint64_t pg = g[(i - 1) >> 6];
4742         do {
4743             i -= 2;
4744             if (likely((pg >> (i & 63)) & 1)) {
4745                 float16 e1, e2, e3, r;
4746 
4747                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4748                 e2 = *(uint16_t *)(vm + H1_2(i));
4749                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4750                 r = float16_muladd(e1, e2, e3, 0, status);
4751                 *(uint16_t *)(vd + H1_2(i)) = r;
4752             }
4753         } while (i & 63);
4754     } while (i != 0);
4755 }
4756 
4757 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4758                               void *vg, void *status, uint32_t desc)
4759 {
4760     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4761 }
4762 
4763 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4764                               void *vg, void *status, uint32_t desc)
4765 {
4766     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4767 }
4768 
4769 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4770                                void *vg, void *status, uint32_t desc)
4771 {
4772     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4773 }
4774 
4775 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4776                                void *vg, void *status, uint32_t desc)
4777 {
4778     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4779 }
4780 
4781 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4782                             float_status *status, uint32_t desc,
4783                             uint32_t neg1, uint32_t neg3)
4784 {
4785     intptr_t i = simd_oprsz(desc);
4786     uint64_t *g = vg;
4787 
4788     do {
4789         uint64_t pg = g[(i - 1) >> 6];
4790         do {
4791             i -= 4;
4792             if (likely((pg >> (i & 63)) & 1)) {
4793                 float32 e1, e2, e3, r;
4794 
4795                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4796                 e2 = *(uint32_t *)(vm + H1_4(i));
4797                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4798                 r = float32_muladd(e1, e2, e3, 0, status);
4799                 *(uint32_t *)(vd + H1_4(i)) = r;
4800             }
4801         } while (i & 63);
4802     } while (i != 0);
4803 }
4804 
4805 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4806                               void *vg, void *status, uint32_t desc)
4807 {
4808     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4809 }
4810 
4811 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4812                               void *vg, void *status, uint32_t desc)
4813 {
4814     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4815 }
4816 
4817 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4818                                void *vg, void *status, uint32_t desc)
4819 {
4820     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4821 }
4822 
4823 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4824                                void *vg, void *status, uint32_t desc)
4825 {
4826     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4827 }
4828 
4829 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4830                             float_status *status, uint32_t desc,
4831                             uint64_t neg1, uint64_t neg3)
4832 {
4833     intptr_t i = simd_oprsz(desc);
4834     uint64_t *g = vg;
4835 
4836     do {
4837         uint64_t pg = g[(i - 1) >> 6];
4838         do {
4839             i -= 8;
4840             if (likely((pg >> (i & 63)) & 1)) {
4841                 float64 e1, e2, e3, r;
4842 
4843                 e1 = *(uint64_t *)(vn + i) ^ neg1;
4844                 e2 = *(uint64_t *)(vm + i);
4845                 e3 = *(uint64_t *)(va + i) ^ neg3;
4846                 r = float64_muladd(e1, e2, e3, 0, status);
4847                 *(uint64_t *)(vd + i) = r;
4848             }
4849         } while (i & 63);
4850     } while (i != 0);
4851 }
4852 
4853 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4854                               void *vg, void *status, uint32_t desc)
4855 {
4856     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4857 }
4858 
4859 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4860                               void *vg, void *status, uint32_t desc)
4861 {
4862     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4863 }
4864 
4865 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4866                                void *vg, void *status, uint32_t desc)
4867 {
4868     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4869 }
4870 
4871 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4872                                void *vg, void *status, uint32_t desc)
4873 {
4874     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4875 }
4876 
/*
 * Expander for a predicated floating-point compare of two vectors that
 * produces a predicate result.
 *
 * The integer compares may evaluate inactive elements speculatively;
 * here that is not permitted, because a floating-point comparison can
 * modify the FPSR.  OP is therefore invoked for active elements only,
 * and inactive elements contribute a zero bit to the result.
 *
 * Elements are visited from the top of the vector downwards and one
 * 64-bit word of the destination predicate is assembled at a time.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc);                                      \
    intptr_t word = (i - 1) >> 6;                                       \
    uint64_t *dst = vd, *pred = vg;                                     \
    do {                                                                \
        uint64_t pg = pred[word];                                       \
        uint64_t out = 0;                                               \
        do {                                                            \
            i -= sizeof(TYPE);                                          \
            out <<= sizeof(TYPE);                                       \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        dst[word] = out;                                                \
        word--;                                                         \
    } while (i > 0);                                                    \
}
4901 
/*
 * Per-size shorthands for DO_FPCMP_PPZZ: each appends the size suffix to
 * NAME and supplies the matching softfloat type and host-endian offset
 * macro.  DO_FPCMP_PPZZ_ALL emits the _h, _s and _d helpers in one go.
 */
4902 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4903     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4904 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4905     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4906 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4907     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4908 
4909 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4910     DO_FPCMP_PPZZ_H(NAME, OP)   \
4911     DO_FPCMP_PPZZ_S(NAME, OP)   \
4912     DO_FPCMP_PPZZ_D(NAME, OP)
4913 
/*
 * Element comparison primitives used by the predicated compare expanders.
 * Each takes the softfloat type name, the two operands and the
 * float_status pointer, and evaluates to 0 or 1.
 *
 * GE/GT/LE/LT and the absolute-value compares (FACGE/FACGT) use the
 * TYPE##_compare form; EQ/NE/UO use TYPE##_compare_quiet.  GE and GT are
 * implemented by swapping the operands of a <= / < test.
 *
 * Every expansion is fully parenthesized so that the result can be
 * combined with any surrounding operator (!, +, ==, ...) without the
 * relational operator inside the macro capturing part of the caller's
 * expression.
 */
#define DO_FCMGE(TYPE, X, Y, ST)  (TYPE##_compare(Y, X, ST) <= 0)
#define DO_FCMGT(TYPE, X, Y, ST)  (TYPE##_compare(Y, X, ST) < 0)
#define DO_FCMLE(TYPE, X, Y, ST)  (TYPE##_compare(X, Y, ST) <= 0)
#define DO_FCMLT(TYPE, X, Y, ST)  (TYPE##_compare(X, Y, ST) < 0)
#define DO_FCMEQ(TYPE, X, Y, ST)  (TYPE##_compare_quiet(X, Y, ST) == 0)
#define DO_FCMNE(TYPE, X, Y, ST)  (TYPE##_compare_quiet(X, Y, ST) != 0)
#define DO_FCMUO(TYPE, X, Y, ST)  \
    (TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered)
#define DO_FACGE(TYPE, X, Y, ST)  \
    (TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0)
#define DO_FACGT(TYPE, X, Y, ST)  \
    (TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0)
4926 
/*
 * Emit the two-vector compare helpers; each line produces the _h, _s and
 * _d variants of the named helper using the given comparison primitive.
 */
4927 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4928 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4929 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4930 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4931 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4932 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4933 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4934 
/*
 * The two-vector expanders are finished with.  The DO_FC* comparison
 * primitives stay defined because the compare-against-zero helpers
 * further down reuse them.
 */
4935 #undef DO_FPCMP_PPZZ_ALL
4936 #undef DO_FPCMP_PPZZ_D
4937 #undef DO_FPCMP_PPZZ_S
4938 #undef DO_FPCMP_PPZZ_H
4939 #undef DO_FPCMP_PPZZ
4940 
/*
 * Expander for a predicated floating-point compare of one vector against
 * zero, producing a predicate result.
 *
 * OP is invoked only for active elements, with a literal 0 as the second
 * operand; inactive elements contribute a zero bit.  Elements are visited
 * from the top of the vector downwards and one 64-bit word of the
 * destination predicate is assembled at a time.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
void HELPER(NAME)(void *vd, void *vn, void *vg,            \
                  void *status, uint32_t desc)             \
{                                                          \
    intptr_t i = simd_oprsz(desc);                         \
    intptr_t word = (i - 1) >> 6;                          \
    uint64_t *dst = vd, *pred = vg;                        \
    do {                                                   \
        uint64_t pg = pred[word];                          \
        uint64_t out = 0;                                  \
        do {                                               \
            i -= sizeof(TYPE);                             \
            out <<= sizeof(TYPE);                          \
            if ((pg >> (i & 63)) & 1) {                    \
                TYPE nn = *(TYPE *)(vn + H(i));            \
                out |= OP(TYPE, nn, 0, status);            \
            }                                              \
        } while (i & 63);                                  \
        dst[word] = out;                                   \
        word--;                                            \
    } while (i > 0);                                       \
}
4962 
/*
 * Per-size shorthands for DO_FPCMP_PPZ0: each appends the size suffix to
 * NAME and supplies the matching softfloat type and host-endian offset
 * macro.  DO_FPCMP_PPZ0_ALL emits the _h, _s and _d helpers in one go.
 */
4963 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4964     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4965 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4966     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4967 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4968     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4969 
4970 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4971     DO_FPCMP_PPZ0_H(NAME, OP)   \
4972     DO_FPCMP_PPZ0_S(NAME, OP)   \
4973     DO_FPCMP_PPZ0_D(NAME, OP)
4974 
/*
 * Emit the compare-against-zero helpers, reusing the DO_FC* comparison
 * primitives with zero as the second operand.
 */
4975 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4976 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4977 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4978 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4979 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4980 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4981 
4982 /* FP Trig Multiply-Add. */
4983 
4984 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4985 {
4986     static const float16 coeff[16] = {
4987         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4988         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4989     };
4990     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4991     intptr_t x = simd_data(desc);
4992     float16 *d = vd, *n = vn, *m = vm;
4993     for (i = 0; i < opr_sz; i++) {
4994         float16 mm = m[i];
4995         intptr_t xx = x;
4996         if (float16_is_neg(mm)) {
4997             mm = float16_abs(mm);
4998             xx += 8;
4999         }
5000         d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5001     }
5002 }
5003 
5004 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5005 {
5006     static const float32 coeff[16] = {
5007         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5008         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5009         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5010         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5011     };
5012     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5013     intptr_t x = simd_data(desc);
5014     float32 *d = vd, *n = vn, *m = vm;
5015     for (i = 0; i < opr_sz; i++) {
5016         float32 mm = m[i];
5017         intptr_t xx = x;
5018         if (float32_is_neg(mm)) {
5019             mm = float32_abs(mm);
5020             xx += 8;
5021         }
5022         d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5023     }
5024 }
5025 
5026 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5027 {
5028     static const float64 coeff[16] = {
5029         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5030         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5031         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5032         0x3de5d8408868552full, 0x0000000000000000ull,
5033         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5034         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5035         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5036         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5037     };
5038     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5039     intptr_t x = simd_data(desc);
5040     float64 *d = vd, *n = vn, *m = vm;
5041     for (i = 0; i < opr_sz; i++) {
5042         float64 mm = m[i];
5043         intptr_t xx = x;
5044         if (float64_is_neg(mm)) {
5045             mm = float64_abs(mm);
5046             xx += 8;
5047         }
5048         d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5049     }
5050 }
5051 
5052 /*
5053  * FP Complex Add
5054  */
5055 
5056 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5057                          void *vs, uint32_t desc)
5058 {
5059     intptr_t j, i = simd_oprsz(desc);
5060     uint64_t *g = vg;
5061     float16 neg_imag = float16_set_sign(0, simd_data(desc));
5062     float16 neg_real = float16_chs(neg_imag);
5063 
5064     do {
5065         uint64_t pg = g[(i - 1) >> 6];
5066         do {
5067             float16 e0, e1, e2, e3;
5068 
5069             /* I holds the real index; J holds the imag index.  */
5070             j = i - sizeof(float16);
5071             i -= 2 * sizeof(float16);
5072 
5073             e0 = *(float16 *)(vn + H1_2(i));
5074             e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5075             e2 = *(float16 *)(vn + H1_2(j));
5076             e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5077 
5078             if (likely((pg >> (i & 63)) & 1)) {
5079                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5080             }
5081             if (likely((pg >> (j & 63)) & 1)) {
5082                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5083             }
5084         } while (i & 63);
5085     } while (i != 0);
5086 }
5087 
5088 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5089                          void *vs, uint32_t desc)
5090 {
5091     intptr_t j, i = simd_oprsz(desc);
5092     uint64_t *g = vg;
5093     float32 neg_imag = float32_set_sign(0, simd_data(desc));
5094     float32 neg_real = float32_chs(neg_imag);
5095 
5096     do {
5097         uint64_t pg = g[(i - 1) >> 6];
5098         do {
5099             float32 e0, e1, e2, e3;
5100 
5101             /* I holds the real index; J holds the imag index.  */
5102             j = i - sizeof(float32);
5103             i -= 2 * sizeof(float32);
5104 
5105             e0 = *(float32 *)(vn + H1_2(i));
5106             e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5107             e2 = *(float32 *)(vn + H1_2(j));
5108             e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5109 
5110             if (likely((pg >> (i & 63)) & 1)) {
5111                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5112             }
5113             if (likely((pg >> (j & 63)) & 1)) {
5114                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5115             }
5116         } while (i & 63);
5117     } while (i != 0);
5118 }
5119 
5120 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5121                          void *vs, uint32_t desc)
5122 {
5123     intptr_t j, i = simd_oprsz(desc);
5124     uint64_t *g = vg;
5125     float64 neg_imag = float64_set_sign(0, simd_data(desc));
5126     float64 neg_real = float64_chs(neg_imag);
5127 
5128     do {
5129         uint64_t pg = g[(i - 1) >> 6];
5130         do {
5131             float64 e0, e1, e2, e3;
5132 
5133             /* I holds the real index; J holds the imag index.  */
5134             j = i - sizeof(float64);
5135             i -= 2 * sizeof(float64);
5136 
5137             e0 = *(float64 *)(vn + H1_2(i));
5138             e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5139             e2 = *(float64 *)(vn + H1_2(j));
5140             e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5141 
5142             if (likely((pg >> (i & 63)) & 1)) {
5143                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5144             }
5145             if (likely((pg >> (j & 63)) & 1)) {
5146                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5147             }
5148         } while (i & 63);
5149     } while (i != 0);
5150 }
5151 
5152 /*
5153  * FP Complex Multiply
5154  */
5155 
5156 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5157                                void *vg, void *status, uint32_t desc)
5158 {
5159     intptr_t j, i = simd_oprsz(desc);
5160     unsigned rot = simd_data(desc);
5161     bool flip = rot & 1;
5162     float16 neg_imag, neg_real;
5163     uint64_t *g = vg;
5164 
5165     neg_imag = float16_set_sign(0, (rot & 2) != 0);
5166     neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5167 
5168     do {
5169         uint64_t pg = g[(i - 1) >> 6];
5170         do {
5171             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5172 
5173             /* I holds the real index; J holds the imag index.  */
5174             j = i - sizeof(float16);
5175             i -= 2 * sizeof(float16);
5176 
5177             nr = *(float16 *)(vn + H1_2(i));
5178             ni = *(float16 *)(vn + H1_2(j));
5179             mr = *(float16 *)(vm + H1_2(i));
5180             mi = *(float16 *)(vm + H1_2(j));
5181 
5182             e2 = (flip ? ni : nr);
5183             e1 = (flip ? mi : mr) ^ neg_real;
5184             e4 = e2;
5185             e3 = (flip ? mr : mi) ^ neg_imag;
5186 
5187             if (likely((pg >> (i & 63)) & 1)) {
5188                 d = *(float16 *)(va + H1_2(i));
5189                 d = float16_muladd(e2, e1, d, 0, status);
5190                 *(float16 *)(vd + H1_2(i)) = d;
5191             }
5192             if (likely((pg >> (j & 63)) & 1)) {
5193                 d = *(float16 *)(va + H1_2(j));
5194                 d = float16_muladd(e4, e3, d, 0, status);
5195                 *(float16 *)(vd + H1_2(j)) = d;
5196             }
5197         } while (i & 63);
5198     } while (i != 0);
5199 }
5200 
5201 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5202                                void *vg, void *status, uint32_t desc)
5203 {
5204     intptr_t j, i = simd_oprsz(desc);
5205     unsigned rot = simd_data(desc);
5206     bool flip = rot & 1;
5207     float32 neg_imag, neg_real;
5208     uint64_t *g = vg;
5209 
5210     neg_imag = float32_set_sign(0, (rot & 2) != 0);
5211     neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5212 
5213     do {
5214         uint64_t pg = g[(i - 1) >> 6];
5215         do {
5216             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5217 
5218             /* I holds the real index; J holds the imag index.  */
5219             j = i - sizeof(float32);
5220             i -= 2 * sizeof(float32);
5221 
5222             nr = *(float32 *)(vn + H1_2(i));
5223             ni = *(float32 *)(vn + H1_2(j));
5224             mr = *(float32 *)(vm + H1_2(i));
5225             mi = *(float32 *)(vm + H1_2(j));
5226 
5227             e2 = (flip ? ni : nr);
5228             e1 = (flip ? mi : mr) ^ neg_real;
5229             e4 = e2;
5230             e3 = (flip ? mr : mi) ^ neg_imag;
5231 
5232             if (likely((pg >> (i & 63)) & 1)) {
5233                 d = *(float32 *)(va + H1_2(i));
5234                 d = float32_muladd(e2, e1, d, 0, status);
5235                 *(float32 *)(vd + H1_2(i)) = d;
5236             }
5237             if (likely((pg >> (j & 63)) & 1)) {
5238                 d = *(float32 *)(va + H1_2(j));
5239                 d = float32_muladd(e4, e3, d, 0, status);
5240                 *(float32 *)(vd + H1_2(j)) = d;
5241             }
5242         } while (i & 63);
5243     } while (i != 0);
5244 }
5245 
5246 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5247                                void *vg, void *status, uint32_t desc)
5248 {
5249     intptr_t j, i = simd_oprsz(desc);
5250     unsigned rot = simd_data(desc);
5251     bool flip = rot & 1;
5252     float64 neg_imag, neg_real;
5253     uint64_t *g = vg;
5254 
5255     neg_imag = float64_set_sign(0, (rot & 2) != 0);
5256     neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5257 
5258     do {
5259         uint64_t pg = g[(i - 1) >> 6];
5260         do {
5261             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5262 
5263             /* I holds the real index; J holds the imag index.  */
5264             j = i - sizeof(float64);
5265             i -= 2 * sizeof(float64);
5266 
5267             nr = *(float64 *)(vn + H1_2(i));
5268             ni = *(float64 *)(vn + H1_2(j));
5269             mr = *(float64 *)(vm + H1_2(i));
5270             mi = *(float64 *)(vm + H1_2(j));
5271 
5272             e2 = (flip ? ni : nr);
5273             e1 = (flip ? mi : mr) ^ neg_real;
5274             e4 = e2;
5275             e3 = (flip ? mr : mi) ^ neg_imag;
5276 
5277             if (likely((pg >> (i & 63)) & 1)) {
5278                 d = *(float64 *)(va + H1_2(i));
5279                 d = float64_muladd(e2, e1, d, 0, status);
5280                 *(float64 *)(vd + H1_2(i)) = d;
5281             }
5282             if (likely((pg >> (j & 63)) & 1)) {
5283                 d = *(float64 *)(va + H1_2(j));
5284                 d = float64_muladd(e4, e3, d, 0, status);
5285                 *(float64 *)(vd + H1_2(j)) = d;
5286             }
5287         } while (i & 63);
5288     } while (i != 0);
5289 }
5290 
5291 /*
5292  * Load contiguous data, protected by a governing predicate.
5293  */
5294 
5295 /*
5296  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5297  * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
5298  * element >= @reg_off, or @reg_max if there were no active elements at all.
5299  */
5300 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5301                                  intptr_t reg_max, int esz)
5302 {
5303     uint64_t pg_mask = pred_esz_masks[esz];
5304     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5305 
5306     /* In normal usage, the first element is active.  */
5307     if (likely(pg & 1)) {
5308         return reg_off;
5309     }
5310 
5311     if (pg == 0) {
5312         reg_off &= -64;
5313         do {
5314             reg_off += 64;
5315             if (unlikely(reg_off >= reg_max)) {
5316                 /* The entire predicate was false.  */
5317                 return reg_max;
5318             }
5319             pg = vg[reg_off >> 6] & pg_mask;
5320         } while (pg == 0);
5321     }
5322     reg_off += ctz64(pg);
5323 
5324     /* We should never see an out of range predicate bit set.  */
5325     tcg_debug_assert(reg_off < reg_max);
5326     return reg_off;
5327 }
5328 
5329 /*
5330  * Resolve the guest virtual address to info->host and info->flags.
5331  * If @nofault, return false if the page is invalid, otherwise
5332  * exit via page fault exception.
5333  */
/*
 * @info:        filled in with host pointer, flags, attrs and tagged
 * @nofault:     passed to the probe; a TLB_INVALID_MASK result is only
 *               tolerated (asserted) when this is true
 * @env:         CPU state handed to the probe function
 * @addr:        base guest address of the access
 * @mem_off:     byte offset added to @addr to form the probed address
 * @access_type, @mmu_idx, @retaddr: forwarded unchanged to the probe
 *
 * Returns true when the page was resolved.  On success info->host is
 * rebased so that it corresponds to @addr rather than @addr + @mem_off.
 */
5334 
5335 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5336                     target_ulong addr, int mem_off, MMUAccessType access_type,
5337                     int mmu_idx, uintptr_t retaddr)
5338 {
5339     int flags;
5340 
5341     addr += mem_off;
5342 
5343     /*
5344      * User-only currently always issues with TBI.  See the comment
5345      * above useronly_clean_ptr.  Usually we clean this top byte away
5346      * during translation, but we can't do that for e.g. vector + imm
5347      * addressing modes.
5348      *
5349      * We currently always enable TBI for user-only, and do not provide
5350      * a way to turn it off.  So clean the pointer unconditionally here,
5351      * rather than look it up here, or pass it down from above.
5352      */
5353     addr = useronly_clean_ptr(addr);
5354 
    /* Probe with size 0: only the page containing addr is of interest. */
5355 #ifdef CONFIG_USER_ONLY
5356     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5357                                &info->host, retaddr);
5358 #else
5359     CPUTLBEntryFull *full;
5360     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5361                               &info->host, &full, retaddr);
5362 #endif
5363     info->flags = flags;
5364 
    /* An invalid page is reported to the caller only in the nofault case. */
5365     if (flags & TLB_INVALID_MASK) {
5366         g_assert(nofault);
5367         return false;
5368     }
5369 
    /*
     * NOTE(review): in the user-only branch, "flags" is the value returned
     * by probe_access_flags() and is tested against PAGE_ANON / PAGE_MTE.
     * Confirm that this return value really carries page-protection bits;
     * if it only carries TLB_* bits, "tagged" can never become true here.
     * NOTE(review): 0xf0 in the system branch is presumably the memory
     * attribute encoding for tagged normal memory - confirm.
     */
5370 #ifdef CONFIG_USER_ONLY
5371     memset(&info->attrs, 0, sizeof(info->attrs));
5372     /* Require both ANON and MTE; see allocation_tag_mem(). */
5373     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5374 #else
5375     info->attrs = full->attrs;
5376     info->tagged = full->pte_attrs == 0xf0;
5377 #endif
5378 
5379     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5380     info->host -= mem_off;
5381     return true;
5382 }
5383 
5384 /*
5385  * Find first active element on each page, and a loose bound for the
5386  * final element on each page.  Identify any single element that spans
5387  * the page boundary.  Return true if there are any active elements.
5388  */
5389 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5390                             intptr_t reg_max, int esz, int msize)
5391 {
5392     const int esize = 1 << esz;
5393     const uint64_t pg_mask = pred_esz_masks[esz];
5394     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5395     intptr_t mem_off_last, mem_off_split;
5396     intptr_t page_split, elt_split;
5397     intptr_t i;
5398 
5399     /* Set all of the element indices to -1, and the TLB data to 0. */
5400     memset(info, -1, offsetof(SVEContLdSt, page));
5401     memset(info->page, 0, sizeof(info->page));
5402 
5403     /* Gross scan over the entire predicate to find bounds. */
5404     i = 0;
5405     do {
5406         uint64_t pg = vg[i] & pg_mask;
5407         if (pg) {
5408             reg_off_last = i * 64 + 63 - clz64(pg);
5409             if (reg_off_first < 0) {
5410                 reg_off_first = i * 64 + ctz64(pg);
5411             }
5412         }
5413     } while (++i * 64 < reg_max);
5414 
5415     if (unlikely(reg_off_first < 0)) {
5416         /* No active elements, no pages touched. */
5417         return false;
5418     }
5419     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5420 
5421     info->reg_off_first[0] = reg_off_first;
5422     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5423     mem_off_last = (reg_off_last >> esz) * msize;
5424 
5425     page_split = -(addr | TARGET_PAGE_MASK);
5426     if (likely(mem_off_last + msize <= page_split)) {
5427         /* The entire operation fits within a single page. */
5428         info->reg_off_last[0] = reg_off_last;
5429         return true;
5430     }
5431 
5432     info->page_split = page_split;
5433     elt_split = page_split / msize;
5434     reg_off_split = elt_split << esz;
5435     mem_off_split = elt_split * msize;
5436 
5437     /*
5438      * This is the last full element on the first page, but it is not
5439      * necessarily active.  If there is no full element, i.e. the first
5440      * active element is the one that's split, this value remains -1.
5441      * It is useful as iteration bounds.
5442      */
5443     if (elt_split != 0) {
5444         info->reg_off_last[0] = reg_off_split - esize;
5445     }
5446 
5447     /* Determine if an unaligned element spans the pages.  */
5448     if (page_split % msize != 0) {
5449         /* It is helpful to know if the split element is active. */
5450         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5451             info->reg_off_split = reg_off_split;
5452             info->mem_off_split = mem_off_split;
5453 
5454             if (reg_off_split == reg_off_last) {
5455                 /* The page crossing element is last. */
5456                 return true;
5457             }
5458         }
5459         reg_off_split += esize;
5460         mem_off_split += msize;
5461     }
5462 
5463     /*
5464      * We do want the first active element on the second page, because
5465      * this may affect the address reported in an exception.
5466      */
5467     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5468     tcg_debug_assert(reg_off_split <= reg_off_last);
5469     info->reg_off_first[1] = reg_off_split;
5470     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5471     info->reg_off_last[1] = reg_off_last;
5472     return true;
5473 }
5474 
/*
 * Resolve the guest virtual addresses to info->page[].
 * Control the generation of page faults with @fault.  Return false if
 * there is no work to do, which can only happen with @fault == FAULT_NO.
 *
 * @info must already have been filled in by sve_cont_ldst_elements().
 * page[0] is probed at the first active element; page[1] is probed only
 * when info->page_split is non-negative, i.e. the access spans two pages.
 */
bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
                         CPUARMState *env, target_ulong addr,
                         MMUAccessType access_type, uintptr_t retaddr)
{
    int mmu_idx = cpu_mmu_index(env, false);
    int mem_off = info->mem_off_first[0];
    /* Only a no-fault access may silently skip an invalid first page. */
    bool nofault = fault == FAULT_NO;
    bool have_work = true;

    if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
                        access_type, mmu_idx, retaddr)) {
        /* No work to be done. */
        return false;
    }

    if (likely(info->page_split < 0)) {
        /* The entire operation was on the one page. */
        return true;
    }

    /*
     * If the second page is invalid, then we want the fault address to be
     * the first byte on that page which is accessed.
     */
    if (info->mem_off_split >= 0) {
        /*
         * There is an element split across the pages.  The fault address
         * should be the first byte of the second page.
         */
        mem_off = info->page_split;
        /*
         * If the split element is also the first active element
         * of the vector, then:  For first-fault we should continue
         * to generate faults for the second page.  For no-fault,
         * we have work only if the second page is valid.
         *
         * NOTE(review): the condition below is true when the first
         * active element comes *before* the split element, which is the
         * opposite of the case described above.  It also stores the
         * SVEContFault constant FAULT_FIRST into the bool @nofault and
         * clears @have_work, so the return value then depends only on
         * the second-page probe, whatever @fault is.  Confirm the
         * intended behaviour against the callers before relying on it.
         */
        if (info->mem_off_first[0] < info->mem_off_split) {
            nofault = FAULT_FIRST;
            have_work = false;
        }
    } else {
        /*
         * There is no element split across the pages.  The fault address
         * should be the first active element on the second page.
         */
        mem_off = info->mem_off_first[1];
        /*
         * There must have been one active element on the first page,
         * so we're out of first-fault territory.
         */
        nofault = fault != FAULT_ALL;
    }

    /* A valid second page always counts as work to do. */
    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
                                access_type, mmu_idx, retaddr);
    return have_work;
}
5537 
5538 #ifndef CONFIG_USER_ONLY
5539 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5540                                uint64_t *vg, target_ulong addr,
5541                                int esize, int msize, int wp_access,
5542                                uintptr_t retaddr)
5543 {
5544     intptr_t mem_off, reg_off, reg_last;
5545     int flags0 = info->page[0].flags;
5546     int flags1 = info->page[1].flags;
5547 
5548     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5549         return;
5550     }
5551 
5552     /* Indicate that watchpoints are handled. */
5553     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5554     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5555 
5556     if (flags0 & TLB_WATCHPOINT) {
5557         mem_off = info->mem_off_first[0];
5558         reg_off = info->reg_off_first[0];
5559         reg_last = info->reg_off_last[0];
5560 
5561         while (reg_off <= reg_last) {
5562             uint64_t pg = vg[reg_off >> 6];
5563             do {
5564                 if ((pg >> (reg_off & 63)) & 1) {
5565                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5566                                          msize, info->page[0].attrs,
5567                                          wp_access, retaddr);
5568                 }
5569                 reg_off += esize;
5570                 mem_off += msize;
5571             } while (reg_off <= reg_last && (reg_off & 63));
5572         }
5573     }
5574 
5575     mem_off = info->mem_off_split;
5576     if (mem_off >= 0) {
5577         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5578                              info->page[0].attrs, wp_access, retaddr);
5579     }
5580 
5581     mem_off = info->mem_off_first[1];
5582     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5583         reg_off = info->reg_off_first[1];
5584         reg_last = info->reg_off_last[1];
5585 
5586         do {
5587             uint64_t pg = vg[reg_off >> 6];
5588             do {
5589                 if ((pg >> (reg_off & 63)) & 1) {
5590                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5591                                          msize, info->page[1].attrs,
5592                                          wp_access, retaddr);
5593                 }
5594                 reg_off += esize;
5595                 mem_off += msize;
5596             } while (reg_off & 63);
5597         } while (reg_off <= reg_last);
5598     }
5599 }
5600 #endif
5601 
5602 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5603                              uint64_t *vg, target_ulong addr, int esize,
5604                              int msize, uint32_t mtedesc, uintptr_t ra)
5605 {
5606     intptr_t mem_off, reg_off, reg_last;
5607 
5608     /* Process the page only if MemAttr == Tagged. */
5609     if (info->page[0].tagged) {
5610         mem_off = info->mem_off_first[0];
5611         reg_off = info->reg_off_first[0];
5612         reg_last = info->reg_off_split;
5613         if (reg_last < 0) {
5614             reg_last = info->reg_off_last[0];
5615         }
5616 
5617         do {
5618             uint64_t pg = vg[reg_off >> 6];
5619             do {
5620                 if ((pg >> (reg_off & 63)) & 1) {
5621                     mte_check(env, mtedesc, addr, ra);
5622                 }
5623                 reg_off += esize;
5624                 mem_off += msize;
5625             } while (reg_off <= reg_last && (reg_off & 63));
5626         } while (reg_off <= reg_last);
5627     }
5628 
5629     mem_off = info->mem_off_first[1];
5630     if (mem_off >= 0 && info->page[1].tagged) {
5631         reg_off = info->reg_off_first[1];
5632         reg_last = info->reg_off_last[1];
5633 
5634         do {
5635             uint64_t pg = vg[reg_off >> 6];
5636             do {
5637                 if ((pg >> (reg_off & 63)) & 1) {
5638                     mte_check(env, mtedesc, addr, ra);
5639                 }
5640                 reg_off += esize;
5641                 mem_off += msize;
5642             } while (reg_off & 63);
5643         } while (reg_off <= reg_last);
5644     }
5645 }
5646 
/*
 * Common helper for all contiguous 1,2,3,4-register predicated loads.
 *
 * @vg is the governing predicate and @addr the base address.  The
 * destination registers are zregs[(rd + i) & 31] for i in [0, @N), with
 * rd and the vector length in bytes taken from @desc.  Each register
 * element is 1 << @esz bytes; each structure in memory is @N << @msz
 * bytes, register i reading at offset i << @msz within it.
 *
 * @mtedesc is the MTE descriptor, or 0 to skip MTE checks.
 * @host_fn loads one element through a host pointer; @tlb_fn loads one
 * element through the slow path, which may trap.
 *
 * Inactive elements of every destination register are zeroed.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int flags, i;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no load occurs.  */
        for (i = 0; i < N; ++i) {
            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
        }
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_READ, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    /* Any flag still set on either page forces the slow path. */
    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end.
         */
        ARMVectorReg scratch[4] = { };

        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        /* The last element is on page 1, else the split one, else page 0. */
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            /* Always finish the current 64-bit predicate word. */
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &scratch[i], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        /* Every element loaded without trapping; commit the result. */
        for (i = 0; i < N; ++i) {
            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
        }
        return;
#endif
    }

    /* The entire operation is in RAM, on valid pages. */

    for (i = 0; i < N; ++i) {
        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
    }

    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    /* Whole elements on the first page; reg_last is -1 if there are none. */
    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Elements on the second page, read through its host pointer. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5792 
5793 static inline QEMU_ALWAYS_INLINE
5794 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5795                    uint32_t desc, const uintptr_t ra,
5796                    const int esz, const int msz, const int N,
5797                    sve_ldst1_host_fn *host_fn,
5798                    sve_ldst1_tlb_fn *tlb_fn)
5799 {
5800     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5801     int bit55 = extract64(addr, 55, 1);
5802 
5803     /* Remove mtedesc from the normal sve descriptor. */
5804     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5805 
5806     /* Perform gross MTE suppression early. */
5807     if (!tbi_check(desc, bit55) ||
5808         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5809         mtedesc = 0;
5810     }
5811 
5812     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5813 }
5814 
5815 #define DO_LD1_1(NAME, ESZ)                                             \
5816 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
5817                             target_ulong addr, uint32_t desc)           \
5818 {                                                                       \
5819     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
5820               sve_##NAME##_host, sve_##NAME##_tlb);                     \
5821 }                                                                       \
5822 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
5823                                 target_ulong addr, uint32_t desc)       \
5824 {                                                                       \
5825     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
5826                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
5827 }
5828 
5829 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
5830 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
5831                                target_ulong addr, uint32_t desc)        \
5832 {                                                                       \
5833     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5834               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
5835 }                                                                       \
5836 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
5837                                target_ulong addr, uint32_t desc)        \
5838 {                                                                       \
5839     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5840               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
5841 }                                                                       \
5842 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
5843                                    target_ulong addr, uint32_t desc)    \
5844 {                                                                       \
5845     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5846                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
5847 }                                                                       \
5848 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
5849                                    target_ulong addr, uint32_t desc)    \
5850 {                                                                       \
5851     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5852                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
5853 }
5854 
5855 DO_LD1_1(ld1bb,  MO_8)
5856 DO_LD1_1(ld1bhu, MO_16)
5857 DO_LD1_1(ld1bhs, MO_16)
5858 DO_LD1_1(ld1bsu, MO_32)
5859 DO_LD1_1(ld1bss, MO_32)
5860 DO_LD1_1(ld1bdu, MO_64)
5861 DO_LD1_1(ld1bds, MO_64)
5862 
5863 DO_LD1_2(ld1hh,  MO_16, MO_16)
5864 DO_LD1_2(ld1hsu, MO_32, MO_16)
5865 DO_LD1_2(ld1hss, MO_32, MO_16)
5866 DO_LD1_2(ld1hdu, MO_64, MO_16)
5867 DO_LD1_2(ld1hds, MO_64, MO_16)
5868 
5869 DO_LD1_2(ld1ss,  MO_32, MO_32)
5870 DO_LD1_2(ld1sdu, MO_64, MO_32)
5871 DO_LD1_2(ld1sds, MO_64, MO_32)
5872 
5873 DO_LD1_2(ld1dd,  MO_64, MO_64)
5874 
5875 #undef DO_LD1_1
5876 #undef DO_LD1_2
5877 
5878 #define DO_LDN_1(N)                                                     \
5879 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
5880                              target_ulong addr, uint32_t desc)          \
5881 {                                                                       \
5882     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
5883               sve_ld1bb_host, sve_ld1bb_tlb);                           \
5884 }                                                                       \
5885 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
5886                                  target_ulong addr, uint32_t desc)      \
5887 {                                                                       \
5888     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
5889                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
5890 }
5891 
5892 #define DO_LDN_2(N, SUFF, ESZ)                                          \
5893 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
5894                                     target_ulong addr, uint32_t desc)   \
5895 {                                                                       \
5896     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5897               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
5898 }                                                                       \
5899 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
5900                                     target_ulong addr, uint32_t desc)   \
5901 {                                                                       \
5902     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5903               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
5904 }                                                                       \
5905 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
5906                                         target_ulong addr, uint32_t desc) \
5907 {                                                                       \
5908     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5909                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
5910 }                                                                       \
5911 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
5912                                         target_ulong addr, uint32_t desc) \
5913 {                                                                       \
5914     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5915                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
5916 }
5917 
5918 DO_LDN_1(2)
5919 DO_LDN_1(3)
5920 DO_LDN_1(4)
5921 
5922 DO_LDN_2(2, hh, MO_16)
5923 DO_LDN_2(3, hh, MO_16)
5924 DO_LDN_2(4, hh, MO_16)
5925 
5926 DO_LDN_2(2, ss, MO_32)
5927 DO_LDN_2(3, ss, MO_32)
5928 DO_LDN_2(4, ss, MO_32)
5929 
5930 DO_LDN_2(2, dd, MO_64)
5931 DO_LDN_2(3, dd, MO_64)
5932 DO_LDN_2(4, dd, MO_64)
5933 
5934 #undef DO_LDN_1
5935 #undef DO_LDN_2
5936 
5937 /*
5938  * Load contiguous data, first-fault and no-fault.
5939  *
5940  * For user-only, one could argue that we should hold the mmap_lock during
5941  * the operation so that there is no race between page_check_range and the
5942  * load operation.  However, unmapping pages out from under a running thread
5943  * is extraordinarily unlikely.  This theoretical race condition also affects
5944  * linux-user/ in its get_user/put_user macros.
5945  *
5946  * TODO: Construct some helpers, written in assembly, that interact with
5947  * host_signal_handler to produce memory ops which can properly report errors
5948  * without racing.
5949  */
5950 
5951 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
5952  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5953  * option, which leaves subsequent data unchanged.
5954  */
5955 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5956 {
5957     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5958 
5959     if (i & 63) {
5960         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5961         i = ROUND_UP(i, 64);
5962     }
5963     for (; i < oprsz; i += 64) {
5964         ffr[i / 64] = 0;
5965     }
5966 }
5967 
5968 /*
5969  * Common helper for all contiguous no-fault and first-fault loads.
5970  */
5971 static inline QEMU_ALWAYS_INLINE
5972 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5973                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5974                    const int esz, const int msz, const SVEContFault fault,
5975                    sve_ldst1_host_fn *host_fn,
5976                    sve_ldst1_tlb_fn *tlb_fn)
5977 {
5978     const unsigned rd = simd_data(desc);
5979     void *vd = &env->vfp.zregs[rd];
5980     const intptr_t reg_max = simd_oprsz(desc);
5981     intptr_t reg_off, mem_off, reg_last;
5982     SVEContLdSt info;
5983     int flags;
5984     void *host;
5985 
5986     /* Find the active elements.  */
5987     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5988         /* The entire predicate was false; no load occurs.  */
5989         memset(vd, 0, reg_max);
5990         return;
5991     }
5992     reg_off = info.reg_off_first[0];
5993 
5994     /* Probe the page(s). */
5995     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5996         /* Fault on first element. */
5997         tcg_debug_assert(fault == FAULT_NO);
5998         memset(vd, 0, reg_max);
5999         goto do_fault;
6000     }
6001 
6002     mem_off = info.mem_off_first[0];
6003     flags = info.page[0].flags;
6004 
6005     /*
6006      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6007      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6008      */
6009     if (!info.page[0].tagged) {
6010         mtedesc = 0;
6011     }
6012 
6013     if (fault == FAULT_FIRST) {
6014         /* Trapping mte check for the first-fault element.  */
6015         if (mtedesc) {
6016             mte_check(env, mtedesc, addr + mem_off, retaddr);
6017         }
6018 
6019         /*
6020          * Special handling of the first active element,
6021          * if it crosses a page boundary or is MMIO.
6022          */
6023         bool is_split = mem_off == info.mem_off_split;
6024         if (unlikely(flags != 0) || unlikely(is_split)) {
6025             /*
6026              * Use the slow path for cross-page handling.
6027              * Might trap for MMIO or watchpoints.
6028              */
6029             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6030 
6031             /* After any fault, zero the other elements. */
6032             swap_memzero(vd, reg_off);
6033             reg_off += 1 << esz;
6034             mem_off += 1 << msz;
6035             swap_memzero(vd + reg_off, reg_max - reg_off);
6036 
6037             if (is_split) {
6038                 goto second_page;
6039             }
6040         } else {
6041             memset(vd, 0, reg_max);
6042         }
6043     } else {
6044         memset(vd, 0, reg_max);
6045         if (unlikely(mem_off == info.mem_off_split)) {
6046             /* The first active element crosses a page boundary. */
6047             flags |= info.page[1].flags;
6048             if (unlikely(flags & TLB_MMIO)) {
6049                 /* Some page is MMIO, see below. */
6050                 goto do_fault;
6051             }
6052             if (unlikely(flags & TLB_WATCHPOINT) &&
6053                 (cpu_watchpoint_address_matches
6054                  (env_cpu(env), addr + mem_off, 1 << msz)
6055                  & BP_MEM_READ)) {
6056                 /* Watchpoint hit, see below. */
6057                 goto do_fault;
6058             }
6059             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6060                 goto do_fault;
6061             }
6062             /*
6063              * Use the slow path for cross-page handling.
6064              * This is RAM, without a watchpoint, and will not trap.
6065              */
6066             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6067             goto second_page;
6068         }
6069     }
6070 
6071     /*
6072      * From this point on, all memory operations are MemSingleNF.
6073      *
6074      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6075      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6076      *
6077      * Unfortuately we do not have access to the memory attributes from the
6078      * PTE to tell Device memory from Normal memory.  So we make a mostly
6079      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6080      * This gives the right answer for the common cases of "Normal memory,
6081      * backed by host RAM" and "Device memory, backed by MMIO".
6082      * The architecture allows us to suppress an NF load and return
6083      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6084      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6085      * get wrong is "Device memory, backed by host RAM", for which we
6086      * should return (UNKNOWN, FAULT) for but do not.
6087      *
6088      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6089      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6090      * architectural breakpoints the same.
6091      */
6092     if (unlikely(flags & TLB_MMIO)) {
6093         goto do_fault;
6094     }
6095 
6096     reg_last = info.reg_off_last[0];
6097     host = info.page[0].host;
6098 
6099     do {
6100         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6101         do {
6102             if ((pg >> (reg_off & 63)) & 1) {
6103                 if (unlikely(flags & TLB_WATCHPOINT) &&
6104                     (cpu_watchpoint_address_matches
6105                      (env_cpu(env), addr + mem_off, 1 << msz)
6106                      & BP_MEM_READ)) {
6107                     goto do_fault;
6108                 }
6109                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6110                     goto do_fault;
6111                 }
6112                 host_fn(vd, reg_off, host + mem_off);
6113             }
6114             reg_off += 1 << esz;
6115             mem_off += 1 << msz;
6116         } while (reg_off <= reg_last && (reg_off & 63));
6117     } while (reg_off <= reg_last);
6118 
6119     /*
6120      * MemSingleNF is allowed to fail for any reason.  We have special
6121      * code above to handle the first element crossing a page boundary.
6122      * As an implementation choice, decline to handle a cross-page element
6123      * in any other position.
6124      */
6125     reg_off = info.reg_off_split;
6126     if (reg_off >= 0) {
6127         goto do_fault;
6128     }
6129 
6130  second_page:
6131     reg_off = info.reg_off_first[1];
6132     if (likely(reg_off < 0)) {
6133         /* No active elements on the second page.  All done. */
6134         return;
6135     }
6136 
6137     /*
6138      * MemSingleNF is allowed to fail for any reason.  As an implementation
6139      * choice, decline to handle elements on the second page.  This should
6140      * be low frequency as the guest walks through memory -- the next
6141      * iteration of the guest's loop should be aligned on the page boundary,
6142      * and then all following iterations will stay aligned.
6143      */
6144 
6145  do_fault:
6146     record_fault(env, reg_off, reg_max);
6147 }
6148 
6149 static inline QEMU_ALWAYS_INLINE
6150 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6151                        uint32_t desc, const uintptr_t retaddr,
6152                        const int esz, const int msz, const SVEContFault fault,
6153                        sve_ldst1_host_fn *host_fn,
6154                        sve_ldst1_tlb_fn *tlb_fn)
6155 {
6156     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6157     int bit55 = extract64(addr, 55, 1);
6158 
6159     /* Remove mtedesc from the normal sve descriptor. */
6160     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6161 
6162     /* Perform gross MTE suppression early. */
6163     if (!tbi_check(desc, bit55) ||
6164         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6165         mtedesc = 0;
6166     }
6167 
6168     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6169                   esz, msz, fault, host_fn, tlb_fn);
6170 }
6171 
/*
 * Generate the contiguous first-fault (ldff1) and no-fault (ldnf1) load
 * helpers for one kind of byte-sized memory element, each in a plain and
 * an MTE-checking flavour.
 *
 * SFX forms part of the helper names and selects the
 * sve_ld1<SFX>_host / sve_ld1<SFX>_tlb element accessors; ESZ is the
 * log2 size of the register element.  The memory element size is fixed
 * at MO_8 and, unlike DO_LDFF1_LDNF1_2, no _le/_be variants are emitted.
 */
#define DO_LDFF1_LDNF1_1(SFX, ESZ) \
void HELPER(sve_ldff1##SFX##_r) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, \
                  ESZ, MO_8, FAULT_FIRST, \
                  sve_ld1##SFX##_host, sve_ld1##SFX##_tlb); \
} \
void HELPER(sve_ldnf1##SFX##_r) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, \
                  ESZ, MO_8, FAULT_NO, \
                  sve_ld1##SFX##_host, sve_ld1##SFX##_tlb); \
} \
void HELPER(sve_ldff1##SFX##_r_mte) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), \
                      ESZ, MO_8, FAULT_FIRST, \
                      sve_ld1##SFX##_host, sve_ld1##SFX##_tlb); \
} \
void HELPER(sve_ldnf1##SFX##_r_mte) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), \
                      ESZ, MO_8, FAULT_NO, \
                      sve_ld1##SFX##_host, sve_ld1##SFX##_tlb); \
}
6197 
/*
 * Generate the contiguous first-fault (ldff1) and no-fault (ldnf1) load
 * helpers for one kind of multi-byte memory element.  Eight helpers are
 * emitted: {ldff1, ldnf1} x {_le, _be} x {plain, _mte}.
 *
 * SFX forms part of the helper names and selects the
 * sve_ld1<SFX>_{le,be}_host / sve_ld1<SFX>_{le,be}_tlb element accessors;
 * ESZ and MSZ are the log2 sizes of the register and memory elements.
 */
#define DO_LDFF1_LDNF1_2(SFX, ESZ, MSZ) \
void HELPER(sve_ldff1##SFX##_le_r) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, \
                  ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##SFX##_le_host, sve_ld1##SFX##_le_tlb); \
} \
void HELPER(sve_ldnf1##SFX##_le_r) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, \
                  ESZ, MSZ, FAULT_NO, \
                  sve_ld1##SFX##_le_host, sve_ld1##SFX##_le_tlb); \
} \
void HELPER(sve_ldff1##SFX##_be_r) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, \
                  ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##SFX##_be_host, sve_ld1##SFX##_be_tlb); \
} \
void HELPER(sve_ldnf1##SFX##_be_r) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, \
                  ESZ, MSZ, FAULT_NO, \
                  sve_ld1##SFX##_be_host, sve_ld1##SFX##_be_tlb); \
} \
void HELPER(sve_ldff1##SFX##_le_r_mte) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), \
                      ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##SFX##_le_host, sve_ld1##SFX##_le_tlb); \
} \
void HELPER(sve_ldnf1##SFX##_le_r_mte) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), \
                      ESZ, MSZ, FAULT_NO, \
                      sve_ld1##SFX##_le_host, sve_ld1##SFX##_le_tlb); \
} \
void HELPER(sve_ldff1##SFX##_be_r_mte) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), \
                      ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##SFX##_be_host, sve_ld1##SFX##_be_tlb); \
} \
void HELPER(sve_ldnf1##SFX##_be_r_mte) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), \
                      ESZ, MSZ, FAULT_NO, \
                      sve_ld1##SFX##_be_host, sve_ld1##SFX##_be_tlb); \
}
6247 
/*
 * Instantiate the contiguous first-fault / no-fault load helpers.
 * The first argument names the element accessor pair; the following
 * arguments are the log2 register element size and, for the _2 form,
 * the log2 memory element size.
 * NOTE(review): the trailing u/s in the accessor names presumably selects
 * zero- versus sign-extension -- confirm against the sve_ld1* accessors.
 */

/* Byte memory elements (no endianness variants). */
DO_LDFF1_LDNF1_1(bb,  MO_8)
DO_LDFF1_LDNF1_1(bhu, MO_16)
DO_LDFF1_LDNF1_1(bhs, MO_16)
DO_LDFF1_LDNF1_1(bsu, MO_32)
DO_LDFF1_LDNF1_1(bss, MO_32)
DO_LDFF1_LDNF1_1(bdu, MO_64)
DO_LDFF1_LDNF1_1(bds, MO_64)

/* 16-bit memory elements. */
DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

/* 32-bit memory elements. */
DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

/* 64-bit memory elements. */
DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)

/* The generator macros are not needed past this point. */
#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
6270 
/*
 * Common helper for all contiguous 1,2,3,4-register predicated stores.
 *
 * @env:     CPU state; the data is read from env->vfp.zregs[].
 * @vg:      governing predicate, indexed as 64-bit words with one bit
 *           per register byte.
 * @addr:    guest virtual base address of the store.
 * @desc:    SVE descriptor; simd_oprsz() bounds the register offsets and
 *           simd_data() is the number of the first register to store.
 * @retaddr: host return address, passed to the probe, check and tlb_fn
 *           calls.
 * @esz:     log2 of the register element size in bytes.
 * @msz:     log2 of the memory element size in bytes.
 * @N:       number of registers stored.  Register numbers wrap modulo 32
 *           and each predicate element covers N << msz bytes of memory.
 * @mtedesc: MTE descriptor, or 0 to skip the MTE checks.
 * @host_fn: stores one element through a host pointer.
 * @tlb_fn:  stores one element through a guest virtual address.
 *
 * All page probes, watchpoint checks and MTE checks are issued before
 * the first element is written.
 */

static inline QEMU_ALWAYS_INLINE
void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int i, flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no store occurs.  */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_WRITE, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    /* Any flag left set on either page forces the per-element slow path. */
    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        /* User-only builds are not expected to see any page flags here. */
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        /*
         * Find the last element to store: the last one on the second
         * page if there is one, else the page-crossing element, else the
         * last one on the first page.  A negative value means "none".
         */
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        /*
         * The outer loop reloads one 64-bit predicate word; the inner
         * loop always runs to the end of that word.
         */
        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    /* The N registers are interleaved in memory. */
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    /* Fast path, first page: store directly through the host mapping. */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        /* Stop at reg_last or at the end of this predicate word. */
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Fast path, second page (if any element lies wholly within it). */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
6402 
6403 static inline QEMU_ALWAYS_INLINE
6404 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6405                    uint32_t desc, const uintptr_t ra,
6406                    const int esz, const int msz, const int N,
6407                    sve_ldst1_host_fn *host_fn,
6408                    sve_ldst1_tlb_fn *tlb_fn)
6409 {
6410     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6411     int bit55 = extract64(addr, 55, 1);
6412 
6413     /* Remove mtedesc from the normal sve descriptor. */
6414     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6415 
6416     /* Perform gross MTE suppression early. */
6417     if (!tbi_check(desc, bit55) ||
6418         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6419         mtedesc = 0;
6420     }
6421 
6422     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6423 }
6424 
/*
 * Generate the plain and MTE-checking contiguous store helpers for NREG
 * registers of byte-sized memory elements.
 *
 * NREG and SFX together form the helper name (sve_st<NREG><SFX>_r);
 * SFX also selects the sve_st1<SFX>_host / sve_st1<SFX>_tlb element
 * accessors.  ESZ is the log2 size of the register element; the memory
 * element size is fixed at MO_8.
 */
#define DO_STN_1(NREG, SFX, ESZ) \
void HELPER(sve_st##NREG##SFX##_r) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), \
              ESZ, MO_8, NREG, 0, \
              sve_st1##SFX##_host, sve_st1##SFX##_tlb); \
} \
void HELPER(sve_st##NREG##SFX##_r_mte) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), \
                  ESZ, MO_8, NREG, \
                  sve_st1##SFX##_host, sve_st1##SFX##_tlb); \
}
6438 
/*
 * Generate the contiguous store helpers for NREG registers of multi-byte
 * memory elements: {_le, _be} x {plain, _mte}.
 *
 * NREG and SFX together form the helper name; SFX also selects the
 * sve_st1<SFX>_{le,be}_host / sve_st1<SFX>_{le,be}_tlb element accessors.
 * ESZ and MSZ are the log2 sizes of the register and memory elements.
 */
#define DO_STN_2(NREG, SFX, ESZ, MSZ) \
void HELPER(sve_st##NREG##SFX##_le_r) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), \
              ESZ, MSZ, NREG, 0, \
              sve_st1##SFX##_le_host, sve_st1##SFX##_le_tlb); \
} \
void HELPER(sve_st##NREG##SFX##_be_r) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), \
              ESZ, MSZ, NREG, 0, \
              sve_st1##SFX##_be_host, sve_st1##SFX##_be_tlb); \
} \
void HELPER(sve_st##NREG##SFX##_le_r_mte) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), \
                  ESZ, MSZ, NREG, \
                  sve_st1##SFX##_le_host, sve_st1##SFX##_le_tlb); \
} \
void HELPER(sve_st##NREG##SFX##_be_r_mte) \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), \
                  ESZ, MSZ, NREG, \
                  sve_st1##SFX##_be_host, sve_st1##SFX##_be_tlb); \
}
6464 
/*
 * Instantiate the contiguous store helpers.  Arguments are the register
 * count, the element accessor name, the log2 register element size and,
 * for the _2 form, the log2 memory element size.
 */

/* Byte memory elements (no endianness variants). */
DO_STN_1(1, bb, MO_8)
DO_STN_1(1, bh, MO_16)
DO_STN_1(1, bs, MO_32)
DO_STN_1(1, bd, MO_64)
DO_STN_1(2, bb, MO_8)
DO_STN_1(3, bb, MO_8)
DO_STN_1(4, bb, MO_8)

/* 16-bit memory elements. */
DO_STN_2(1, hh, MO_16, MO_16)
DO_STN_2(1, hs, MO_32, MO_16)
DO_STN_2(1, hd, MO_64, MO_16)
DO_STN_2(2, hh, MO_16, MO_16)
DO_STN_2(3, hh, MO_16, MO_16)
DO_STN_2(4, hh, MO_16, MO_16)

/* 32-bit memory elements. */
DO_STN_2(1, ss, MO_32, MO_32)
DO_STN_2(1, sd, MO_64, MO_32)
DO_STN_2(2, ss, MO_32, MO_32)
DO_STN_2(3, ss, MO_32, MO_32)
DO_STN_2(4, ss, MO_32, MO_32)

/* 64-bit memory elements. */
DO_STN_2(1, dd, MO_64, MO_64)
DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)

/* The generator macros are not needed past this point. */
#undef DO_STN_1
#undef DO_STN_2
6493 
6494 /*
6495  * Loads with a vector index.
6496  */
6497 
/*
 * Signature of the per-element offset extractors used by the
 * vector-indexed loads: load the element at @reg + @reg_ofs,
 * sign or zero-extend as needed, and return it as a target_ulong.
 */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6502 
6503 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6504 {
6505     return *(uint32_t *)(reg + H1_4(reg_ofs));
6506 }
6507 
6508 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6509 {
6510     return *(int32_t *)(reg + H1_4(reg_ofs));
6511 }
6512 
6513 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6514 {
6515     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6516 }
6517 
6518 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6519 {
6520     return (int32_t)*(uint64_t *)(reg + reg_ofs);
6521 }
6522 
6523 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6524 {
6525     return *(uint64_t *)(reg + reg_ofs);
6526 }
6527 
/*
 * Common helper for all loads with a vector index.
 *
 * For each active element the address is @base plus the offset returned
 * by @off_fn for that element of @vm, shifted left by simd_data(@desc).
 * Loaded values are collected in a scratch register which starts out
 * zeroed, and are copied to @vd only once every element has been
 * processed, so an exception raised part-way through does not modify @vd.
 *
 * @env:     CPU state.
 * @vd:      destination vector register.
 * @vg:      governing predicate, indexed as 64-bit words.
 * @vm:      vector register holding the per-element offsets.
 * @base:    scalar base address.
 * @desc:    SVE descriptor; simd_oprsz() is the number of register bytes
 *           processed.
 * @retaddr: host return address, passed to the probe, check and tlb_fn
 *           calls.
 * @mtedesc: MTE descriptor, or 0 to skip the MTE checks.
 * @esize:   register element size in bytes (not log2).
 * @msize:   memory element size in bytes (not log2).
 * @off_fn:  extracts one offset from @vm.
 * @host_fn: loads one element through a host pointer.
 * @tlb_fn:  loads one element through a guest virtual address.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    ARMVectorReg scratch;
    intptr_t reg_off;
    SVEHostPage info, info2;

    /* Inactive elements of the result are zero. */
    memset(&scratch, 0, reg_max);
    reg_off = 0;
    do {
        /* One 64-bit predicate word, consumed from the bottom up. */
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely(pg & 1)) {
                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
                /* Number of bytes from addr to the end of its page. */
                target_ulong in_page = -(addr | TARGET_PAGE_MASK);

                /* Probe with nofault == false: an invalid page raises. */
                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);

                if (likely(in_page >= msize)) {
                    /* The element lies entirely within one page. */
                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr, msize,
                                             info.attrs, BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && info.tagged) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    if (unlikely(info.flags & TLB_MMIO)) {
                        tlb_fn(env, &scratch, reg_off, addr, retaddr);
                    } else {
                        host_fn(&scratch, reg_off, info.host);
                    }
                } else {
                    /* Element crosses the page boundary. */
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_LOAD, mmu_idx, retaddr);
                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr,
                                             msize, info.attrs,
                                             BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && info.tagged) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    /* Both pages are valid; always use the slow path. */
                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
                }
            }
            /* Advance one element: esize register bytes, esize pred bits. */
            reg_off += esize;
            pg >>= esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /* Wait until all exceptions have been raised to write back.  */
    memcpy(vd, &scratch, reg_max);
}
6591 
6592 static inline QEMU_ALWAYS_INLINE
6593 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6594                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6595                    int esize, int msize, zreg_off_fn *off_fn,
6596                    sve_ldst1_host_fn *host_fn,
6597                    sve_ldst1_tlb_fn *tlb_fn)
6598 {
6599     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6600     /* Remove mtedesc from the normal sve descriptor. */
6601     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6602 
6603     /*
6604      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6605      * offset base entirely over the address space hole to change the
6606      * pointer tag, or change the bit55 selector.  So we could here
6607      * examine TBI + TCMA like we do for sve_ldN_r_mte().
6608      */
6609     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6610               esize, msize, off_fn, host_fn, tlb_fn);
6611 }
6612 
/*
 * Generate the plain and MTE-checking vector-indexed load helpers for
 * 4-byte register elements.
 *
 * ELT selects the sve_ld1<ELT>_host / sve_ld1<ELT>_tlb element accessors,
 * IDX selects the off_<IDX>_s offset extractor, and MSZ is the log2 size
 * of the memory element.
 */
#define DO_LD1_ZPZ_S(ELT, IDX, MSZ) \
void HELPER(sve_ld##ELT##_##IDX) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, \
              4, 1 << MSZ, off_##IDX##_s, \
              sve_ld1##ELT##_host, sve_ld1##ELT##_tlb); \
} \
void HELPER(sve_ld##ELT##_##IDX##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), \
                  4, 1 << MSZ, off_##IDX##_s, \
                  sve_ld1##ELT##_host, sve_ld1##ELT##_tlb); \
}
6626 
/*
 * Generate the plain and MTE-checking vector-indexed load helpers for
 * 8-byte register elements.
 *
 * ELT selects the sve_ld1<ELT>_host / sve_ld1<ELT>_tlb element accessors,
 * IDX selects the off_<IDX>_d offset extractor, and MSZ is the log2 size
 * of the memory element.
 */
#define DO_LD1_ZPZ_D(ELT, IDX, MSZ) \
void HELPER(sve_ld##ELT##_##IDX) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, \
              8, 1 << MSZ, off_##IDX##_d, \
              sve_ld1##ELT##_host, sve_ld1##ELT##_tlb); \
} \
void HELPER(sve_ld##ELT##_##IDX##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), \
                  8, 1 << MSZ, off_##IDX##_d, \
                  sve_ld1##ELT##_host, sve_ld1##ELT##_tlb); \
}
6640 
/*
 * Instantiate the vector-indexed load helpers.  Arguments are the element
 * accessor name, the offset extractor name (off_<name>_s for the _S form,
 * off_<name>_d for the _D form) and the log2 memory element size.
 */

/* Byte memory elements, "u" accessors. */
DO_LD1_ZPZ_S(bsu, zsu, MO_8)
DO_LD1_ZPZ_S(bsu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zsu, MO_8)
DO_LD1_ZPZ_D(bdu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zd, MO_8)

/* Byte memory elements, "s" accessors. */
DO_LD1_ZPZ_S(bss, zsu, MO_8)
DO_LD1_ZPZ_S(bss, zss, MO_8)
DO_LD1_ZPZ_D(bds, zsu, MO_8)
DO_LD1_ZPZ_D(bds, zss, MO_8)
DO_LD1_ZPZ_D(bds, zd, MO_8)

/* 16-bit memory elements, "u" accessors, little-endian. */
DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zd, MO_16)

/* 16-bit memory elements, "u" accessors, big-endian. */
DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zd, MO_16)

/* 16-bit memory elements, "s" accessors, little-endian. */
DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
DO_LD1_ZPZ_S(hss_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
DO_LD1_ZPZ_D(hds_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zd, MO_16)

/* 16-bit memory elements, "s" accessors, big-endian. */
DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
DO_LD1_ZPZ_S(hss_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
DO_LD1_ZPZ_D(hds_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zd, MO_16)

/* 32-bit memory elements, little-endian. */
DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
DO_LD1_ZPZ_S(ss_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zd, MO_32)

/* 32-bit memory elements, big-endian. */
DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
DO_LD1_ZPZ_S(ss_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zd, MO_32)

/* 32-bit memory elements, "s" accessors, 8-byte register elements only. */
DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
DO_LD1_ZPZ_D(sds_le, zss, MO_32)
DO_LD1_ZPZ_D(sds_le, zd, MO_32)

DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
DO_LD1_ZPZ_D(sds_be, zss, MO_32)
DO_LD1_ZPZ_D(sds_be, zd, MO_32)

/* 64-bit memory elements. */
DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
DO_LD1_ZPZ_D(dd_le, zss, MO_64)
DO_LD1_ZPZ_D(dd_le, zd, MO_64)

DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)

/* The generator macros are not needed past this point. */
#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
6707 
6708 /* First fault loads with a vector index.  */
6709 
/*
 * Common helpers for all gather first-faulting loads.
 */

/*
 * Perform a first-faulting load with a vector index.
 *
 * The first active element is loaded with faults allowed.  Every later
 * active element is only probed: if it crosses a page boundary, maps to
 * an invalid or MMIO page, hits a read watchpoint, or fails the MTE
 * probe, the load stops there and record_fault() is called with the
 * register offset of that element.
 *
 * @env:     CPU state.
 * @vd:      destination vector register.
 * @vg:      governing predicate, indexed as 64-bit words.
 * @vm:      vector register holding the per-element offsets; may be the
 *           same register as @vd.
 * @base:    scalar base address.
 * @desc:    SVE descriptor; simd_oprsz() is the number of register bytes
 *           processed and simd_data() the left shift applied to offsets.
 * @retaddr: host return address, passed to the probe, check and tlb_fn
 *           calls.
 * @mtedesc: MTE descriptor, or 0 to skip the MTE checks.
 * @esz:     log2 of the register element size in bytes.
 * @msz:     log2 of the memory element size in bytes.
 * @off_fn:  extracts one offset from @vm.
 * @host_fn: loads one element through a host pointer.
 * @tlb_fn:  loads one element through a guest virtual address.
 */

static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                 target_ulong base, uint32_t desc, uintptr_t retaddr,
                 uint32_t mtedesc, const int esz, const int msz,
                 zreg_off_fn *off_fn,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    const int esize = 1 << esz;
    const int msize = 1 << msz;
    intptr_t reg_off;
    SVEHostPage info;
    target_ulong addr, in_page;
    ARMVectorReg scratch;

    /* Skip to the first true predicate.  */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off >= reg_max)) {
        /* The entire predicate was false; no load occurs.  */
        memset(vd, 0, reg_max);
        return;
    }

    /*
     * Protect against overlap between vd and vm: vd is written while
     * offsets are still being read, so work from a private copy of vm.
     */
    if (unlikely(vd == vm)) {
        vm = memcpy(&scratch, vm, reg_max);
    }

    /*
     * Probe the first element, allowing faults.
     */
    addr = base + (off_fn(vm, reg_off) << scale);
    if (mtedesc) {
        mte_check(env, mtedesc, addr, retaddr);
    }
    tlb_fn(env, vd, reg_off, addr, retaddr);

    /* After any fault, zero the other elements. */
    swap_memzero(vd, reg_off);
    reg_off += esize;
    swap_memzero(vd + reg_off, reg_max - reg_off);

    /*
     * Probe the remaining elements, not allowing faults.
     */
    while (reg_off < reg_max) {
        uint64_t pg = vg[reg_off >> 6];
        /* The inner loop runs to the end of this 64-bit predicate word. */
        do {
            if (likely((pg >> (reg_off & 63)) & 1)) {
                addr = base + (off_fn(vm, reg_off) << scale);
                /* Number of bytes from addr to the end of its page. */
                in_page = -(addr | TARGET_PAGE_MASK);

                if (unlikely(in_page < msize)) {
                    /* Stop if the element crosses a page boundary. */
                    goto fault;
                }

                /* Probe with nofault == true: failure is reported in flags. */
                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);
                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
                    goto fault;
                }
                if (unlikely(info.flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
                    goto fault;
                }
                if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
                    goto fault;
                }

                host_fn(vd, reg_off, info.host);
            }
            reg_off += esize;
        } while (reg_off & 63);
    }
    return;

 fault:
    /* reg_off is the offset of the element that could not be loaded. */
    record_fault(env, reg_off, reg_max);
}
6798 
6799 static inline QEMU_ALWAYS_INLINE
6800 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6801                      target_ulong base, uint32_t desc, uintptr_t retaddr,
6802                      const int esz, const int msz,
6803                      zreg_off_fn *off_fn,
6804                      sve_ldst1_host_fn *host_fn,
6805                      sve_ldst1_tlb_fn *tlb_fn)
6806 {
6807     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6808     /* Remove mtedesc from the normal sve descriptor. */
6809     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6810 
6811     /*
6812      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6813      * offset base entirely over the address space hole to change the
6814      * pointer tag, or change the bit55 selector.  So we could here
6815      * examine TBI + TCMA like we do for sve_ldN_r_mte().
6816      */
6817     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6818                 esz, msz, off_fn, host_fn, tlb_fn);
6819 }
6820 
/*
 * Generate the plain and MTE-checking gather first-faulting load helpers
 * for MO_32 register elements.
 *
 * ELT selects the sve_ld1<ELT>_host / sve_ld1<ELT>_tlb element accessors,
 * IDX selects the off_<IDX>_s offset extractor, and MSZ is the log2 size
 * of the memory element.
 */
#define DO_LDFF1_ZPZ_S(ELT, IDX, MSZ) \
void HELPER(sve_ldff##ELT##_##IDX)(CPUARMState *env, void *vd, void *vg, \
                                   void *vm, target_ulong base, \
                                   uint32_t desc) \
{ \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, \
                MO_32, MSZ, off_##IDX##_s, \
                sve_ld1##ELT##_host, sve_ld1##ELT##_tlb); \
} \
void HELPER(sve_ldff##ELT##_##IDX##_mte)(CPUARMState *env, void *vd, \
                                         void *vg, void *vm, \
                                         target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), \
                    MO_32, MSZ, off_##IDX##_s, \
                    sve_ld1##ELT##_host, sve_ld1##ELT##_tlb); \
}
6836 
/*
 * Emit the plain and the _mte first-fault gather-load helpers that pass
 * MO_64 as the element size.
 *   MEM selects the sve_ld1<MEM>_host / sve_ld1<MEM>_tlb accessor pair,
 *   OFS selects the off_<OFS>_d offset extractor,
 *   MSZ is forwarded unchanged as the memory size argument.
 * The plain helper passes an mtedesc of 0.
 */
#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)(CPUARMState *env, void *vd,          \
                                   void *vg, void *vm,                  \
                                   target_ulong base, uint32_t desc)    \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(),                   \
                0, MO_64, MSZ, off_##OFS##_d,                           \
                sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);               \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)(CPUARMState *env, void *vd,    \
                                         void *vg, void *vm,            \
                                         target_ulong base,             \
                                         uint32_t desc)                 \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(),               \
                    MO_64, MSZ, off_##OFS##_d,                          \
                    sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);           \
}
6852 
6853 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6854 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6855 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6856 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6857 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6858 
6859 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6860 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6861 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6862 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6863 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6864 
6865 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6866 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6867 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6868 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6869 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6870 
6871 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6872 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6873 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6874 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6875 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6876 
6877 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6878 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6879 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6880 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6881 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6882 
6883 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6884 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6885 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6886 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6887 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6888 
6889 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
6890 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
6891 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6892 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6893 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6894 
6895 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
6896 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
6897 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6898 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6899 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6900 
6901 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6902 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6903 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6904 
6905 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6906 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6907 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6908 
6909 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6910 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6911 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6912 
6913 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6914 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6915 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6916 
6917 /* Stores with a vector index.  */
6918 
6919 static inline QEMU_ALWAYS_INLINE
6920 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6921                target_ulong base, uint32_t desc, uintptr_t retaddr,
6922                uint32_t mtedesc, int esize, int msize,
6923                zreg_off_fn *off_fn,
6924                sve_ldst1_host_fn *host_fn,
6925                sve_ldst1_tlb_fn *tlb_fn)
6926 {
6927     const int mmu_idx = cpu_mmu_index(env, false);
6928     const intptr_t reg_max = simd_oprsz(desc);
6929     const int scale = simd_data(desc);
6930     void *host[ARM_MAX_VQ * 4];
6931     intptr_t reg_off, i;
6932     SVEHostPage info, info2;
6933 
6934     /*
6935      * Probe all of the elements for host addresses and flags.
6936      */
6937     i = reg_off = 0;
6938     do {
6939         uint64_t pg = vg[reg_off >> 6];
6940         do {
6941             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6942             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6943 
6944             host[i] = NULL;
6945             if (likely((pg >> (reg_off & 63)) & 1)) {
6946                 if (likely(in_page >= msize)) {
6947                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6948                                    mmu_idx, retaddr);
6949                     if (!(info.flags & TLB_MMIO)) {
6950                         host[i] = info.host;
6951                     }
6952                 } else {
6953                     /*
6954                      * Element crosses the page boundary.
6955                      * Probe both pages, but do not record the host address,
6956                      * so that we use the slow path.
6957                      */
6958                     sve_probe_page(&info, false, env, addr, 0,
6959                                    MMU_DATA_STORE, mmu_idx, retaddr);
6960                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6961                                    MMU_DATA_STORE, mmu_idx, retaddr);
6962                     info.flags |= info2.flags;
6963                 }
6964 
6965                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6966                     cpu_check_watchpoint(env_cpu(env), addr, msize,
6967                                          info.attrs, BP_MEM_WRITE, retaddr);
6968                 }
6969 
6970                 if (mtedesc && info.tagged) {
6971                     mte_check(env, mtedesc, addr, retaddr);
6972                 }
6973             }
6974             i += 1;
6975             reg_off += esize;
6976         } while (reg_off & 63);
6977     } while (reg_off < reg_max);
6978 
6979     /*
6980      * Now that we have recognized all exceptions except SyncExternal
6981      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6982      *
6983      * Note for the common case of an element in RAM, not crossing a page
6984      * boundary, we have stored the host address in host[].  This doubles
6985      * as a first-level check against the predicate, since only enabled
6986      * elements have non-null host addresses.
6987      */
6988     i = reg_off = 0;
6989     do {
6990         void *h = host[i];
6991         if (likely(h != NULL)) {
6992             host_fn(vd, reg_off, h);
6993         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6994             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6995             tlb_fn(env, vd, reg_off, addr, retaddr);
6996         }
6997         i += 1;
6998         reg_off += esize;
6999     } while (reg_off < reg_max);
7000 }
7001 
7002 static inline QEMU_ALWAYS_INLINE
7003 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7004                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7005                    int esize, int msize, zreg_off_fn *off_fn,
7006                    sve_ldst1_host_fn *host_fn,
7007                    sve_ldst1_tlb_fn *tlb_fn)
7008 {
7009     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7010     /* Remove mtedesc from the normal sve descriptor. */
7011     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7012 
7013     /*
7014      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7015      * offset base entirely over the address space hole to change the
7016      * pointer tag, or change the bit55 selector.  So we could here
7017      * examine TBI + TCMA like we do for sve_ldN_r_mte().
7018      */
7019     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7020               esize, msize, off_fn, host_fn, tlb_fn);
7021 }
7022 
/*
 * Emit the plain and the _mte scatter-store helpers that use a 4-byte
 * register element stride.
 *   MEM selects the sve_st1<MEM>_host / sve_st1<MEM>_tlb accessor pair,
 *   OFS selects the off_<OFS>_s offset extractor,
 *   MSZ gives the memory access size as 1 << MSZ bytes.
 * The plain helper passes an mtedesc of 0.
 */
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd,            \
                                 void *vg, void *vm,                    \
                                 target_ulong base, uint32_t desc)      \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(),                     \
              0, 4, 1 << MSZ, off_##OFS##_s,                            \
              sve_st1##MEM##_host, sve_st1##MEM##_tlb);                 \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd,      \
                                       void *vg, void *vm,              \
                                       target_ulong base,               \
                                       uint32_t desc)                   \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(),                 \
                  4, 1 << MSZ, off_##OFS##_s,                           \
                  sve_st1##MEM##_host, sve_st1##MEM##_tlb);             \
}
7036 
/*
 * Emit the plain and the _mte scatter-store helpers that use an 8-byte
 * register element stride.
 *   MEM selects the sve_st1<MEM>_host / sve_st1<MEM>_tlb accessor pair,
 *   OFS selects the off_<OFS>_d offset extractor,
 *   MSZ gives the memory access size as 1 << MSZ bytes.
 * The plain helper passes an mtedesc of 0.
 */
#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd,            \
                                 void *vg, void *vm,                    \
                                 target_ulong base, uint32_t desc)      \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(),                     \
              0, 8, 1 << MSZ, off_##OFS##_d,                            \
              sve_st1##MEM##_host, sve_st1##MEM##_tlb);                 \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd,      \
                                       void *vg, void *vm,              \
                                       target_ulong base,               \
                                       uint32_t desc)                   \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(),                 \
                  8, 1 << MSZ, off_##OFS##_d,                           \
                  sve_st1##MEM##_host, sve_st1##MEM##_tlb);             \
}
7050 
7051 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7052 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7053 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7054 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7055 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7056 
7057 DO_ST1_ZPZ_S(bs, zss, MO_8)
7058 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7059 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7060 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7061 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7062 
7063 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7064 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7065 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7066 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7067 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7068 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7069 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7070 
7071 DO_ST1_ZPZ_D(bd, zss, MO_8)
7072 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7073 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7074 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7075 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7076 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7077 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7078 
7079 DO_ST1_ZPZ_D(bd, zd, MO_8)
7080 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7081 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7082 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7083 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7084 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7085 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7086 
7087 #undef DO_ST1_ZPZ_S
7088 #undef DO_ST1_ZPZ_D
7089 
7090 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7091 {
7092     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7093     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7094 
7095     for (i = 0; i < opr_sz; ++i) {
7096         d[i] = n[i] ^ m[i] ^ k[i];
7097     }
7098 }
7099 
7100 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7101 {
7102     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7103     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7104 
7105     for (i = 0; i < opr_sz; ++i) {
7106         d[i] = n[i] ^ (m[i] & ~k[i]);
7107     }
7108 }
7109 
7110 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7111 {
7112     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7113     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7114 
7115     for (i = 0; i < opr_sz; ++i) {
7116         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7117     }
7118 }
7119 
7120 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7121 {
7122     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7123     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7124 
7125     for (i = 0; i < opr_sz; ++i) {
7126         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7127     }
7128 }
7129 
7130 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7131 {
7132     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7133     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7134 
7135     for (i = 0; i < opr_sz; ++i) {
7136         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7137     }
7138 }
7139 
/*
 * Report whether the low element of n (uint8_t for MO_8, uint16_t for
 * MO_16) occurs anywhere in the 128 bits formed by m0 and m1.
 *
 * The element is replicated across a 64-bit word and XORed with each
 * half, so a matching lane becomes zero; the zero lanes are then found
 * with the hasless(v,1) trick from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    /* A 1 in the lowest, respectively highest, bit of every lane. */
    const uint64_t lane_lsb = dup_const(esz, 1);
    const uint64_t lane_msb = lane_lsb << ((8 << esz) - 1);
    const uint64_t needle = dup_const(esz, n);
    const uint64_t diff0 = needle ^ m0;
    const uint64_t diff1 = needle ^ m1;
    const uint64_t hit0 = (diff0 - lane_lsb) & ~diff0;
    const uint64_t hit1 = (diff1 - lane_lsb) & ~diff1;

    return ((hit0 | hit1) & lane_msb) != 0;
}
7159 
7160 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7161                                 uint32_t desc, int esz, bool nmatch)
7162 {
7163     uint16_t esz_mask = pred_esz_masks[esz];
7164     intptr_t opr_sz = simd_oprsz(desc);
7165     uint32_t flags = PREDTEST_INIT;
7166     intptr_t i, j, k;
7167 
7168     for (i = 0; i < opr_sz; i += 16) {
7169         uint64_t m0 = *(uint64_t *)(vm + i);
7170         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7171         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7172         uint16_t out = 0;
7173 
7174         for (j = 0; j < 16; j += 8) {
7175             uint64_t n = *(uint64_t *)(vn + i + j);
7176 
7177             for (k = 0; k < 8; k += 1 << esz) {
7178                 if (pg & (1 << (j + k))) {
7179                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
7180                     out |= (o ^ nmatch) << (j + k);
7181                 }
7182             }
7183         }
7184         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7185         flags = iter_predtest_fwd(out, pg, flags);
7186     }
7187     return flags;
7188 }
7189 
7190 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7191 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7192 {                                                                             \
7193     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7194 }
7195 
7196 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7197 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7198 
7199 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7200 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7201 
7202 #undef DO_PPZZ_MATCH
7203 
7204 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7205                             uint32_t desc)
7206 {
7207     ARMVectorReg scratch;
7208     intptr_t i, j;
7209     intptr_t opr_sz = simd_oprsz(desc);
7210     uint32_t *d = vd, *n = vn, *m = vm;
7211     uint8_t *pg = vg;
7212 
7213     if (d == n) {
7214         n = memcpy(&scratch, n, opr_sz);
7215         if (d == m) {
7216             m = n;
7217         }
7218     } else if (d == m) {
7219         m = memcpy(&scratch, m, opr_sz);
7220     }
7221 
7222     for (i = 0; i < opr_sz; i += 4) {
7223         uint64_t count = 0;
7224         uint8_t pred;
7225 
7226         pred = pg[H1(i >> 3)] >> (i & 7);
7227         if (pred & 1) {
7228             uint32_t nn = n[H4(i >> 2)];
7229 
7230             for (j = 0; j <= i; j += 4) {
7231                 pred = pg[H1(j >> 3)] >> (j & 7);
7232                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7233                     ++count;
7234                 }
7235             }
7236         }
7237         d[H4(i >> 2)] = count;
7238     }
7239 }
7240 
7241 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7242                             uint32_t desc)
7243 {
7244     ARMVectorReg scratch;
7245     intptr_t i, j;
7246     intptr_t opr_sz = simd_oprsz(desc);
7247     uint64_t *d = vd, *n = vn, *m = vm;
7248     uint8_t *pg = vg;
7249 
7250     if (d == n) {
7251         n = memcpy(&scratch, n, opr_sz);
7252         if (d == m) {
7253             m = n;
7254         }
7255     } else if (d == m) {
7256         m = memcpy(&scratch, m, opr_sz);
7257     }
7258 
7259     for (i = 0; i < opr_sz / 8; ++i) {
7260         uint64_t count = 0;
7261         if (pg[H1(i)] & 1) {
7262             uint64_t nn = n[i];
7263             for (j = 0; j <= i; ++j) {
7264                 if ((pg[H1(j)] & 1) && nn == m[j]) {
7265                     ++count;
7266                 }
7267             }
7268         }
7269         d[i] = count;
7270     }
7271 }
7272 
7273 /*
7274  * Returns the number of bytes in m0 and m1 that match n.
7275  * Unlike do_match2 we don't just need true/false, we need an exact count.
7276  * This requires two extra logical operations.
7277  */
7278 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7279 {
7280     const uint64_t mask = dup_const(MO_8, 0x7f);
7281     uint64_t cmp0, cmp1;
7282 
7283     cmp1 = dup_const(MO_8, n);
7284     cmp0 = cmp1 ^ m0;
7285     cmp1 = cmp1 ^ m1;
7286 
7287     /*
7288      * 1: clear msb of each byte to avoid carry to next byte (& mask)
7289      * 2: carry in to msb if byte != 0 (+ mask)
7290      * 3: set msb if cmp has msb set (| cmp)
7291      * 4: set ~msb to ignore them (| mask)
7292      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7293      * 5: invert, resulting in 0x80 if and only if byte == 0.
7294      */
7295     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7296     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7297 
7298     /*
7299      * Combine the two compares in a way that the bits do
7300      * not overlap, and so preserves the count of set bits.
7301      * If the host has an efficient instruction for ctpop,
7302      * then ctpop(x) + ctpop(y) has the same number of
7303      * operations as ctpop(x | (y >> 1)).  If the host does
7304      * not have an efficient ctpop, then we only want to
7305      * use it once.
7306      */
7307     return ctpop64(cmp0 | (cmp1 >> 1));
7308 }
7309 
7310 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7311 {
7312     intptr_t i, j;
7313     intptr_t opr_sz = simd_oprsz(desc);
7314 
7315     for (i = 0; i < opr_sz; i += 16) {
7316         uint64_t n0 = *(uint64_t *)(vn + i);
7317         uint64_t m0 = *(uint64_t *)(vm + i);
7318         uint64_t n1 = *(uint64_t *)(vn + i + 8);
7319         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7320         uint64_t out0 = 0;
7321         uint64_t out1 = 0;
7322 
7323         for (j = 0; j < 64; j += 8) {
7324             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7325             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7326             out0 |= cnt0 << j;
7327             out1 |= cnt1 << j;
7328         }
7329 
7330         *(uint64_t *)(vd + i) = out0;
7331         *(uint64_t *)(vd + i + 8) = out1;
7332     }
7333 }
7334 
7335 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7336 {
7337     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7338     int shr = simd_data(desc);
7339     int shl = 8 - shr;
7340     uint64_t mask = dup_const(MO_8, 0xff >> shr);
7341     uint64_t *d = vd, *n = vn, *m = vm;
7342 
7343     for (i = 0; i < opr_sz; ++i) {
7344         uint64_t t = n[i] ^ m[i];
7345         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7346     }
7347 }
7348 
7349 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7350 {
7351     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7352     int shr = simd_data(desc);
7353     int shl = 16 - shr;
7354     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7355     uint64_t *d = vd, *n = vn, *m = vm;
7356 
7357     for (i = 0; i < opr_sz; ++i) {
7358         uint64_t t = n[i] ^ m[i];
7359         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7360     }
7361 }
7362 
7363 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7364 {
7365     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7366     int shr = simd_data(desc);
7367     uint32_t *d = vd, *n = vn, *m = vm;
7368 
7369     for (i = 0; i < opr_sz; ++i) {
7370         d[i] = ror32(n[i] ^ m[i], shr);
7371     }
7372 }
7373 
7374 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7375                      void *status, uint32_t desc)
7376 {
7377     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7378 
7379     for (s = 0; s < opr_sz; ++s) {
7380         float32 *n = vn + s * sizeof(float32) * 4;
7381         float32 *m = vm + s * sizeof(float32) * 4;
7382         float32 *a = va + s * sizeof(float32) * 4;
7383         float32 *d = vd + s * sizeof(float32) * 4;
7384         float32 n00 = n[H4(0)], n01 = n[H4(1)];
7385         float32 n10 = n[H4(2)], n11 = n[H4(3)];
7386         float32 m00 = m[H4(0)], m01 = m[H4(1)];
7387         float32 m10 = m[H4(2)], m11 = m[H4(3)];
7388         float32 p0, p1;
7389 
7390         /* i = 0, j = 0 */
7391         p0 = float32_mul(n00, m00, status);
7392         p1 = float32_mul(n01, m01, status);
7393         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7394 
7395         /* i = 0, j = 1 */
7396         p0 = float32_mul(n00, m10, status);
7397         p1 = float32_mul(n01, m11, status);
7398         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7399 
7400         /* i = 1, j = 0 */
7401         p0 = float32_mul(n10, m00, status);
7402         p1 = float32_mul(n11, m01, status);
7403         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7404 
7405         /* i = 1, j = 1 */
7406         p0 = float32_mul(n10, m10, status);
7407         p1 = float32_mul(n11, m11, status);
7408         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7409     }
7410 }
7411 
7412 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7413                      void *status, uint32_t desc)
7414 {
7415     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7416 
7417     for (s = 0; s < opr_sz; ++s) {
7418         float64 *n = vn + s * sizeof(float64) * 4;
7419         float64 *m = vm + s * sizeof(float64) * 4;
7420         float64 *a = va + s * sizeof(float64) * 4;
7421         float64 *d = vd + s * sizeof(float64) * 4;
7422         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7423         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7424         float64 p0, p1;
7425 
7426         /* i = 0, j = 0 */
7427         p0 = float64_mul(n00, m00, status);
7428         p1 = float64_mul(n01, m01, status);
7429         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7430 
7431         /* i = 0, j = 1 */
7432         p0 = float64_mul(n00, m10, status);
7433         p1 = float64_mul(n01, m11, status);
7434         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7435 
7436         /* i = 1, j = 0 */
7437         p0 = float64_mul(n10, m00, status);
7438         p1 = float64_mul(n11, m01, status);
7439         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7440 
7441         /* i = 1, j = 1 */
7442         p0 = float64_mul(n10, m10, status);
7443         p1 = float64_mul(n11, m11, status);
7444         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7445     }
7446 }
7447 
7448 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7449 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7450 {                                                                             \
7451     intptr_t i = simd_oprsz(desc);                                            \
7452     uint64_t *g = vg;                                                         \
7453     do {                                                                      \
7454         uint64_t pg = g[(i - 1) >> 6];                                        \
7455         do {                                                                  \
7456             i -= sizeof(TYPEW);                                               \
7457             if (likely((pg >> (i & 63)) & 1)) {                               \
7458                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7459                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7460             }                                                                 \
7461         } while (i & 63);                                                     \
7462     } while (i != 0);                                                         \
7463 }
7464 
7465 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7466 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7467 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7468 
7469 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7470 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7471 {                                                                             \
7472     intptr_t i = simd_oprsz(desc);                                            \
7473     uint64_t *g = vg;                                                         \
7474     do {                                                                      \
7475         uint64_t pg = g[(i - 1) >> 6];                                        \
7476         do {                                                                  \
7477             i -= sizeof(TYPEW);                                               \
7478             if (likely((pg >> (i & 63)) & 1)) {                               \
7479                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7480                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7481             }                                                                 \
7482         } while (i & 63);                                                     \
7483     } while (i != 0);                                                         \
7484 }
7485 
7486 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7487 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7488 
7489 #undef DO_FCVTLT
7490 #undef DO_FCVTNT
7491