xref: /openbmc/qemu/target/arm/tcg/sve_helper.c (revision 72baef13b9dce71f20ae840d9951e559e14abf6d)
1 /*
2  * ARM SVE Operations
3  *
4  * Copyright (c) 2018 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/page-protection.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
29 #include "vec_internal.h"
30 #include "sve_ldst_internal.h"
31 #include "hw/core/tcg-cpu-ops.h"
32 
33 
34 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
35  *
36  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
37  * and bit 0 set if C is set.  Compare the definitions of these variables
38  * within CPUARMState.
39  */
40 
/*
 * Seed value for the FLAGS argument of the iter_predtest_* walkers.
 * Only bit 0 (C) is set; the walkers return FLAGS unchanged for a word
 * with no G bits, so this is also the result when no G bit is set at all.
 */
#define PREDTEST_INIT  1
43 
/*
 * One step of the PredTest flag computation, visiting predicate words
 * from the lowest to the highest.
 *
 * D and G are the current words of Pd and Pg; FLAGS is the state gathered
 * from the words already visited (PREDTEST_INIT for the first word).
 * Returns the updated state.  Bit 2 is used internally to remember that
 * an active G bit has already been seen.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    uint64_t lowest_g, highest_g;

    /* A word without governing bits contributes nothing.  */
    if (unlikely(g == 0)) {
        return flags;
    }

    lowest_g = g & -g;
    highest_g = pow2floor(g);

    /* N is taken from the very first active element only.  */
    if ((flags & 4) == 0) {
        if (d & lowest_g) {
            flags |= 1u << 31;
        }
        flags |= 4;
    }

    /* Bit 1 is sticky: set once any active element of D is set.  */
    if (d & g) {
        flags |= 2;
    }

    /* C follows the last active element seen so far, so overwrite it.  */
    return deposit32(flags, 0, 1, (d & highest_g) == 0);
}
65 
/*
 * One step of the PredTest flag computation, visiting predicate words
 * from the highest to the lowest.
 *
 * D and G are the current words of Pd and Pg; FLAGS is the state gathered
 * from the words already visited (PREDTEST_INIT for the first word).
 * Returns the updated state.  Bit 2 is used internally to remember that
 * an active G bit has already been seen.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    /* A word without governing bits contributes nothing.  */
    if (unlikely(g == 0)) {
        return flags;
    }

    /* C is taken from the first word visited, i.e. the last element.  */
    if ((flags & 4) == 0) {
        /* +4 sets bit 2, -1 removes the C seeded by PREDTEST_INIT.  */
        flags += 3;
        if ((d & pow2floor(g)) == 0) {
            flags |= 1;
        }
    }

    /* Bit 1 is sticky: set once any active element of D is set.  */
    if (d & g) {
        flags |= 2;
    }

    /* N follows the lowest active element seen so far, so overwrite it.  */
    return deposit32(flags, 31, 1, (d & (g & -g)) != 0);
}
87 
88 /* The same for a single word predicate.  */
89 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
90 {
91     return iter_predtest_fwd(d, g, PREDTEST_INIT);
92 }
93 
94 /* The same for a multi-word predicate.  */
95 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
96 {
97     uint32_t flags = PREDTEST_INIT;
98     uint64_t *d = vd, *g = vg;
99     uintptr_t i = 0;
100 
101     do {
102         flags = iter_predtest_fwd(d[i], g[i], flags);
103     } while (++i < words);
104 
105     return flags;
106 }
107 
/*
 * Expand the predicate bits for two 32-bit elements into a 64-bit mask:
 * bit 0 of BYTE selects the low 32 bits, bit 4 selects the high 32 bits.
 * All other bits of BYTE are ignored.
 */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    uint64_t mask = 0;

    if (byte & 0x01) {
        mask |= 0x00000000ffffffffull;
    }
    if (byte & 0x10) {
        mask |= 0xffffffff00000000ull;
    }
    return mask;
}
118 
/*
 * Define helper NAME, which stores FUNC(n, m, g) into d for every 64-bit
 * word of the operands.  The operand size in bytes comes from
 * simd_oprsz(desc).
 */
#define LOGICAL_PPPP(NAME, FUNC)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    const uintptr_t words = (uintptr_t)simd_oprsz(desc) / 8;              \
    uint64_t *pd = vd, *pn = vn, *pm = vm, *pg = vg;                      \
    uintptr_t w = 0;                                                      \
                                                                          \
    while (w < words) {                                                   \
        pd[w] = FUNC(pn[w], pm[w], pg[w]);                                \
        w++;                                                              \
    }                                                                     \
}
129 
/*
 * Three-operand bitwise operations: N and M are the sources and G is the
 * governing mask.  Every result except DO_SEL is zero wherever G is zero;
 * DO_SEL picks N where G is set and M elsewhere.
 */
#define DO_AND(N, M, G)  ((N) & (M) & (G))
#define DO_BIC(N, M, G)  ((N) & ~(M) & (G))
#define DO_EOR(N, M, G)  ((G) & ((N) ^ (M)))
#define DO_ORR(N, M, G)  ((G) & ((N) | (M)))
#define DO_ORN(N, M, G)  ((G) & ((N) | ~(M)))
#define DO_NOR(N, M, G)  (~(N) & ~(M) & (G))
#define DO_NAND(N, M, G) ((~(N) | ~(M)) & (G))
#define DO_SEL(N, M, G)  ((M) ^ (((N) ^ (M)) & (G)))
138 
139 LOGICAL_PPPP(sve_and_pppp, DO_AND)
140 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
141 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
142 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
143 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
144 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
145 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
146 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
147 
148 #undef DO_AND
149 #undef DO_BIC
150 #undef DO_EOR
151 #undef DO_ORR
152 #undef DO_ORN
153 #undef DO_NOR
154 #undef DO_NAND
155 #undef DO_SEL
156 #undef LOGICAL_PPPP
157 
/*
 * General predicated three-operand expander: define helper NAME, which
 * computes d = OP(n, m) for every element of type TYPE whose predicate
 * bit is set and leaves the other elements of d untouched.
 *
 * The register file is stored in host-endian order, so every element
 * offset is passed through H before use.  The predicate is consumed
 * 16 bits at a time, one bit per vector byte, and only the lowest bit
 * belonging to each element is tested.
 *
 * ??? The compiler is not expected to vectorize this on its own.  With
 * tables converting bit masks to byte masks, and care over byte/word
 * ordering, gcc generic vectors could process 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    const intptr_t total = simd_oprsz(desc);                             \
    intptr_t i = 0;                                                      \
                                                                         \
    while (i < total) {                                                  \
        /* One 16-bit predicate chunk covers 16 vector bytes.  */        \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                  \
        do {                                                             \
            if (pg & 1) {                                                \
                TYPE nn = *(TYPE *)(vn + H(i));                          \
                TYPE mm = *(TYPE *)(vm + H(i));                          \
                TYPE *out = (TYPE *)(vd + H(i));                         \
                *out = OP(nn, mm);                                       \
            }                                                            \
            pg >>= sizeof(TYPE);                                         \
            i += sizeof(TYPE);                                           \
        } while (i & 15);                                                \
    }                                                                    \
}
182 
/*
 * Predicated three-operand expander for 64-bit elements.  Each element
 * has one predicate byte, of which only bit 0 is tested; inactive
 * elements of d are left untouched.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    const intptr_t elements = simd_oprsz(desc) / 8;                      \
    TYPE *dst = vd, *src_n = vn, *src_m = vm;                            \
    uint8_t *pred = vg;                                                  \
    intptr_t e;                                                          \
                                                                         \
    for (e = 0; e < elements; e++) {                                     \
        TYPE nn, mm;                                                     \
        if (!(pred[H1(e)] & 1)) {                                        \
            continue;                                                    \
        }                                                                \
        nn = src_n[e];                                                   \
        mm = src_m[e];                                                   \
        dst[e] = OP(nn, mm);                                             \
    }                                                                    \
}
197 
/*
 * Two-operand element operations used with the predicated expanders.
 * Every argument is parenthesized so that the macros stay correct when
 * they are handed an expression rather than a plain identifier.
 */
#define DO_AND(N, M)  ((N) & (M))
#define DO_EOR(N, M)  ((N) ^ (M))
#define DO_ORR(N, M)  ((N) | (M))
#define DO_BIC(N, M)  ((N) & ~(M))
#define DO_ADD(N, M)  ((N) + (M))
#define DO_SUB(N, M)  ((N) - (M))
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  ((N) * (M))


/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) \
    (unlikely((M) == 0) ? 0 : unlikely((M) == -1) ? -(N) : (N) / (M))
#define DO_UDIV(N, M) (unlikely((M) == 0) ? 0 : (N) / (M))
219 
220 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
221 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
222 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
223 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
224 
225 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
226 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
227 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
228 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
229 
230 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
231 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
232 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
233 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
234 
235 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
236 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
237 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
238 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
239 
240 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
241 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
242 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
243 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
244 
245 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
246 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
247 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
248 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
249 
250 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
251 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
252 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
253 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
254 
255 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
256 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
257 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
258 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
259 
260 DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
261 DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
262 DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
263 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
264 
265 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
266 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
267 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
268 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
269 
270 DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
271 DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
272 DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
273 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
274 
275 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
276 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
277 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
278 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
279 
/*
 * High-half multiplies for 8, 16 and 32-bit elements.
 *
 * The arguments arrive already sign- or zero-extended into a type at
 * least twice as wide as the element, so the same code serves signed and
 * unsigned sources.  The product is formed in unsigned arithmetic: the
 * unsigned extremes (0xffff * 0xffff, 0xffffffff * 0xffffffff) do not fit
 * the signed computation type, and signed overflow is undefined in C.
 * Only the bits shifted down into the return type are kept, and modular
 * arithmetic gives the right bits for both signednesses.
 */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return ((uint32_t)n * (uint32_t)m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return ((uint32_t)n * (uint32_t)m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return ((uint64_t)n * (uint64_t)m) >> 32;
}
296 
/* High 64 bits of the product computed by muls64(); the low half is dropped. */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t low_half;
    uint64_t high_half;

    muls64(&low_half, &high_half, n, m);
    return high_half;
}
303 
/* High 64 bits of the product computed by mulu64(); the low half is dropped. */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t low_half;
    uint64_t high_half;

    mulu64(&low_half, &high_half, n, m);
    return high_half;
}
310 
311 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
312 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
313 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
314 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
315 
316 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
317 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
318 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
319 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
320 
321 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
322 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
323 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
324 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
325 
326 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
327 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
328 
329 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
330 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
331 
/*
 * Note that all bits of the shift are significant
 * and not modulo the element size.
 *
 * DO_ASR clamps the count to the element width minus one; DO_LSR and
 * DO_LSL yield zero once the count reaches the element width.  The
 * arguments are parenthesized so that an expression may be passed safely.
 */
#define DO_ASR(N, M)  ((N) >> MIN((M), sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  ((M) < sizeof(N) * 8 ? (N) >> (M) : 0)
#define DO_LSL(N, M)  ((M) < sizeof(N) * 8 ? (N) << (M) : 0)
337 
338 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
339 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
340 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
341 
342 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
343 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
344 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
345 
346 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
347 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
348 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
349 
350 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
351 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
352 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
353 
/*
 * Signed pairwise add-accumulate: split N into its two half-width signed
 * fields, then add both of them to M.
 */
static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int low = (int8_t)n;
    int high = (int8_t)(n >> 8);

    return m + low + high;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int32_t low = (int16_t)n;
    int32_t high = (int16_t)(n >> 16);

    return m + low + high;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int64_t low = (int32_t)n;
    int64_t high = (int32_t)(n >> 32);

    return m + low + high;
}
371 
372 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
373 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
374 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
375 
/*
 * Unsigned pairwise add-accumulate: split N into its two half-width
 * unsigned fields, then add both of them to M.
 */
static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    unsigned low = n & 0xff;
    unsigned high = n >> 8;

    return m + low + high;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint32_t low = n & 0xffff;
    uint32_t high = n >> 16;

    return m + low + high;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint64_t low = n & 0xffffffffu;
    uint64_t high = n >> 32;

    return m + low + high;
}
393 
394 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
395 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
396 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
397 
398 #define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
399 #define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
400 #define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
401 #define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)
402 
403 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
404 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
405 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
406 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
407 
408 #define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
409 #define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
410 #define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
411 #define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)
412 
413 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
414 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
415 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
416 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
417 
418 /*
419  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
420  * We pass in a pointer to a dummy saturation field to trigger
421  * the saturating arithmetic but discard the information about
422  * whether it has occurred.
423  */
424 #define do_sqshl_b(n, m) \
425    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
426 #define do_sqshl_h(n, m) \
427    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
428 #define do_sqshl_s(n, m) \
429    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
430 #define do_sqshl_d(n, m) \
431    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
432 
433 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
434 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
435 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
436 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
437 
438 #define do_uqshl_b(n, m) \
439    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
440 #define do_uqshl_h(n, m) \
441    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
442 #define do_uqshl_s(n, m) \
443    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
444 #define do_uqshl_d(n, m) \
445    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
446 
447 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
448 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
449 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
450 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
451 
452 #define do_sqrshl_b(n, m) \
453    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
454 #define do_sqrshl_h(n, m) \
455    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
456 #define do_sqrshl_s(n, m) \
457    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
458 #define do_sqrshl_d(n, m) \
459    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
460 
461 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
462 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
463 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
464 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
465 
466 #undef do_sqrshl_d
467 
468 #define do_uqrshl_b(n, m) \
469    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
470 #define do_uqrshl_h(n, m) \
471    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
472 #define do_uqrshl_s(n, m) \
473    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
474 #define do_uqrshl_d(n, m) \
475    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
476 
477 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
478 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
479 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
480 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
481 
482 #undef do_uqrshl_d
483 
/*
 * Halving add.  The 8/16/32-bit form widens to 64 bits before adding.
 * The 64-bit form halves each operand first and then adds back the carry
 * out of the two discarded low bits, so no wider type is needed.
 */
#define DO_HADD_BHS(n, m)  (((int64_t)(n) + (m)) >> 1)
#define DO_HADD_D(n, m)    (((n) >> 1) + ((m) >> 1) + ((n) & (m) & 1))
486 
487 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
488 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
489 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
490 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
491 
492 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
493 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
494 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
495 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
496 
/*
 * Rounding halving add: (n + m + 1) >> 1.  The 8/16/32-bit form widens
 * to 64 bits before adding.  The 64-bit form halves each operand first
 * and then adds one if either discarded low bit was set.
 */
#define DO_RHADD_BHS(n, m)  (((int64_t)(n) + (m) + 1) >> 1)
#define DO_RHADD_D(n, m)    (((n) >> 1) + ((m) >> 1) + (((n) | (m)) & 1))
499 
500 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
501 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
502 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
503 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
504 
505 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
506 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
507 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
508 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
509 
/*
 * Halving subtract: (n - m) >> 1.  The 8/16/32-bit form widens to
 * 64 bits before subtracting.  The 64-bit form halves each operand first
 * and then takes off the borrow produced by the discarded low bits.
 */
#define DO_HSUB_BHS(n, m)  (((int64_t)(n) - (m)) >> 1)
#define DO_HSUB_D(n, m)    (((n) >> 1) - ((m) >> 1) - (~(n) & (m) & 1))
512 
513 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
514 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
515 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
516 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
517 
518 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
519 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
520 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
521 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
522 
/*
 * Clamp VAL to the inclusive range [MIN, MAX].  The upper bound is
 * checked first.  The result is returned as int32_t, so callers that
 * saturate to an unsigned 32-bit range must convert it back to uint32_t.
 */
static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    if (val >= max) {
        return max;
    }
    if (val <= min) {
        return min;
    }
    return val;
}
527 
/*
 * Signed saturating add.  The narrow forms add in 64 bits and clamp the
 * sum to the element range with do_sat_bhs().
 */
#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)(n) + (m), INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)(n) + (m), INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)(n) + (m), INT32_MIN, INT32_MAX)

/*
 * 64-bit signed saturating add.  Returns n + m, or INT64_MAX / INT64_MIN
 * when the exact sum does not fit in int64_t.
 */
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    /* Add as unsigned: wrap-around is then defined by the language.  */
    int64_t r = (int64_t)((uint64_t)n + (uint64_t)m);

    /* Overflow iff the inputs share a sign and the result's sign differs. */
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* A wrapped negative result means the true sum was too large.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
541 
542 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
543 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
544 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
545 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
546 
/*
 * Unsigned saturating add.  The narrow forms add in 64 bits and clamp
 * the sum to the element range with do_sat_bhs().
 */
#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)(n) + (m), 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)(n) + (m), 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)(n) + (m), 0, UINT32_MAX)

/* 64-bit unsigned saturating add: n + m, or UINT64_MAX on wrap-around.  */
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t sum = n + m;

    /* A wrapped sum is smaller than either operand.  */
    if (sum < n) {
        return UINT64_MAX;
    }
    return sum;
}
556 
557 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
558 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
559 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
560 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
561 
/*
 * Signed saturating subtract.  The narrow forms subtract in 64 bits and
 * clamp the difference to the element range with do_sat_bhs().
 */
#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)(n) - (m), INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)(n) - (m), INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)(n) - (m), INT32_MIN, INT32_MAX)

/*
 * 64-bit signed saturating subtract.  Returns n - m, or INT64_MAX /
 * INT64_MIN when the exact difference does not fit in int64_t.
 */
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    /* Subtract as unsigned: wrap-around is then defined by the language. */
    int64_t r = (int64_t)((uint64_t)n - (uint64_t)m);

    /* Overflow iff the inputs differ in sign and the result's sign
       differs from that of the minuend.  */
    if (((r ^ n) & (n ^ m)) < 0) {
        /* A wrapped negative result means the true value was too large. */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
575 
576 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
577 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
578 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
579 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
580 
/*
 * Unsigned saturating subtract.  The narrow forms subtract in 64 bits
 * and clamp the difference to the element range with do_sat_bhs().
 */
#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)(n) - (m), 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)(n) - (m), 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)(n) - (m), 0, UINT32_MAX)

/* 64-bit unsigned saturating subtract: n - m, floored at zero.  */
static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    if (n <= m) {
        return 0;
    }
    return n - m;
}
589 
590 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
591 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
592 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
593 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
594 
/*
 * Saturating add of an unsigned value M to a signed value N, giving a
 * signed result.  The narrow forms reinterpret N as signed, add in
 * 64 bits and clamp to the signed element range with do_sat_bhs().
 */
#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)(n) + (m), INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)(n) + (m), INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)(n) + (m), INT32_MIN, INT32_MAX)

/*
 * 64-bit form: returns n + m saturated to INT64_MAX.  The result cannot
 * go below INT64_MIN because m is never negative.
 */
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            /* abs(n) is formed as unsigned so that INT64_MIN is safe. */
            if (m > -(uint64_t)n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}
624 
625 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
626 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
627 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
628 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
629 
/*
 * Saturating add of a signed value M to an unsigned value N, giving an
 * unsigned result.  The narrow forms reinterpret M as signed, add in
 * 64 bits and clamp to the unsigned element range with do_sat_bhs().
 */
#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)(n) + (int8_t)(m), 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)(n) + (int16_t)(m), 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)(n) + (int32_t)(m), 0, UINT32_MAX)

/* 64-bit form: returns n + m clamped to the range [0, UINT64_MAX].  */
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        /* abs(m) is formed as unsigned so that INT64_MIN is safe.  */
        return n < -(uint64_t)m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}
646 
647 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
648 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
649 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
650 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
651 
652 #undef DO_ZPZZ
653 #undef DO_ZPZZ_D
654 
/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements come from VN {I, I+1}.
 * If the slot I is odd, the elements come from VM {I-1, I}.
 * All four inputs of a pair are loaded before either output is written,
 * so writing the destination cannot disturb the inputs.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    const intptr_t total = simd_oprsz(desc);                             \
    intptr_t i = 0;                                                      \
                                                                         \
    while (i < total) {                                                  \
        /* One 16-bit predicate chunk covers 16 vector bytes.  */        \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                  \
        do {                                                             \
            const intptr_t even = i;                                     \
            const intptr_t odd = i + sizeof(TYPE);                       \
            TYPE n0 = *(TYPE *)(vn + H(even));                           \
            TYPE m0 = *(TYPE *)(vm + H(even));                           \
            TYPE n1 = *(TYPE *)(vn + H(odd));                            \
            TYPE m1 = *(TYPE *)(vm + H(odd));                            \
            if (pg & 1) {                                                \
                *(TYPE *)(vd + H(even)) = OP(n0, n1);                    \
            }                                                            \
            pg >>= sizeof(TYPE);                                         \
            if (pg & 1) {                                                \
                *(TYPE *)(vd + H(odd)) = OP(m0, m1);                     \
            }                                                            \
            pg >>= sizeof(TYPE);                                         \
            i += 2 * sizeof(TYPE);                                       \
        } while (i & 15);                                                \
    }                                                                    \
}
683 
/*
 * Pairwise three-operand expander for 64-bit elements.  The even slot
 * takes OP of the VN pair and the odd slot takes OP of the VM pair.  Each
 * slot is written only if bit 0 of its predicate byte is set, and all
 * four inputs are loaded before either output is written.
 */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    const intptr_t elements = simd_oprsz(desc) / 8;                      \
    TYPE *dst = vd, *src_n = vn, *src_m = vm;                            \
    uint8_t *pred = vg;                                                  \
    intptr_t e = 0;                                                      \
                                                                         \
    while (e < elements) {                                               \
        TYPE n0 = src_n[e], n1 = src_n[e + 1];                           \
        TYPE m0 = src_m[e], m1 = src_m[e + 1];                           \
        if (pred[H1(e)] & 1) {                                           \
            dst[e] = OP(n0, n1);                                         \
        }                                                                \
        if (pred[H1(e + 1)] & 1) {                                       \
            dst[e + 1] = OP(m0, m1);                                     \
        }                                                                \
        e += 2;                                                          \
    }                                                                    \
}
702 
703 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
704 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
705 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
706 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
707 
708 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
709 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
710 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
711 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
712 
713 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
714 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
715 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
716 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
717 
718 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
719 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
720 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
721 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
722 
723 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
724 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
725 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
726 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
727 
728 #undef DO_ZPZZ_PAIR
729 #undef DO_ZPZZ_PAIR_D
730 
/*
 * Pairwise three-operand expander for operations that take an extra
 * STATUS pointer, which is passed unchanged as the third argument of OP.
 * The even slot takes OP of the VN pair and the odd slot takes OP of the
 * VM pair.  All four inputs are loaded before either output is written.
 */
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,                \
                  void *status, uint32_t desc)                           \
{                                                                        \
    const intptr_t total = simd_oprsz(desc);                             \
    intptr_t i = 0;                                                      \
                                                                         \
    while (i < total) {                                                  \
        /* One 16-bit predicate chunk covers 16 vector bytes.  */        \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                  \
        do {                                                             \
            const intptr_t even = i;                                     \
            const intptr_t odd = i + sizeof(TYPE);                       \
            TYPE n0 = *(TYPE *)(vn + H(even));                           \
            TYPE m0 = *(TYPE *)(vm + H(even));                           \
            TYPE n1 = *(TYPE *)(vn + H(odd));                            \
            TYPE m1 = *(TYPE *)(vm + H(odd));                            \
            if (pg & 1) {                                                \
                *(TYPE *)(vd + H(even)) = OP(n0, n1, status);            \
            }                                                            \
            pg >>= sizeof(TYPE);                                         \
            if (pg & 1) {                                                \
                *(TYPE *)(vd + H(odd)) = OP(m0, m1, status);             \
            }                                                            \
            pg >>= sizeof(TYPE);                                         \
            i += 2 * sizeof(TYPE);                                       \
        } while (i & 15);                                                \
    }                                                                    \
}
754 
/*
 * Instantiate the predicated pairwise floating-point helpers for half,
 * single and double precision.  Per the expander body above, within each
 * pair of result slots the first receives OP applied to two adjacent
 * elements of vn and the second receives OP applied to the two adjacent
 * elements of vm at the same offsets; the status argument is forwarded
 * to OP unchanged.
 */

/* Pairwise add. */
755 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
756 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
757 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
758 
/* Pairwise maximum via the softfloat *_maxnum operations. */
759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
760 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
761 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
762 
/* Pairwise minimum via the softfloat *_minnum operations. */
763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
764 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
765 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
766 
/* Pairwise maximum via the softfloat *_max operations. */
767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
768 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
769 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
770 
/* Pairwise minimum via the softfloat *_min operations. */
771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
772 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
773 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
774 
/* The expander is not used past this point. */
775 #undef DO_ZPZZ_PAIR_FP
776 
/*
 * Predicated three-operand expander with a "wide" second source.
 *
 * For D = N op M, one TYPEW value is fetched from vm per 8-byte column
 * and reused for every TYPE-sized element of vn in that column.  An
 * element is written to vd only when the low bit of its predicate
 * field is set; inactive elements of vd are left untouched.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    intptr_t off = 0;                                                   \
    while (off < total) {                                               \
        /* One predicate byte covers the 8 vector bytes of a column. */ \
        uint8_t pred = *(uint8_t *)(vg + H1(off >> 3));                 \
        TYPEW wide = *(TYPEW *)(vm + off);                              \
        do {                                                            \
            if (pred & 1) {                                             \
                TYPE elt = *(TYPE *)(vn + H(off));                      \
                *(TYPE *)(vd + H(off)) = OP(elt, wide);                 \
            }                                                           \
            /* Advance one element: sizeof(TYPE) bytes and pred bits. */ \
            off += sizeof(TYPE);                                        \
            pred >>= sizeof(TYPE);                                      \
        } while (off & 7);                                              \
    }                                                                   \
}
797 
/*
 * Predicated operations whose second operand is a 64-bit value shared by
 * all byte/halfword/word elements of the same 64-bit column (TYPEW is
 * uint64_t throughout).  DO_ASR/DO_LSR/DO_LSL are defined earlier in the
 * file.  The asr variants use a signed element type while lsr/lsl use an
 * unsigned one -- presumably so that the right shift is arithmetic or
 * logical respectively; confirm against the DO_* definitions.
 */
798 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
799 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
800 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
801 
802 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
803 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
804 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
805 
806 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
807 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
808 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
809 
/* The expander is not used past this point. */
810 #undef DO_ZPZW
811 
/*
 * General predicated two-operand (unary) expander.
 *
 * Walks the vector in 16-byte steps, loading 16 predicate bits per step.
 * For each TYPE-sized element whose low predicate bit is set, stores
 * OP(element of vn) into vd; inactive elements of vd are not written.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    const intptr_t total = simd_oprsz(desc);                    \
    intptr_t off = 0;                                           \
    while (off < total) {                                       \
        uint16_t pred = *(uint16_t *)(vg + H1_2(off >> 3));     \
        do {                                                    \
            if (pred & 1) {                                     \
                TYPE src = *(TYPE *)(vn + H(off));              \
                *(TYPE *)(vd + H(off)) = OP(src);               \
            }                                                   \
            /* One predicate bit per vector byte. */            \
            off += sizeof(TYPE);                                \
            pred >>= sizeof(TYPE);                              \
        } while (off & 15);                                     \
    }                                                           \
}
829 
/*
 * Predicated unary expander specialised for 64-bit elements: the vector
 * is indexed directly as an array of TYPE and each element is governed
 * by bit 0 of its own predicate byte.
 */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    const intptr_t count = simd_oprsz(desc) / 8;                \
    const uint8_t *pred = vg;                                   \
    const TYPE *src = vn;                                       \
    TYPE *dst = vd;                                             \
    for (intptr_t idx = 0; idx < count; ++idx) {                \
        if (!(pred[H1(idx)] & 1)) {                             \
            continue;                                           \
        }                                                       \
        TYPE val = src[idx];                                    \
        dst[idx] = OP(val);                                     \
    }                                                           \
}
844 
/*
 * Count leading sign bits.  The 8- and 16-bit elements are passed to the
 * 32-bit clrsb32(), so the int8_t/int16_t value is first promoted to 32
 * bits; the 24 (resp. 16) extra copies of the sign bit introduced by that
 * promotion are subtracted from the count.
 */
845 #define DO_CLS_B(N)   (clrsb32(N) - 24)
846 #define DO_CLS_H(N)   (clrsb32(N) - 16)
847 
848 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
849 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
850 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
851 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
852 
/*
 * Count leading zeros.  As above, but the elements are unsigned, so the
 * promotion to 32 bits adds 24 (resp. 16) leading zero bits which are
 * subtracted again.
 */
853 #define DO_CLZ_B(N)   (clz32(N) - 24)
854 #define DO_CLZ_H(N)   (clz32(N) - 16)
855 
856 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
857 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
858 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
859 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
860 
/* Per-element population count using the width-matched ctpop helper. */
861 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
862 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
863 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
864 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
865 
/*
 * Logical NOT: the result is 1 when the element is zero, otherwise 0.
 * NOTE(review): N is not parenthesized in this and the following op
 * macros; that is safe here because DO_ZPZ/DO_ZPZ_D always pass a plain
 * local variable.
 */
866 #define DO_CNOT(N)    (N == 0)
867 
868 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
869 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
870 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
871 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
872 
/*
 * Floating-point absolute value done on the raw bit pattern:
 * (all-ones of the element type) >> 1 is a mask of every bit except the
 * top one, so the AND clears the sign bit.
 */
873 #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
874 
875 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
876 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
877 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
878 
/*
 * Floating-point negate on the raw bit pattern: XOR with the complement
 * of the mask used by DO_FABS flips the sign bit.  For the 16-bit case
 * the complement also has bits above bit 15 set (integer promotion);
 * those are discarded when the result is stored back into the element.
 */
879 #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
880 
881 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
882 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
883 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
884 
/* Bitwise NOT of each active element. */
885 #define DO_NOT(N)    (~N)
886 
887 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
888 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
889 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
890 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
891 
/*
 * Sign/zero extension of the low byte, halfword or word of each element.
 * The cast truncates the element to the narrow type; storing the result
 * back into the (wider) element type then sign- or zero-extends it.
 */
892 #define DO_SXTB(N)    ((int8_t)N)
893 #define DO_SXTH(N)    ((int16_t)N)
894 #define DO_SXTS(N)    ((int32_t)N)
895 #define DO_UXTB(N)    ((uint8_t)N)
896 #define DO_UXTH(N)    ((uint16_t)N)
897 #define DO_UXTS(N)    ((uint32_t)N)
898 
/* Signed extensions; the suffix after the last '_' is the element size. */
899 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
900 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
901 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
902 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
903 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
904 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
905 
/* Unsigned extensions, same layout as above. */
906 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
907 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
908 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
909 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
910 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
911 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
912 
/*
 * Integer absolute value (non-saturating).
 * NOTE(review): for the 32- and 64-bit instantiations, -N on the most
 * negative value overflows in signed arithmetic; presumably the build
 * relies on wrapping signed overflow (-fwrapv) -- confirm.
 */
913 #define DO_ABS(N)    (N < 0 ? -N : N)
914 
915 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
916 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
917 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
918 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
919 
/*
 * Integer negate.  Instantiated with unsigned element types, so the
 * stored result is the two's-complement negation modulo the element
 * width.
 */
920 #define DO_NEG(N)    (-N)
921 
922 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
923 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
924 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
925 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
926 
/* revb: apply the width-matched bswap helper to each active element. */
927 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
928 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
929 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
930 
/* revh: apply hswap32/hswap64 (defined outside this chunk). */
931 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
932 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
933 
/* revw: apply wswap64 (defined outside this chunk). */
934 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
935 
936 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
937 {
938     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
939     uint64_t *d = vd, *n = vn;
940     uint8_t *pg = vg;
941 
942     for (i = 0; i < opr_sz; i += 2) {
943         if (pg[H1(i)] & 1) {
944             uint64_t n0 = n[i + 0];
945             uint64_t n1 = n[i + 1];
946             d[i + 0] = n1;
947             d[i + 1] = n0;
948         }
949     }
950 }
951 
/* rbit: apply the width-matched revbit helper to each active element. */
952 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
953 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
954 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
955 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
956 
/*
 * Saturating absolute value.  min_ is the bit pattern with only the top
 * bit of the element set, i.e. the most negative value of the signed
 * element type.  That one input cannot be negated, so -min_ - 1 is
 * returned for it instead; every other negative input is negated.
 * X is evaluated exactly once (copied into x_).
 */
957 #define DO_SQABS(X) \
958     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
959        x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
960 
961 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
962 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
963 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
964 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
965 
/*
 * Saturating negate, same special case as DO_SQABS for the top-bit-only
 * pattern.  Note the instantiations below use unsigned element types, so
 * the comparison is on the raw bit pattern and the stored result is
 * reduced modulo the element width.
 */
966 #define DO_SQNEG(X) \
967     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
968        x_ == min_ ? -min_ - 1 : -x_; })
969 
970 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
971 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
972 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
973 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
974 
/*
 * 32-bit-only helpers that forward each active element to
 * helper_recpe_u32 / helper_rsqrte_u32 (defined outside this file chunk).
 */
975 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
976 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
977 
/*
 * Unpredicated three-operand expander with a "wide" second source: one
 * TYPEW value is read from vm per 8-byte column and combined with every
 * TYPE-sized element of vn in that column.
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    const intptr_t total = simd_oprsz(desc);                   \
    intptr_t off = 0;                                          \
    while (off < total) {                                      \
        TYPEW wide = *(TYPEW *)(vm + off);                     \
        do {                                                   \
            TYPE elt = *(TYPE *)(vn + H(off));                 \
            *(TYPE *)(vd + H(off)) = OP(elt, wide);            \
            off += sizeof(TYPE);                               \
        } while (off & 7);                                     \
    }                                                          \
}
993 
/*
 * Unpredicated operations with a 64-bit second operand shared by all
 * narrower elements of the same column.  As with the predicated forms,
 * asr uses a signed element type and lsr/lsl an unsigned one.
 */
994 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
995 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
996 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
997 
998 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
999 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1000 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1001 
1002 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1003 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1004 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1005 
1006 #undef DO_ZZW
1007 
/*
 * Retire the unary expanders and some of the op macros defined for them
 * above; the remaining op macros are not undefined here.
 */
1008 #undef DO_CLS_B
1009 #undef DO_CLS_H
1010 #undef DO_CLZ_B
1011 #undef DO_CLZ_H
1012 #undef DO_CNOT
1013 #undef DO_FABS
1014 #undef DO_FNEG
1015 #undef DO_ABS
1016 #undef DO_NEG
1017 #undef DO_ZPZ
1018 #undef DO_ZPZ_D
1019 
/*
 * Unpredicated widening three-operand expander.
 *
 * Each TYPEW-sized column of vn and vm holds two TYPEN elements.  Bit 0
 * of the descriptor data chooses which of the two is taken from vn and
 * bit 1 which is taken from vm (0 = lower offset, 1 = upper offset).
 * Both are converted to TYPEW before OP is applied, and the TYPEW result
 * fills the whole column of vd.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    /* Byte offset of the chosen narrow element within its column. */   \
    const int n_half = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    const int m_half = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (intptr_t col = 0; col < total; col += sizeof(TYPEW)) {         \
        TYPEW lhs = *(TYPEN *)(vn + HN(col + n_half));                  \
        TYPEW rhs = *(TYPEN *)(vm + HN(col + m_half));                  \
        *(TYPEW *)(vd + HW(col)) = OP(lhs, rhs);                        \
    }                                                                   \
}
1036 
/*
 * Widening add/sub/absolute-difference/multiply helpers.  Each line pairs
 * a wide result type with the narrow source type of half its size; the
 * signed variants use signed types so that the narrow elements are
 * sign-extended when widened, the unsigned variants zero-extend.
 * DO_ADD/DO_SUB/DO_ABD/DO_MUL are defined earlier in the file.
 */

/* Signed widening add. */
1037 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1038 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1039 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1040 
/* Signed widening subtract. */
1041 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1042 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1043 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1044 
/* Signed widening absolute difference. */
1045 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1046 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1047 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1048 
/* Unsigned widening add. */
1049 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1050 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1051 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1052 
/* Unsigned widening subtract. */
1053 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1054 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1055 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1056 
/* Unsigned widening absolute difference. */
1057 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1058 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1059 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1060 
/* Signed widening multiply. */
1061 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1062 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1063 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1064 
/* Unsigned widening multiply. */
1065 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1066 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1067 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1068 
/*
 * Saturating doubling multiply producing a 16-bit result.
 * The product itself cannot overflow (the callers pass widened narrow
 * elements), but doubling it can, hence the saturating add of the
 * product to itself.
 */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t product = n * m;

    return DO_SQADD_H(product, product);
}
1075 
/*
 * Saturating doubling multiply producing a 32-bit result: form the
 * product, then double it with a saturating add of the product to itself.
 */
static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t product = n * m;

    return DO_SQADD_S(product, product);
}
1081 
/*
 * Saturating doubling multiply producing a 64-bit result: form the
 * product, then double it with a saturating add of the product to itself.
 */
static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t product = n * m;

    return do_sqadd_d(product, product);
}
1087 
/*
 * Widening saturating doubling multiply: the selected narrow elements
 * are widened by the expander and passed to the do_sqdmull_* helper of
 * the wide element size.
 */
1088 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1089 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1090 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1091 
/* The expander is not used past this point. */
1092 #undef DO_ZZZ_TB
1093 
/*
 * Unpredicated "wide op narrow" expander.  The first operand is a full
 * TYPEW element of vn; the second is one TYPEN element of vm, chosen
 * within the TYPEW column by bit 0 of the descriptor data and converted
 * to TYPEW before OP is applied.
 */
#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    const intptr_t total = simd_oprsz(desc);                   \
    const int m_half = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    for (intptr_t col = 0; col < total; col += sizeof(TYPEW)) { \
        TYPEW lhs = *(TYPEW *)(vn + HW(col));                  \
        TYPEW rhs = *(TYPEN *)(vm + HN(col + m_half));         \
        *(TYPEW *)(vd + HW(col)) = OP(lhs, rhs);               \
    }                                                          \
}
1105 
/*
 * Wide add/subtract helpers: a wide element of vn combined with a
 * widened narrow element of vm.  Signed variants sign-extend the narrow
 * element, unsigned variants zero-extend it.
 */
1106 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1107 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1108 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1109 
1110 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1111 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1112 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1113 
1114 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1115 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1116 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1117 
1118 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1119 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1120 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1121 
/* The expander is not used past this point. */
1122 #undef DO_ZZZ_WTB
1123 
/*
 * Unpredicated same-width expander working on pairs of TYPE elements.
 * Bit 0 of the descriptor data picks which element of each pair is read
 * from vn and also which element of the pair in vd is written; bit 1
 * picks the element read from vm.  The other element of each pair in vd
 * is not written.
 */
#define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    const intptr_t n_slot = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    const intptr_t m_slot = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (intptr_t pair = 0; pair < total; pair += 2 * sizeof(TYPE)) {   \
        TYPE lhs = *(TYPE *)(vn + H(pair + n_slot));                    \
        TYPE rhs = *(TYPE *)(vm + H(pair + m_slot));                    \
        *(TYPE *)(vd + H(pair + n_slot)) = OP(lhs, rhs);                \
    }                                                                   \
}
1136 
/*
 * Interleaving exclusive-OR helpers for all four element sizes.  Only
 * the element of each pair selected by the descriptor is written; see
 * DO_ZZZ_NTB.  DO_EOR is defined earlier in the file.
 */
1137 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1138 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1139 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1140 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1141 
/* The expander is not used past this point. */
1142 #undef DO_ZZZ_NTB
1143 
/*
 * Unpredicated widening accumulate expander: D = OP(N, M) + A.
 * The descriptor data selects which TYPEN element of each TYPEW column
 * is used; the same selection applies to both vn and vm.  Both narrow
 * elements are converted to TYPEW, and the accumulator is a full TYPEW
 * element of va.
 */
#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    const intptr_t total = simd_oprsz(desc);                    \
    const intptr_t half = simd_data(desc) * sizeof(TYPEN);      \
    for (intptr_t col = 0; col < total; col += sizeof(TYPEW)) { \
        TYPEW lhs = *(TYPEN *)(vn + HN(col + half));            \
        TYPEW rhs = *(TYPEN *)(vm + HN(col + half));            \
        TYPEW acc = *(TYPEW *)(va + HW(col));                   \
        *(TYPEW *)(vd + HW(col)) = OP(lhs, rhs) + acc;          \
    }                                                           \
}
1156 
/* Widening absolute-difference-and-accumulate, signed then unsigned. */
1157 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1158 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1159 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1160 
1161 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1162 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1163 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1164 
/* Widening multiply-add, signed then unsigned. */
1165 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1166 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1167 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1168 
1169 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1170 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1171 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1172 
/*
 * Negated product, turning the expander's "OP(n, m) + a" into a
 * multiply-subtract.  NOTE(review): the expansion has no outer
 * parentheses; inside DO_ZZZW_ACC it reads "-(nn * mm) + aa", which is
 * the intended value, but the macro is not safe in arbitrary contexts.
 */
1173 #define DO_NMUL(N, M)  -(N * M)
1174 
/* Widening multiply-subtract, signed then unsigned. */
1175 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1176 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1177 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1178 
1179 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1180 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1181 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1182 
/* The expander is not used past this point (DO_NMUL stays defined). */
1183 #undef DO_ZZZW_ACC
1184 
/*
 * Narrowing expander, "bottom" form.  Each TYPE element of vn is passed
 * through OP, the result is masked to the low half of the element's bits
 * (sizeof(TYPE) * 4 of them) and stored as a full TYPE element in vd, so
 * the upper half of every destination element is written as zero.
 */
#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    const intptr_t total = simd_oprsz(desc);                 \
    for (intptr_t off = 0; off < total; off += sizeof(TYPE)) { \
        TYPE wide = *(TYPE *)(vn + off);                     \
        TYPE low = OP(wide) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
        *(TYPE *)(vd + off) = low;                           \
    }                                                        \
}
1195 
/*
 * Narrowing expander, "top" form.  Each TYPE element of vn is passed
 * through OP and the result is stored as a TYPEN value at byte offset
 * H(sizeof(TYPEN)) inside the corresponding TYPE-sized slot of vd; the
 * other half of that slot is not written.
 */
#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    const intptr_t top = H(sizeof(TYPEN));                              \
    for (intptr_t off = 0; off < total; off += sizeof(TYPE)) {          \
        TYPE wide = *(TYPE *)(vn + off);                                \
        *(TYPEN *)(vd + off + top) = OP(wide);                          \
    }                                                                   \
}
1205 
/*
 * Signed saturating narrow: do_sat_bhs() (defined outside this chunk) is
 * given the bounds of the signed type half the width of the source
 * element.  The suffix names the source element size.
 */
1206 #define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
1207 #define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
1208 #define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)
1209 
1210 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1211 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1212 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1213 
1214 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1215 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1216 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1217 
/*
 * Unsigned saturating narrow: same helper with the bounds
 * [0, UINTn_MAX] of the half-width unsigned type.
 */
1218 #define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
1219 #define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
1220 #define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)
1221 
1222 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1223 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1224 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1225 
1226 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1227 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1228 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1229 
/*
 * Signed-to-unsigned saturating narrow: the source elements are read as
 * signed values but clamped with the unsigned bounds above.
 */
1230 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1231 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1232 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1233 
1234 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1235 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1236 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1237 
/* Both narrowing expanders are not used past this point. */
1238 #undef DO_XTNB
1239 #undef DO_XTNT
1240 
1241 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1242 {
1243     intptr_t i, opr_sz = simd_oprsz(desc);
1244     int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1245     uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1246     uint32_t *a = va, *n = vn;
1247     uint64_t *d = vd, *m = vm;
1248 
1249     for (i = 0; i < opr_sz / 8; ++i) {
1250         uint32_t e1 = a[2 * i + H4(0)];
1251         uint32_t e2 = n[2 * i + sel] ^ inv;
1252         uint64_t c = extract64(m[i], 32, 1);
1253         /* Compute and store the entire 33-bit result at once. */
1254         d[i] = c + e1 + e2;
1255     }
1256 }
1257 
1258 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1259 {
1260     intptr_t i, opr_sz = simd_oprsz(desc);
1261     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1262     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1263     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1264 
1265     for (i = 0; i < opr_sz / 8; i += 2) {
1266         Int128 e1 = int128_make64(a[i]);
1267         Int128 e2 = int128_make64(n[i + sel] ^ inv);
1268         Int128 c = int128_make64(m[i + 1] & 1);
1269         Int128 r = int128_add(int128_add(e1, e2), c);
1270         d[i + 0] = int128_getlo(r);
1271         d[i + 1] = int128_gethi(r);
1272     }
1273 }
1274 
/*
 * Widening "doubling multiply then accumulate" expander:
 * D = SUM_OP(A, DMUL_OP(N, M)).
 * Bits 0 and 1 of the descriptor data select which TYPEN element of each
 * TYPEW column is read from vn and vm respectively; both are converted
 * to TYPEW.  The accumulator is a full TYPEW element of va.
 */
#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    const int n_half = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    const int m_half = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (intptr_t col = 0; col < total; col += sizeof(TYPEW)) {         \
        TYPEW lhs = *(TYPEN *)(vn + HN(col + n_half));                  \
        TYPEW rhs = *(TYPEN *)(vm + HN(col + m_half));                  \
        TYPEW acc = *(TYPEW *)(va + HW(col));                           \
        *(TYPEW *)(vd + HW(col)) = SUM_OP(acc, DMUL_OP(lhs, rhs));      \
    }                                                                   \
}
1288 
/*
 * Saturating doubling multiply-add long: the doubled product from
 * do_sqdmull_* is combined with the accumulator using the saturating add
 * of the wide element size.
 */
1289 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1290            do_sqdmull_h, DO_SQADD_H)
1291 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1292            do_sqdmull_s, DO_SQADD_S)
1293 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1294            do_sqdmull_d, do_sqadd_d)
1295 
/* As above, but combining with the saturating subtract (acc - product). */
1296 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1297            do_sqdmull_h, DO_SQSUB_H)
1298 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1299            do_sqdmull_s, DO_SQSUB_S)
1300 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1301            do_sqdmull_d, do_sqsub_d)
1302 
/* The expander is not used past this point. */
1303 #undef DO_SQDMLAL
1304 
/*
 * Complex multiply-add expander (vector form).
 *
 * Elements are processed in pairs.  The descriptor data is the rotation:
 * its low bit picks which element of the pair supplies the vn operand
 * and the first vm operand (the other element of the vm pair supplies
 * the second), and the rotation value decides whether each product is
 * subtracted (flag passed to OP as its last argument):
 *   first result of the pair:  subtract when rot is 1 or 2;
 *   second result of the pair: subtract when rot >= 2.
 */
#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    const intptr_t count = simd_oprsz(desc) / sizeof(TYPE);     \
    const int rot = simd_data(desc);                            \
    const int first = rot & 1;                                  \
    const int second = first ^ 1;                               \
    const bool neg_even = rot == 1 || rot == 2;                 \
    const bool neg_odd = rot >= 2;                              \
    TYPE *dst = vd, *src_n = vn, *src_m = vm, *acc = va;        \
    for (intptr_t p = 0; p < count; p += 2) {                   \
        TYPE n_sel = src_n[H(p + first)];                       \
        TYPE m_first = src_m[H(p + first)];                     \
        TYPE m_second = src_m[H(p + second)];                   \
        dst[H(p)] = OP(n_sel, m_first, acc[H(p)], neg_even);    \
        dst[H(p + 1)] = OP(n_sel, m_second, acc[H(p + 1)], neg_odd); \
    }                                                           \
}
1322 
/*
 * Integer complex multiply-add step: A plus the product N * M, or A
 * minus the product when S is true.
 */
1323 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1324 
1325 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1326 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1327 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1328 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1329 
/*
 * Adapters giving the do_sqrdmlah_* helpers (defined outside this chunk)
 * the four-argument OP signature used by the CMLA expanders.  S is passed
 * through as the fourth helper argument and the fifth is always true.
 * The 16- and 32-bit helpers additionally take a uint32_t output
 * pointer, whose value is discarded here -- presumably a saturation
 * indicator; confirm against the helper definitions.
 */
1330 #define DO_SQRDMLAH_B(N, M, A, S) \
1331     do_sqrdmlah_b(N, M, A, S, true)
1332 #define DO_SQRDMLAH_H(N, M, A, S) \
1333     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1334 #define DO_SQRDMLAH_S(N, M, A, S) \
1335     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1336 #define DO_SQRDMLAH_D(N, M, A, S) \
1337     do_sqrdmlah_d(N, M, A, S, true)
1338 
1339 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1340 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1341 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1342 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1343 
/*
 * Complex multiply-add expander (indexed form).
 *
 * desc data bits [1:0] hold the rotation and bits [3:2] the index of the
 * element pair of vm used for a whole 16-byte segment (the index is
 * doubled because it counts pairs).  The rotation is decoded as in
 * DO_CMLA_FUNC: its low bit picks which element of each pair supplies
 * the vn operand and the first vm operand, sub_r/sub_i tell OP whether
 * to subtract the product for the first/second result of the pair.
 *
 * Every access, including the stores to vd, goes through the host-endian
 * adjustment H that matches TYPE.  The destination used to be indexed
 * with a hard-coded H2(), which only matches 16-bit elements and would
 * misplace the results of the 32-bit instantiations on big-endian hosts.
 */
#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
{                                                                           \
    intptr_t i, j, oprsz = simd_oprsz(desc);                                \
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
    bool sub_r = rot == 1 || rot == 2;                                      \
    bool sub_i = rot >= 2;                                                  \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
        /* The indexed vm pair is fixed for the whole 16-byte segment. */   \
        TYPE elt2_a = m[H(i + idx + sel_a)];                                \
        TYPE elt2_b = m[H(i + idx + sel_b)];                                \
        for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
            TYPE elt1_a = n[H(i + j + sel_a)];                              \
            d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);           \
            d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);   \
        }                                                                   \
    }                                                                       \
}
1364 
/*
 * Indexed complex multiply-add helpers, provided for 16- and 32-bit
 * elements only.
 */
1365 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1366 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1367 
/* Indexed forms using the do_sqrdmlah_* adapters. */
1368 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1369 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1370 
/*
 * Retire the complex multiply-add expanders and their op macros; the
 * DO_SQRDMLAH_* names are redefined with a different arity further down.
 */
1371 #undef DO_CMLA
1372 #undef DO_CMLA_FUNC
1373 #undef DO_CMLA_IDX_FUNC
1374 #undef DO_SQRDMLAH_B
1375 #undef DO_SQRDMLAH_H
1376 #undef DO_SQRDMLAH_S
1377 #undef DO_SQRDMLAH_D
1378 
/*
 * Complex dot-product step for a 32-bit accumulator.
 *
 * N and M each bundle four signed 8-bit elements, treated as two
 * 16-bit pairs.  For each pair the low byte of N is multiplied by byte
 * SEL_A of the matching M pair, the high byte of N by byte SEL_B of that
 * pair scaled by SUB_I (+1 or -1), and both products are added to A.
 * Returns the updated accumulator.
 */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int shift = 0; shift < 32; shift += 16) {
        uint32_t n_pair = n >> shift;
        uint32_t m_pair = m >> shift;
        int32_t n_lo = (int8_t)n_pair;
        int32_t n_hi = (int8_t)(n_pair >> 8);
        int32_t m_first = (int8_t)(m_pair >> (8 * sel_a));
        int32_t m_second = (int8_t)(m_pair >> (8 * sel_b));
        int32_t lo_term = n_lo * m_first;
        int32_t hi_term = n_hi * m_second * sub_i;

        a += lo_term + hi_term;
    }
    return a;
}
1393 
/*
 * Complex dot-product step for a 64-bit accumulator.
 *
 * N and M each bundle four signed 16-bit elements, treated as two
 * 32-bit pairs.  For each pair the low halfword of N is multiplied by
 * halfword SEL_A of the matching M pair, the high halfword of N by
 * halfword SEL_B of that pair scaled by SUB_I (+1 or -1), and both
 * products are added to A.  Returns the updated accumulator.
 */
static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int shift = 0; shift < 64; shift += 32) {
        uint64_t n_pair = n >> shift;
        uint64_t m_pair = m >> shift;
        int64_t n_lo = (int16_t)n_pair;
        int64_t n_hi = (int16_t)(n_pair >> 16);
        int64_t m_first = (int16_t)(m_pair >> (16 * sel_a));
        int64_t m_second = (int16_t)(m_pair >> (16 * sel_b));
        int64_t lo_term = n_lo * m_first;
        int64_t hi_term = n_hi * m_second * sub_i;

        a += lo_term + hi_term;
    }
    return a;
}
1407 
1408 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1409                               void *va, uint32_t desc)
1410 {
1411     int opr_sz = simd_oprsz(desc);
1412     int rot = simd_data(desc);
1413     int sel_a = rot & 1;
1414     int sel_b = sel_a ^ 1;
1415     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1416     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1417 
1418     for (int e = 0; e < opr_sz / 4; e++) {
1419         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1420     }
1421 }
1422 
1423 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1424                               void *va, uint32_t desc)
1425 {
1426     int opr_sz = simd_oprsz(desc);
1427     int rot = simd_data(desc);
1428     int sel_a = rot & 1;
1429     int sel_b = sel_a ^ 1;
1430     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1431     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1432 
1433     for (int e = 0; e < opr_sz / 8; e++) {
1434         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1435     }
1436 }
1437 
1438 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1439                              void *va, uint32_t desc)
1440 {
1441     int opr_sz = simd_oprsz(desc);
1442     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1443     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1444     int sel_a = rot & 1;
1445     int sel_b = sel_a ^ 1;
1446     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1447     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1448 
1449     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1450         uint32_t seg_m = m[seg + idx];
1451         for (int e = 0; e < 4; e++) {
1452             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1453                                    sel_a, sel_b, sub_i);
1454         }
1455     }
1456 }
1457 
1458 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1459                              void *va, uint32_t desc)
1460 {
1461     int seg, opr_sz = simd_oprsz(desc);
1462     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1463     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1464     int sel_a = rot & 1;
1465     int sel_b = sel_a ^ 1;
1466     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1467     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1468 
1469     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1470         uint64_t seg_m = m[seg + idx];
1471         for (int e = 0; e < 2; e++) {
1472             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1473                                    sel_a, sel_b, sub_i);
1474         }
1475     }
1476 }
1477 
1478 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1479 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1480 {                                                                       \
1481     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1482     intptr_t i, j, idx = simd_data(desc);                               \
1483     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1484     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1485         TYPE mm = m[i];                                                 \
1486         for (j = 0; j < segment; j++) {                                 \
1487             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1488         }                                                               \
1489     }                                                                   \
1490 }
1491 
1492 #define DO_SQRDMLAH_H(N, M, A) \
1493     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1494 #define DO_SQRDMLAH_S(N, M, A) \
1495     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1496 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1497 
1498 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1499 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1500 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1501 
1502 #define DO_SQRDMLSH_H(N, M, A) \
1503     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1504 #define DO_SQRDMLSH_S(N, M, A) \
1505     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1506 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1507 
1508 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1509 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1510 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1511 
1512 #undef DO_ZZXZ
1513 
1514 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1515 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1516 {                                                                         \
1517     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1518     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1519     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1520     for (i = 0; i < oprsz; i += 16) {                                     \
1521         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1522         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1523             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1524             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1525             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1526         }                                                                 \
1527     }                                                                     \
1528 }
1529 
1530 #define DO_MLA(N, M, A)  (A + N * M)
1531 
1532 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1533 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1534 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1535 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1536 
1537 #define DO_MLS(N, M, A)  (A - N * M)
1538 
1539 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1540 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1541 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1542 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1543 
1544 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1545 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1546 
1547 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1548 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1549 
1550 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1551 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1552 
1553 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1554 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1555 
1556 #undef DO_MLA
1557 #undef DO_MLS
1558 #undef DO_ZZXW
1559 
1560 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1561 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1562 {                                                                         \
1563     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1564     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1565     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1566     for (i = 0; i < oprsz; i += 16) {                                     \
1567         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1568         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1569             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1570             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1571         }                                                                 \
1572     }                                                                     \
1573 }
1574 
1575 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1576 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1577 
1578 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1579 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1580 
1581 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1582 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1583 
1584 #undef DO_ZZX
1585 
/*
 * Element-wise bit permutation expander: each element of Zn is the
 * data, the matching element of Zm is the mask, and OP also receives
 * the element width in bits.
 */
#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    const int bits = sizeof(TYPE) * 8;                         \
    const intptr_t total = simd_oprsz(desc);                   \
    for (intptr_t off = 0; off < total; off += sizeof(TYPE)) { \
        TYPE data = *(TYPE *)(vn + off);                       \
        TYPE mask = *(TYPE *)(vm + off);                       \
        *(TYPE *)(vd + off) = OP(data, mask, bits);            \
    }                                                          \
}
1596 
/*
 * Gather the bits of @data selected by @mask (low @n bits only) and
 * pack them contiguously at the bottom of the result, preserving
 * their relative order.  Unused high result bits are zero.
 */
static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t packed = 0;
    int out_pos = 0;

    for (int pos = 0; pos < n; pos++) {
        if (((mask >> pos) & 1) == 0) {
            continue;
        }
        packed |= ((data >> pos) & 1) << out_pos;
        out_pos++;
    }
    return packed;
}
1610 
/* BEXT helpers: DO_BITPERM applied with bitextract, one per element size. */
1611 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1612 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1613 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1614 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1615 
/*
 * Scatter the low-order bits of @data, in order, into the positions of
 * the set bits of @mask (low @n bits only).  Result bits where @mask
 * is clear are zero.
 */
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
{
    uint64_t spread = 0;
    int in_pos = 0;

    for (int pos = 0; pos < n; pos++) {
        if (((mask >> pos) & 1) == 0) {
            continue;
        }
        spread |= ((data >> in_pos) & 1) << pos;
        in_pos++;
    }
    return spread;
}
1629 
/* BDEP helpers: DO_BITPERM applied with bitdeposit, one per element size. */
1630 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1631 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1632 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1633 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1634 
/*
 * Partition the low @n bits of @data by @mask: the bits selected by
 * @mask are packed, in order, at the bottom of the result, and the
 * unselected bits are packed, in order, directly above them.
 */
static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
{
    uint64_t resm = 0, resu = 0;
    int db, rbm = 0, rbu = 0;

    for (db = 0; db < n; ++db) {
        uint64_t val = (data >> db) & 1;
        if ((mask >> db) & 1) {
            resm |= val << rbm++;
        } else {
            resu |= val << rbu++;
        }
    }

    /*
     * When all 64 bits are selected, rbm is 64 and there are no
     * unselected bits; shifting a 64-bit value by 64 is undefined,
     * so skip the merge in that case.
     */
    if (rbm >= 64) {
        return resm;
    }
    return resm | (resu << rbm);
}
1651 
/* BGRP helpers: DO_BITPERM applied with bitgroup, one per element size. */
1652 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1653 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1654 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1655 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1656 
/* No further bit-permute helpers are generated below this point. */
1657 #undef DO_BITPERM
1658 
1659 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1660 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1661 {                                                               \
1662     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1663     int sub_r = simd_data(desc);                                \
1664     if (sub_r) {                                                \
1665         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1666             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1667             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1668             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1669             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1670             acc_r = ADD_OP(acc_r, el2_i);                       \
1671             acc_i = SUB_OP(acc_i, el2_r);                       \
1672             *(TYPE *)(vd + H(i)) = acc_r;                       \
1673             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1674         }                                                       \
1675     } else {                                                    \
1676         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1677             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1678             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1679             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1680             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1681             acc_r = SUB_OP(acc_r, el2_i);                       \
1682             acc_i = ADD_OP(acc_i, el2_r);                       \
1683             *(TYPE *)(vd + H(i)) = acc_r;                       \
1684             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1685         }                                                       \
1686     }                                                           \
1687 }
1688 
1689 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1690 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1691 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1692 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1693 
1694 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1695 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1696 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1697 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1698 
1699 #undef DO_CADD
1700 
1701 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1702 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1703 {                                                              \
1704     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1705     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1706     int shift = simd_data(desc) >> 1;                          \
1707     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1708         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1709         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1710     }                                                          \
1711 }
1712 
1713 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1714 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1715 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1716 
1717 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1718 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1719 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1720 
1721 #undef DO_ZZI_SHLL
1722 
1723 /* Two-operand reduction expander, controlled by a predicate.
1724  * The difference between TYPERED and TYPERET has to do with
1725  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1726  * but TYPERET must be unsigned so that e.g. a 32-bit value
1727  * is not sign-extended to the ABI uint64_t return type.
1728  */
1729 /* ??? If we were to vectorize this by hand the reduction ordering
1730  * would change.  For integer operands, this is perfectly fine.
1731  */
1732 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1733 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1734 {                                                          \
1735     intptr_t i, opr_sz = simd_oprsz(desc);                 \
1736     TYPERED ret = INIT;                                    \
1737     for (i = 0; i < opr_sz; ) {                            \
1738         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1739         do {                                               \
1740             if (pg & 1) {                                  \
1741                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1742                 ret = OP(ret, nn);                         \
1743             }                                              \
1744             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1745         } while (i & 15);                                  \
1746     }                                                      \
1747     return (TYPERET)ret;                                   \
1748 }
1749 
1750 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1751 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1752 {                                                          \
1753     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1754     TYPEE *n = vn;                                         \
1755     uint8_t *pg = vg;                                      \
1756     TYPER ret = INIT;                                      \
1757     for (i = 0; i < opr_sz; i += 1) {                      \
1758         if (pg[H1(i)] & 1) {                               \
1759             TYPEE nn = n[i];                               \
1760             ret = OP(ret, nn);                             \
1761         }                                                  \
1762     }                                                      \
1763     return ret;                                            \
1764 }
1765 
1766 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1767 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1768 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1769 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1770 
1771 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1772 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1773 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1774 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1775 
1776 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1777 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1778 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1779 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1780 
1781 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1782 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1783 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1784 
1785 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1786 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1787 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1788 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1789 
1790 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1791 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1792 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1793 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1794 
1795 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1796 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1797 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1798 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1799 
1800 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1801 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1802 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1803 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1804 
1805 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1806 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1807 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1808 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1809 
1810 #undef DO_VPZ
1811 #undef DO_VPZ_D
1812 
1813 /* Two vector operand, one scalar operand, unpredicated.  */
1814 #define DO_ZZI(NAME, TYPE, OP)                                       \
1815 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1816 {                                                                    \
1817     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1818     TYPE s = s64, *d = vd, *n = vn;                                  \
1819     for (i = 0; i < opr_sz; ++i) {                                   \
1820         d[i] = OP(n[i], s);                                          \
1821     }                                                                \
1822 }
1823 
1824 #define DO_SUBR(X, Y)   (Y - X)
1825 
1826 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1827 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1828 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1829 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1830 
1831 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1832 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1833 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1834 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1835 
1836 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1837 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1838 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1839 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1840 
1841 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1842 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1843 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1844 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1845 
1846 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1847 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1848 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1849 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1850 
1851 #undef DO_ZZI
1852 
1853 #undef DO_AND
1854 #undef DO_ORR
1855 #undef DO_EOR
1856 #undef DO_BIC
1857 #undef DO_ADD
1858 #undef DO_SUB
1859 #undef DO_MAX
1860 #undef DO_MIN
1861 #undef DO_ABD
1862 #undef DO_MUL
1863 #undef DO_DIV
1864 #undef DO_ASR
1865 #undef DO_LSR
1866 #undef DO_LSL
1867 #undef DO_SUBR
1868 
1869 /* Similar to the ARM LastActiveElement pseudocode function, except the
1870    result is multiplied by the element size.  This includes the not found
1871    indication; e.g. not found for esz=3 is -8.  */
1872 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1873 {
1874     uint64_t mask = pred_esz_masks[esz];
1875     intptr_t i = words;
1876 
1877     do {
1878         uint64_t this_g = g[--i] & mask;
1879         if (this_g) {
1880             return i * 64 + (63 - clz64(this_g));
1881         }
1882     } while (i > 0);
1883     return (intptr_t)-1 << esz;
1884 }
1885 
1886 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1887 {
1888     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1889     uint32_t flags = PREDTEST_INIT;
1890     uint64_t *d = vd, *g = vg;
1891     intptr_t i = 0;
1892 
1893     do {
1894         uint64_t this_d = d[i];
1895         uint64_t this_g = g[i];
1896 
1897         if (this_g) {
1898             if (!(flags & 4)) {
1899                 /* Set in D the first bit of G.  */
1900                 this_d |= this_g & -this_g;
1901                 d[i] = this_d;
1902             }
1903             flags = iter_predtest_fwd(this_d, this_g, flags);
1904         }
1905     } while (++i < words);
1906 
1907     return flags;
1908 }
1909 
1910 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1911 {
1912     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1913     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1914     uint32_t flags = PREDTEST_INIT;
1915     uint64_t *d = vd, *g = vg, esz_mask;
1916     intptr_t i, next;
1917 
1918     next = last_active_element(vd, words, esz) + (1 << esz);
1919     esz_mask = pred_esz_masks[esz];
1920 
1921     /* Similar to the pseudocode for pnext, but scaled by ESZ
1922        so that we find the correct bit.  */
1923     if (next < words * 64) {
1924         uint64_t mask = -1;
1925 
1926         if (next & 63) {
1927             mask = ~((1ull << (next & 63)) - 1);
1928             next &= -64;
1929         }
1930         do {
1931             uint64_t this_g = g[next / 64] & esz_mask & mask;
1932             if (this_g != 0) {
1933                 next = (next & -64) + ctz64(this_g);
1934                 break;
1935             }
1936             next += 64;
1937             mask = -1;
1938         } while (next < words * 64);
1939     }
1940 
1941     i = 0;
1942     do {
1943         uint64_t this_d = 0;
1944         if (i == next / 64) {
1945             this_d = 1ull << (next & 63);
1946         }
1947         d[i] = this_d;
1948         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1949     } while (++i < words);
1950 
1951     return flags;
1952 }
1953 
1954 /*
1955  * Copy Zn into Zd, and store zero into inactive elements.
1956  * If inv, store zeros into the active elements.
1957  */
1958 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1959 {
1960     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1961     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1962     uint64_t *d = vd, *n = vn;
1963     uint8_t *pg = vg;
1964 
1965     for (i = 0; i < opr_sz; i += 1) {
1966         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1967     }
1968 }
1969 
1970 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1971 {
1972     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1973     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1974     uint64_t *d = vd, *n = vn;
1975     uint8_t *pg = vg;
1976 
1977     for (i = 0; i < opr_sz; i += 1) {
1978         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1979     }
1980 }
1981 
1982 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1983 {
1984     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1985     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1986     uint64_t *d = vd, *n = vn;
1987     uint8_t *pg = vg;
1988 
1989     for (i = 0; i < opr_sz; i += 1) {
1990         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1991     }
1992 }
1993 
1994 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1995 {
1996     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1997     uint64_t *d = vd, *n = vn;
1998     uint8_t *pg = vg;
1999     uint8_t inv = simd_data(desc);
2000 
2001     for (i = 0; i < opr_sz; i += 1) {
2002         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2003     }
2004 }
2005 
/* Three-operand expander, immediate operand, controlled by a predicate.
 * Active elements of Zd receive OP(element of Zn, immediate); inactive
 * elements of Zd are not written.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    const intptr_t total = simd_oprsz(desc);                    \
    TYPE imm = simd_data(desc);                                 \
    /* One 16-bit predicate chunk governs 16 vector bytes. */   \
    for (intptr_t base = 0; base < total; base += 16) {         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(base >> 3));      \
        for (intptr_t off = base; off < base + 16;              \
             off += sizeof(TYPE), pg >>= sizeof(TYPE)) {        \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(off));               \
                *(TYPE *)(vd + H(off)) = OP(nn, imm);           \
            }                                                   \
        }                                                       \
    }                                                           \
}
2024 
/* Similarly, specialized for 64-bit operands: bit 0 of each predicate
 * byte governs one element, and inactive elements are not written.
 */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    const intptr_t count = simd_oprsz(desc) / 8;                \
    TYPE *dst = vd, *src = vn;                                  \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pred = vg;                                         \
    for (intptr_t k = 0; k < count; k++) {                      \
        if (pred[H1(k)] & 1) {                                  \
            TYPE nn = src[k];                                   \
            dst[k] = OP(nn, imm);                               \
        }                                                       \
    }                                                           \
}
2040 
/* Plain shifts; whether DO_SHR is arithmetic or logical follows from
   the signedness of N.  */
#define DO_SHR(N, M)  ((N) >> (M))
#define DO_SHL(N, M)  ((N) << (M))

/* Arithmetic shift right for division.  A plain arithmetic shift
   rounds toward minus infinity, whereas signed division rounds toward
   zero.  Therefore, when N is negative, add 2**M-1 before shifting.  */
#define DO_ASRD(N, M) \
    (((N) + ((N) < 0 ? ((__typeof(N))1 << (M)) - 1 : 0)) >> (M))
2048 
/*
 * Unsigned rounding shift right: shift @x right by @sh and add back
 * the last bit shifted out (round half up).
 *
 * @sh == 0 returns @x unchanged; there is no bit to round with, and
 * evaluating x >> (sh - 1) would shift by an out-of-range count.
 * @sh == 64 leaves only the rounding bit (bit 63); larger counts give 0.
 */
static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (sh == 0) {
        return x;
    }
    if (sh < 64) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    }
    if (sh == 64) {
        return x >> 63;
    }
    return 0;
}
2059 
/*
 * Signed rounding shift right: arithmetic shift of @x right by @sh,
 * plus the last bit shifted out (round half up).
 *
 * @sh == 0 returns @x unchanged; there is no bit to round with, and
 * evaluating x >> (sh - 1) would shift by an out-of-range count.
 * For @sh >= 64 the result is always 0.
 */
static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (sh == 0) {
        return x;
    }
    if (sh < 64) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    }
    /* Rounding the sign bit always produces 0. */
    return 0;
}
2069 
/* Predicated shift right by immediate on signed element types. */
2070 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2071 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2072 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2073 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2074 
/* Predicated shift right by immediate on unsigned element types. */
2075 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2076 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2077 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2078 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2079 
/* Predicated shift left by immediate. */
2080 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2081 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2082 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2083 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2084 
/* Predicated shift right for division (DO_ASRD rounds toward zero). */
2085 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2086 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2087 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2088 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2089 
2090 /* SVE2 bitwise shift by immediate */
2091 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2092 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2093 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2094 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2095 
2096 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2097 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2098 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2099 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2100 
/* Rounding shifts right, signed (do_srshr) then unsigned (do_urshr). */
2101 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2102 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2103 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2104 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2105 
2106 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2107 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2108 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2109 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2110 
/*
 * Two-argument wrappers for the do_suqrshl_* helpers: the fourth
 * (third, for _d) argument is fixed to false and the value written
 * through the final pointer argument is discarded.
 */
2111 #define do_suqrshl_b(n, m) \
2112    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2113 #define do_suqrshl_h(n, m) \
2114    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2115 #define do_suqrshl_s(n, m) \
2116    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2117 #define do_suqrshl_d(n, m) \
2118    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2119 
2120 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2121 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2122 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2123 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2124 
/* The predicated-immediate expanders are not used past this point. */
2125 #undef DO_ASRD
2126 #undef DO_ZPZI
2127 #undef DO_ZPZI_D
2128 
/*
 * Narrowing shift, "bottom" form: OP is applied to each wide element
 * with the immediate, the result is truncated to TYPEN, and that value
 * is stored back as a whole wide element.
 */
#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    const intptr_t total = simd_oprsz(desc);                      \
    const int count = simd_data(desc);                            \
    for (intptr_t off = 0; off < total; off += sizeof(TYPEW)) {   \
        TYPEW wide = *(TYPEW *)(vn + off);                        \
        *(TYPEW *)(vd + off) = (TYPEN)OP(wide, count);            \
    }                                                             \
}
2139 
/*
 * Narrowing shift, "top" form: OP is applied to each wide element with
 * the immediate and the result is stored only into the narrow element
 * at offset sizeof(TYPEN) within the wide destination element; the
 * other narrow element of the destination is not written.
 */
#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    const intptr_t total = simd_oprsz(desc);                      \
    const int count = simd_data(desc);                            \
    for (intptr_t off = 0; off < total; off += sizeof(TYPEW)) {   \
        TYPEW wide = *(TYPEW *)(vn + HW(off));                    \
        *(TYPEN *)(vd + HN(off + sizeof(TYPEN))) = OP(wide, count); \
    }                                                             \
}
2150 
2151 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2152 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2153 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2154 
2155 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2156 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2157 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2158 
2159 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2160 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2161 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2162 
2163 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2164 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2165 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2166 
2167 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2168 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2169 #define DO_SQSHRUN_D(x, sh) \
2170     do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2171 
2172 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2173 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2174 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2175 
2176 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2177 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2178 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2179 
/*
 * Signed input, shifted by do_srshr (declared elsewhere; presumably a
 * signed rounding shift right), then clamped by do_sat_bhs to the
 * unsigned range of the narrow element: [0, UINTn_MAX].
 */
#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)

/* Bottom forms: result written over the whole wide destination slot.  */
DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)

/* Top forms: result written to the second narrow slot only.  */
DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2191 
2192 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2193 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2194 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2195 
2196 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2197 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2198 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2199 
2200 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2201 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2202 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2203 
/*
 * Signed input, shifted by do_srshr (declared elsewhere; presumably a
 * signed rounding shift right), then clamped by do_sat_bhs to the signed
 * range of the narrow element: [INTn_MIN, INTn_MAX].
 */
#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)

/* Bottom forms: result written over the whole wide destination slot.  */
DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)

/* Top forms: result written to the second narrow slot only.  */
DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2215 
2216 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2217 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2218 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2219 
2220 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2221 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2222 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2223 
2224 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2225 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2226 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2227 
/*
 * Unsigned input, shifted by do_urshr (declared elsewhere; presumably an
 * unsigned rounding shift right), then limited to UINTn_MAX of the
 * narrow element with MIN.
 */
#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)

/* Bottom forms: result written over the whole wide destination slot.  */
DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)

/* Top forms: result written to the second narrow slot only.  */
DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2239 
2240 #undef DO_SHRNB
2241 #undef DO_SHRNT
2242 
/*
 * Expand a two-operand "narrow, bottom" helper.
 *
 * For every TYPEW element, OP is applied to the elements of VN and VM
 * together with the constant SHIFT.  The result is truncated to TYPEN
 * and stored over the whole TYPEW slot of VD.
 */
#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
{                                                                           \
    intptr_t total = simd_oprsz(desc);                                      \
    intptr_t ofs = 0;                                                       \
    while (ofs < total) {                                                   \
        TYPEW lhs = *(TYPEW *)(vn + ofs);                                   \
        TYPEW rhs = *(TYPEW *)(vm + ofs);                                   \
        *(TYPEW *)(vd + ofs) = (TYPEN)OP(lhs, rhs, SHIFT);                  \
        ofs += sizeof(TYPEW);                                               \
    }                                                                       \
}
2253 
/*
 * Expand a two-operand "narrow, top" helper.
 *
 * For every TYPEW element (offset adjusted by HW), OP is applied to the
 * elements of VN and VM together with the constant SHIFT.  The result is
 * stored as a TYPEN in the second narrow slot of the matching wide
 * element of VD (offset + sizeof(TYPEN), adjusted by HN); the first
 * narrow slot is left untouched.
 */
#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
{                                                                           \
    intptr_t total = simd_oprsz(desc);                                      \
    intptr_t ofs = 0;                                                       \
    while (ofs < total) {                                                   \
        TYPEW lhs = *(TYPEW *)(vn + HW(ofs));                               \
        TYPEW rhs = *(TYPEW *)(vm + HW(ofs));                               \
        *(TYPEN *)(vd + HN(ofs + sizeof(TYPEN))) = OP(lhs, rhs, SHIFT);     \
        ofs += sizeof(TYPEW);                                               \
    }                                                                       \
}
2264 
/*
 * Add/subtract and keep the high half: compute N + M (or N - M) and shift
 * the result right by SH.  The "R" variants first add a rounding constant
 * of 1 << (SH - 1), formed in the type of N.
 *
 * Every parameter is parenthesized so the macros remain correct when an
 * argument is itself an expression (e.g. "a | b" or "m + 1").
 */
#define DO_ADDHN(N, M, SH)  (((N) + (M)) >> (SH))
#define DO_RADDHN(N, M, SH) \
    (((N) + (M) + ((__typeof(N))1 << ((SH) - 1))) >> (SH))
#define DO_SUBHN(N, M, SH)  (((N) - (M)) >> (SH))
#define DO_RSUBHN(N, M, SH) \
    (((N) - (M) + ((__typeof(N))1 << ((SH) - 1))) >> (SH))
2269 
/*
 * Add and keep the high half.  The SHIFT argument is half the width of
 * the wide element, so the narrow result is the upper half of the sum.
 */
DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)

DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)

/* As above, with the rounding constant added before the shift.  */
DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)

DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)

/* Subtract and keep the high half.  */
DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)

DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)

/* As above, with the rounding constant added before the shift.  */
DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)

DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2301 
2302 #undef DO_RSUBHN
2303 #undef DO_SUBHN
2304 #undef DO_RADDHN
2305 #undef DO_ADDHN
2306 
2307 #undef DO_BINOPNB
2308 
/*
 * Fully general four-operand expander, controlled by a predicate.
 *
 * The vector is walked in 16-byte chunks.  For each chunk a 16-bit
 * predicate word is fetched from VG (one predicate bit per vector byte).
 * Within the chunk the low bit of the word decides whether the current
 * element is active, after which the word is shifted down by the element
 * size.  Active elements get OP(a, n, m); inactive elements of VD are
 * left unchanged.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t ofs = 0, total = simd_oprsz(desc);               \
    while (ofs < total) {                                     \
        uint16_t bits = *(uint16_t *)(vg + H1_2(ofs >> 3));   \
        do {                                                  \
            if (bits & 1) {                                   \
                TYPE e_n = *(TYPE *)(vn + H(ofs));            \
                TYPE e_m = *(TYPE *)(vm + H(ofs));            \
                TYPE e_a = *(TYPE *)(va + H(ofs));            \
                *(TYPE *)(vd + H(ofs)) = OP(e_a, e_n, e_m);   \
            }                                                 \
            ofs += sizeof(TYPE);                              \
            bits >>= sizeof(TYPE);                            \
        } while (ofs & 15);                                   \
    }                                                         \
}
2329 
/*
 * Four-operand predicated expander specialized for 64-bit elements.
 * Each element has its own predicate byte in VG, of which only bit 0 is
 * examined.  Inactive elements of VD are left unchanged.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t e, nelem = simd_oprsz(desc) / 8;                 \
    TYPE *dst = vd, *acc = va, *lhs = vn, *rhs = vm;          \
    uint8_t *pred = vg;                                       \
    for (e = 0; e < nelem; e++) {                             \
        TYPE e_a, e_n, e_m;                                   \
        if (!(pred[H1(e)] & 1)) {                             \
            continue;                                         \
        }                                                     \
        e_a = acc[e];                                         \
        e_n = lhs[e];                                         \
        e_m = rhs[e];                                         \
        dst[e] = OP(e_a, e_n, e_m);                           \
    }                                                         \
}
2345 
/*
 * Multiply-accumulate primitives: A + N * M and A - N * M.
 * Parameters are parenthesized so that expression arguments keep their
 * own grouping instead of being re-associated with the multiplication.
 */
#define DO_MLA(A, N, M)  ((A) + (N) * (M))
#define DO_MLS(A, N, M)  ((A) - (N) * (M))
2348 
/*
 * Predicated multiply-add (DO_MLA) and multiply-subtract (DO_MLS)
 * helpers for 8-, 16- and 32-bit elements, expanded by DO_ZPZZZ with the
 * matching host-endian index macro.
 */
DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

/* 64-bit elements use the specialized expander.  */
DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2360 
2361 #undef DO_MLA
2362 #undef DO_MLS
2363 #undef DO_ZPZZZ
2364 #undef DO_ZPZZZ_D
2365 
2366 void HELPER(sve_index_b)(void *vd, uint32_t start,
2367                          uint32_t incr, uint32_t desc)
2368 {
2369     intptr_t i, opr_sz = simd_oprsz(desc);
2370     uint8_t *d = vd;
2371     for (i = 0; i < opr_sz; i += 1) {
2372         d[H1(i)] = start + i * incr;
2373     }
2374 }
2375 
2376 void HELPER(sve_index_h)(void *vd, uint32_t start,
2377                          uint32_t incr, uint32_t desc)
2378 {
2379     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2380     uint16_t *d = vd;
2381     for (i = 0; i < opr_sz; i += 1) {
2382         d[H2(i)] = start + i * incr;
2383     }
2384 }
2385 
2386 void HELPER(sve_index_s)(void *vd, uint32_t start,
2387                          uint32_t incr, uint32_t desc)
2388 {
2389     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2390     uint32_t *d = vd;
2391     for (i = 0; i < opr_sz; i += 1) {
2392         d[H4(i)] = start + i * incr;
2393     }
2394 }
2395 
2396 void HELPER(sve_index_d)(void *vd, uint64_t start,
2397                          uint64_t incr, uint32_t desc)
2398 {
2399     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2400     uint64_t *d = vd;
2401     for (i = 0; i < opr_sz; i += 1) {
2402         d[i] = start + i * incr;
2403     }
2404 }
2405 
2406 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2407 {
2408     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2409     uint32_t sh = simd_data(desc);
2410     uint32_t *d = vd, *n = vn, *m = vm;
2411     for (i = 0; i < opr_sz; i += 1) {
2412         d[i] = n[i] + (m[i] << sh);
2413     }
2414 }
2415 
2416 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2417 {
2418     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2419     uint64_t sh = simd_data(desc);
2420     uint64_t *d = vd, *n = vn, *m = vm;
2421     for (i = 0; i < opr_sz; i += 1) {
2422         d[i] = n[i] + (m[i] << sh);
2423     }
2424 }
2425 
2426 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2427 {
2428     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2429     uint64_t sh = simd_data(desc);
2430     uint64_t *d = vd, *n = vn, *m = vm;
2431     for (i = 0; i < opr_sz; i += 1) {
2432         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2433     }
2434 }
2435 
2436 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2437 {
2438     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2439     uint64_t sh = simd_data(desc);
2440     uint64_t *d = vd, *n = vn, *m = vm;
2441     for (i = 0; i < opr_sz; i += 1) {
2442         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2443     }
2444 }
2445 
2446 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2447 {
2448     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2449     static const uint16_t coeff[] = {
2450         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2451         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2452         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2453         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2454     };
2455     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2456     uint16_t *d = vd, *n = vn;
2457 
2458     for (i = 0; i < opr_sz; i++) {
2459         uint16_t nn = n[i];
2460         intptr_t idx = extract32(nn, 0, 5);
2461         uint16_t exp = extract32(nn, 5, 5);
2462         d[i] = coeff[idx] | (exp << 10);
2463     }
2464 }
2465 
2466 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2467 {
2468     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2469     static const uint32_t coeff[] = {
2470         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2471         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2472         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2473         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2474         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2475         0x1ef532, 0x20b051, 0x227043, 0x243516,
2476         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2477         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2478         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2479         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2480         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2481         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2482         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2483         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2484         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2485         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2486     };
2487     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2488     uint32_t *d = vd, *n = vn;
2489 
2490     for (i = 0; i < opr_sz; i++) {
2491         uint32_t nn = n[i];
2492         intptr_t idx = extract32(nn, 0, 6);
2493         uint32_t exp = extract32(nn, 6, 8);
2494         d[i] = coeff[idx] | (exp << 23);
2495     }
2496 }
2497 
2498 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2499 {
2500     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2501     static const uint64_t coeff[] = {
2502         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2503         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2504         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2505         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2506         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2507         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2508         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2509         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2510         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2511         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2512         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2513         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2514         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2515         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2516         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2517         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2518         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2519         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2520         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2521         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2522         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2523         0xFA7C1819E90D8ull,
2524     };
2525     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2526     uint64_t *d = vd, *n = vn;
2527 
2528     for (i = 0; i < opr_sz; i++) {
2529         uint64_t nn = n[i];
2530         intptr_t idx = extract32(nn, 0, 6);
2531         uint64_t exp = extract32(nn, 6, 11);
2532         d[i] = coeff[idx] | (exp << 52);
2533     }
2534 }
2535 
2536 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2537 {
2538     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2539     uint16_t *d = vd, *n = vn, *m = vm;
2540     for (i = 0; i < opr_sz; i += 1) {
2541         uint16_t nn = n[i];
2542         uint16_t mm = m[i];
2543         if (mm & 1) {
2544             nn = float16_one;
2545         }
2546         d[i] = nn ^ (mm & 2) << 14;
2547     }
2548 }
2549 
2550 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2551 {
2552     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2553     uint32_t *d = vd, *n = vn, *m = vm;
2554     for (i = 0; i < opr_sz; i += 1) {
2555         uint32_t nn = n[i];
2556         uint32_t mm = m[i];
2557         if (mm & 1) {
2558             nn = float32_one;
2559         }
2560         d[i] = nn ^ (mm & 2) << 30;
2561     }
2562 }
2563 
2564 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2565 {
2566     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2567     uint64_t *d = vd, *n = vn, *m = vm;
2568     for (i = 0; i < opr_sz; i += 1) {
2569         uint64_t nn = n[i];
2570         uint64_t mm = m[i];
2571         if (mm & 1) {
2572             nn = float64_one;
2573         }
2574         d[i] = nn ^ (mm & 2) << 62;
2575     }
2576 }
2577 
2578 /*
2579  * Signed saturating addition with scalar operand.
2580  */
2581 
2582 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2583 {
2584     intptr_t i, oprsz = simd_oprsz(desc);
2585 
2586     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2587         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2588     }
2589 }
2590 
2591 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2592 {
2593     intptr_t i, oprsz = simd_oprsz(desc);
2594 
2595     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2596         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2597     }
2598 }
2599 
2600 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2601 {
2602     intptr_t i, oprsz = simd_oprsz(desc);
2603 
2604     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2605         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2606     }
2607 }
2608 
2609 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2610 {
2611     intptr_t i, oprsz = simd_oprsz(desc);
2612 
2613     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2614         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2615     }
2616 }
2617 
2618 /*
2619  * Unsigned saturating addition with scalar operand.
2620  */
2621 
2622 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2623 {
2624     intptr_t i, oprsz = simd_oprsz(desc);
2625 
2626     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2627         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2628     }
2629 }
2630 
2631 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2632 {
2633     intptr_t i, oprsz = simd_oprsz(desc);
2634 
2635     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2636         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2637     }
2638 }
2639 
2640 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2641 {
2642     intptr_t i, oprsz = simd_oprsz(desc);
2643 
2644     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2645         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2646     }
2647 }
2648 
2649 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2650 {
2651     intptr_t i, oprsz = simd_oprsz(desc);
2652 
2653     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2654         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2655     }
2656 }
2657 
2658 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2659 {
2660     intptr_t i, oprsz = simd_oprsz(desc);
2661 
2662     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2663         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2664     }
2665 }
2666 
2667 /* Two operand predicated copy immediate with merge.  All valid immediates
2668  * can fit within 17 signed bits in the simd_data field.
2669  */
2670 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2671                          uint64_t mm, uint32_t desc)
2672 {
2673     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2674     uint64_t *d = vd, *n = vn;
2675     uint8_t *pg = vg;
2676 
2677     mm = dup_const(MO_8, mm);
2678     for (i = 0; i < opr_sz; i += 1) {
2679         uint64_t nn = n[i];
2680         uint64_t pp = expand_pred_b(pg[H1(i)]);
2681         d[i] = (mm & pp) | (nn & ~pp);
2682     }
2683 }
2684 
2685 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2686                          uint64_t mm, uint32_t desc)
2687 {
2688     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2689     uint64_t *d = vd, *n = vn;
2690     uint8_t *pg = vg;
2691 
2692     mm = dup_const(MO_16, mm);
2693     for (i = 0; i < opr_sz; i += 1) {
2694         uint64_t nn = n[i];
2695         uint64_t pp = expand_pred_h(pg[H1(i)]);
2696         d[i] = (mm & pp) | (nn & ~pp);
2697     }
2698 }
2699 
2700 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2701                          uint64_t mm, uint32_t desc)
2702 {
2703     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2704     uint64_t *d = vd, *n = vn;
2705     uint8_t *pg = vg;
2706 
2707     mm = dup_const(MO_32, mm);
2708     for (i = 0; i < opr_sz; i += 1) {
2709         uint64_t nn = n[i];
2710         uint64_t pp = expand_pred_s(pg[H1(i)]);
2711         d[i] = (mm & pp) | (nn & ~pp);
2712     }
2713 }
2714 
2715 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2716                          uint64_t mm, uint32_t desc)
2717 {
2718     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2719     uint64_t *d = vd, *n = vn;
2720     uint8_t *pg = vg;
2721 
2722     for (i = 0; i < opr_sz; i += 1) {
2723         uint64_t nn = n[i];
2724         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2725     }
2726 }
2727 
2728 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2729 {
2730     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2731     uint64_t *d = vd;
2732     uint8_t *pg = vg;
2733 
2734     val = dup_const(MO_8, val);
2735     for (i = 0; i < opr_sz; i += 1) {
2736         d[i] = val & expand_pred_b(pg[H1(i)]);
2737     }
2738 }
2739 
2740 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2741 {
2742     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2743     uint64_t *d = vd;
2744     uint8_t *pg = vg;
2745 
2746     val = dup_const(MO_16, val);
2747     for (i = 0; i < opr_sz; i += 1) {
2748         d[i] = val & expand_pred_h(pg[H1(i)]);
2749     }
2750 }
2751 
2752 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2753 {
2754     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2755     uint64_t *d = vd;
2756     uint8_t *pg = vg;
2757 
2758     val = dup_const(MO_32, val);
2759     for (i = 0; i < opr_sz; i += 1) {
2760         d[i] = val & expand_pred_s(pg[H1(i)]);
2761     }
2762 }
2763 
2764 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2765 {
2766     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2767     uint64_t *d = vd;
2768     uint8_t *pg = vg;
2769 
2770     for (i = 0; i < opr_sz; i += 1) {
2771         d[i] = (pg[H1(i)] & 1 ? val : 0);
2772     }
2773 }
2774 
/*
 * memmove() for vector register storage.
 *
 * Big-endian hosts need to adjust the byte indices with the H1* macros.
 * When destination, source and length are all 8-byte aligned no
 * adjustment is needed and plain memmove() is used; otherwise the copy
 * is done element by element at the widest granularity (4, 2 or 1 bytes)
 * that the combined alignment allows.  On little-endian hosts the plain
 * memmove() path is always taken.
 *
 * Overlap is handled as memmove() does: the copy runs downwards when the
 * destination starts inside [src, src + n), and upwards otherwise.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t dst = (uintptr_t)vd;
    uintptr_t src = (uintptr_t)vs;
    uintptr_t misalign = (dst | src | n) & 7;
    size_t pos;

#if !HOST_BIG_ENDIAN
    misalign = 0;
#endif
    switch (misalign) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        if (dst >= src && dst < src + n) {
            pos = n;
            while (pos != 0) {
                pos -= 4;
                *(uint32_t *)H1_4(dst + pos) = *(uint32_t *)H1_4(src + pos);
            }
        } else {
            for (pos = 0; pos < n; pos += 4) {
                *(uint32_t *)H1_4(dst + pos) = *(uint32_t *)H1_4(src + pos);
            }
        }
        break;

    case 2:
    case 6:
        if (dst >= src && dst < src + n) {
            pos = n;
            while (pos != 0) {
                pos -= 2;
                *(uint16_t *)H1_2(dst + pos) = *(uint16_t *)H1_2(src + pos);
            }
        } else {
            for (pos = 0; pos < n; pos += 2) {
                *(uint16_t *)H1_2(dst + pos) = *(uint16_t *)H1_2(src + pos);
            }
        }
        break;

    default:
        if (dst >= src && dst < src + n) {
            pos = n;
            while (pos != 0) {
                pos -= 1;
                *(uint8_t *)H1(dst + pos) = *(uint8_t *)H1(src + pos);
            }
        } else {
            for (pos = 0; pos < n; pos++) {
                *(uint8_t *)H1(dst + pos) = *(uint8_t *)H1(src + pos);
            }
        }
        break;
    }
}
2834 
/*
 * Zero N bytes of vector register storage at VD.  As with swap_memmove,
 * big-endian hosts must adjust indices with the H1* macros unless both
 * the address and the length are 8-byte aligned, in which case memset()
 * is used directly.  On little-endian hosts memset() is always used.
 */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t dst = (uintptr_t)vd;
    uintptr_t misalign = (dst | n) & 7;
    size_t pos;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

#if !HOST_BIG_ENDIAN
    misalign = 0;
#endif
    switch (misalign) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        pos = 0;
        while (pos < n) {
            *(uint32_t *)H1_4(dst + pos) = 0;
            pos += 4;
        }
        break;

    case 2:
    case 6:
        pos = 0;
        while (pos < n) {
            *(uint16_t *)H1_2(dst + pos) = 0;
            pos += 2;
        }
        break;

    default:
        pos = 0;
        while (pos < n) {
            *(uint8_t *)H1(dst + pos) = 0;
            pos++;
        }
        break;
    }
}
2875 
2876 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2877 {
2878     intptr_t opr_sz = simd_oprsz(desc);
2879     size_t n_ofs = simd_data(desc);
2880     size_t n_siz = opr_sz - n_ofs;
2881 
2882     if (vd != vm) {
2883         swap_memmove(vd, vn + n_ofs, n_siz);
2884         swap_memmove(vd + n_siz, vm, n_ofs);
2885     } else if (vd != vn) {
2886         swap_memmove(vd + n_siz, vd, n_ofs);
2887         swap_memmove(vd, vn + n_ofs, n_siz);
2888     } else {
2889         /* vd == vn == vm.  Need temp space.  */
2890         ARMVectorReg tmp;
2891         swap_memmove(&tmp, vm, n_ofs);
2892         swap_memmove(vd, vd + n_ofs, n_siz);
2893         memcpy(vd + n_siz, &tmp, n_ofs);
2894     }
2895 }
2896 
2897 #define DO_INSR(NAME, TYPE, H) \
2898 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2899 {                                                                  \
2900     intptr_t opr_sz = simd_oprsz(desc);                            \
2901     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2902     *(TYPE *)(vd + H(0)) = val;                                    \
2903 }
2904 
2905 DO_INSR(sve_insr_b, uint8_t, H1)
2906 DO_INSR(sve_insr_h, uint16_t, H1_2)
2907 DO_INSR(sve_insr_s, uint32_t, H1_4)
2908 DO_INSR(sve_insr_d, uint64_t, H1_8)
2909 
2910 #undef DO_INSR
2911 
2912 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2913 {
2914     intptr_t i, j, opr_sz = simd_oprsz(desc);
2915     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2916         uint64_t f = *(uint64_t *)(vn + i);
2917         uint64_t b = *(uint64_t *)(vn + j);
2918         *(uint64_t *)(vd + i) = bswap64(b);
2919         *(uint64_t *)(vd + j) = bswap64(f);
2920     }
2921 }
2922 
2923 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2924 {
2925     intptr_t i, j, opr_sz = simd_oprsz(desc);
2926     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2927         uint64_t f = *(uint64_t *)(vn + i);
2928         uint64_t b = *(uint64_t *)(vn + j);
2929         *(uint64_t *)(vd + i) = hswap64(b);
2930         *(uint64_t *)(vd + j) = hswap64(f);
2931     }
2932 }
2933 
2934 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2935 {
2936     intptr_t i, j, opr_sz = simd_oprsz(desc);
2937     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2938         uint64_t f = *(uint64_t *)(vn + i);
2939         uint64_t b = *(uint64_t *)(vn + j);
2940         *(uint64_t *)(vd + i) = rol64(b, 32);
2941         *(uint64_t *)(vd + j) = rol64(f, 32);
2942     }
2943 }
2944 
2945 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2946 {
2947     intptr_t i, j, opr_sz = simd_oprsz(desc);
2948     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2949         uint64_t f = *(uint64_t *)(vn + i);
2950         uint64_t b = *(uint64_t *)(vn + j);
2951         *(uint64_t *)(vd + i) = b;
2952         *(uint64_t *)(vd + j) = f;
2953     }
2954 }
2955 
2956 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2957 
2958 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2959                            bool is_tbx, tb_impl_fn *fn)
2960 {
2961     ARMVectorReg scratch;
2962     uintptr_t oprsz = simd_oprsz(desc);
2963 
2964     if (unlikely(vd == vn)) {
2965         vn = memcpy(&scratch, vn, oprsz);
2966     }
2967 
2968     fn(vd, vn, NULL, vm, oprsz, is_tbx);
2969 }
2970 
2971 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2972                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2973 {
2974     ARMVectorReg scratch;
2975     uintptr_t oprsz = simd_oprsz(desc);
2976 
2977     if (unlikely(vd == vn0)) {
2978         vn0 = memcpy(&scratch, vn0, oprsz);
2979         if (vd == vn1) {
2980             vn1 = vn0;
2981         }
2982     } else if (unlikely(vd == vn1)) {
2983         vn1 = memcpy(&scratch, vn1, oprsz);
2984     }
2985 
2986     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2987 }
2988 
2989 #define DO_TB(SUFF, TYPE, H)                                            \
2990 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
2991                                 void *vm, uintptr_t oprsz, bool is_tbx) \
2992 {                                                                       \
2993     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
2994     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
2995     for (i = 0; i < nelem; ++i) {                                       \
2996         TYPE index = indexes[H1(i)], val = 0;                           \
2997         if (index < nelem) {                                            \
2998             val = tbl0[H(index)];                                       \
2999         } else {                                                        \
3000             index -= nelem;                                             \
3001             if (tbl1 && index < nelem) {                                \
3002                 val = tbl1[H(index)];                                   \
3003             } else if (is_tbx) {                                        \
3004                 continue;                                               \
3005             }                                                           \
3006         }                                                               \
3007         d[H(i)] = val;                                                  \
3008     }                                                                   \
3009 }                                                                       \
3010 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3011 {                                                                       \
3012     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3013 }                                                                       \
3014 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3015                              void *vm, uint32_t desc)                   \
3016 {                                                                       \
3017     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3018 }                                                                       \
3019 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3020 {                                                                       \
3021     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3022 }
3023 
3024 DO_TB(b, uint8_t, H1)
3025 DO_TB(h, uint16_t, H2)
3026 DO_TB(s, uint32_t, H4)
3027 DO_TB(d, uint64_t, H8)
3028 
3029 #undef DO_TB
3030 
3031 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3032 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3033 {                                                              \
3034     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3035     TYPED *d = vd;                                             \
3036     TYPES *n = vn;                                             \
3037     ARMVectorReg tmp;                                          \
3038     if (unlikely(vn - vd < opr_sz)) {                          \
3039         n = memcpy(&tmp, n, opr_sz / 2);                       \
3040     }                                                          \
3041     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3042         d[HD(i)] = n[HS(i)];                                   \
3043     }                                                          \
3044 }
3045 
3046 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3047 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3048 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3049 
3050 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3051 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3052 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3053 
3054 #undef DO_UNPK
3055 
/* Mask of bits included in the even numbered predicates of width esz.
 * We also use this for expand_bits/compress_bits, and so extend the
 * same pattern out to 16-bit units.
 *
 * Entry K selects every other unit of 2**K bits, starting with the
 * least significant unit.
 */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,  /* [0] alternate single bits */
    0x3333333333333333ull,  /* [1] alternate 2-bit units */
    0x0f0f0f0f0f0f0f0full,  /* [2] alternate 4-bit units */
    0x00ff00ff00ff00ffull,  /* [3] alternate 8-bit units */
    0x0000ffff0000ffffull,  /* [4] alternate 16-bit units */
};
3067 
3068 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3069  * For N==0, this corresponds to the operation that in qemu/bitops.h
3070  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3071  * section 7-2 Shuffling Bits.
3072  */
3073 static uint64_t expand_bits(uint64_t x, int n)
3074 {
3075     int i;
3076 
3077     x &= 0xffffffffu;
3078     for (i = 4; i >= n; i--) {
3079         int sh = 1 << i;
3080         x = ((x << sh) | x) & even_bit_esz_masks[i];
3081     }
3082     return x;
3083 }
3084 
3085 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3086  * For N==0, this corresponds to the operation that in qemu/bitops.h
3087  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3088  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3089  */
3090 static uint64_t compress_bits(uint64_t x, int n)
3091 {
3092     int i;
3093 
3094     for (i = n; i <= 4; i++) {
3095         int sh = 1 << i;
3096         x &= even_bit_esz_masks[i];
3097         x = (x >> sh) | x;
3098     }
3099     return x & 0xffffffffu;
3100 }
3101 
3102 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3103 {
3104     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3105     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3106     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3107     int esize = 1 << esz;
3108     uint64_t *d = vd;
3109     intptr_t i;
3110 
3111     if (oprsz <= 8) {
3112         uint64_t nn = *(uint64_t *)vn;
3113         uint64_t mm = *(uint64_t *)vm;
3114         int half = 4 * oprsz;
3115 
3116         nn = extract64(nn, high * half, half);
3117         mm = extract64(mm, high * half, half);
3118         nn = expand_bits(nn, esz);
3119         mm = expand_bits(mm, esz);
3120         d[0] = nn | (mm << esize);
3121     } else {
3122         ARMPredicateReg tmp;
3123 
3124         /* We produce output faster than we consume input.
3125            Therefore we must be mindful of possible overlap.  */
3126         if (vd == vn) {
3127             vn = memcpy(&tmp, vn, oprsz);
3128             if (vd == vm) {
3129                 vm = vn;
3130             }
3131         } else if (vd == vm) {
3132             vm = memcpy(&tmp, vm, oprsz);
3133         }
3134         if (high) {
3135             high = oprsz >> 1;
3136         }
3137 
3138         if ((oprsz & 7) == 0) {
3139             uint32_t *n = vn, *m = vm;
3140             high >>= 2;
3141 
3142             for (i = 0; i < oprsz / 8; i++) {
3143                 uint64_t nn = n[H4(high + i)];
3144                 uint64_t mm = m[H4(high + i)];
3145 
3146                 nn = expand_bits(nn, esz);
3147                 mm = expand_bits(mm, esz);
3148                 d[i] = nn | (mm << esize);
3149             }
3150         } else {
3151             uint8_t *n = vn, *m = vm;
3152             uint16_t *d16 = vd;
3153 
3154             for (i = 0; i < oprsz / 2; i++) {
3155                 uint16_t nn = n[H1(high + i)];
3156                 uint16_t mm = m[H1(high + i)];
3157 
3158                 nn = expand_bits(nn, esz);
3159                 mm = expand_bits(mm, esz);
3160                 d16[H2(i)] = nn | (mm << esize);
3161             }
3162         }
3163     }
3164 }
3165 
/*
 * Predicate UZP: keep every other 2**ESZ-bit unit of VN, then of VM, and
 * concatenate the two compressed halves into VD.
 *
 * pred_desc carries the predicate size in bytes (OPRSZ), the log2 element
 * size (ESZ) and a flag (DATA) selecting the odd units instead of the
 * even ones; the flag is pre-scaled here into a right-shift count.
 */
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        /* Single word: both inputs are read before d[0] is written. */
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = l | (h << (4 * oprsz));
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        /*
         * The low half of D is written from N before M is read, so M
         * must be copied aside if it overlaps the destination.
         */
        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        /*
         * Each pair of input words compresses into one output word.
         * n[2i] and n[2i + 1] are loaded before d[i] is stored, so this
         * is safe even when VD and VN are the same register.
         */
        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << 32);
        }

        /*
         * For VL which is not a multiple of 512, the results from M do not
         * align nicely with the uint64_t for D.  Put the aligned results
         * from M into TMP_M and then copy it into place afterward.
         */
        if (oprsz & 15) {
            /* Width in bits of the compressed low word of the tail pair. */
            int final_shift = (oprsz & 15) * 2;

            /* Tail of N; i == oprsz_16 after the loop above. */
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << final_shift);

            /*
             * M may already point at TMP_M; the two words of each pair
             * are loaded before tmp_m.p[i] is stored, so compressing in
             * place is fine.
             */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l | (h << 32);
            }
            l = m[2 * i + 0];
            h = m[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            tmp_m.p[i] = l | (h << final_shift);

            /* Place the M results in the upper half of D. */
            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            /* Word-aligned case: M results go straight into upper D. */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l | (h << 32);
            }
        }
    }
}
3234 
3235 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3236 {
3237     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3238     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3239     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3240     uint64_t *d = vd, *n = vn, *m = vm;
3241     uint64_t mask;
3242     int shr, shl;
3243     intptr_t i;
3244 
3245     shl = 1 << esz;
3246     shr = 0;
3247     mask = even_bit_esz_masks[esz];
3248     if (odd) {
3249         mask <<= shl;
3250         shr = shl;
3251         shl = 0;
3252     }
3253 
3254     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3255         uint64_t nn = (n[i] & mask) >> shr;
3256         uint64_t mm = (m[i] & mask) << shl;
3257         d[i] = nn + mm;
3258     }
3259 }
3260 
3261 /* Reverse units of 2**N bits.  */
3262 static uint64_t reverse_bits_64(uint64_t x, int n)
3263 {
3264     int i, sh;
3265 
3266     x = bswap64(x);
3267     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3268         uint64_t mask = even_bit_esz_masks[i];
3269         x = ((x & mask) << sh) | ((x >> sh) & mask);
3270     }
3271     return x;
3272 }
3273 
/*
 * Reverse the order of the 2**N-bit units within a single byte.
 *
 * Swaps adjacent nibbles, then bit pairs, then single bits, stopping
 * once units of 2**N bits have been exchanged.  For N >= 3 the byte is
 * returned unchanged.
 */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int level = 2;
    int sh = 4;

    while (level >= n) {
        uint8_t keep = mask[level];
        uint8_t moved_up = (x & keep) << sh;
        uint8_t moved_down = (x >> sh) & keep;

        x = moved_up | moved_down;
        level--;
        sh >>= 1;
    }
    return x;
}
3284 
3285 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3286 {
3287     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3288     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3289     intptr_t i, oprsz_2 = oprsz / 2;
3290 
3291     if (oprsz <= 8) {
3292         uint64_t l = *(uint64_t *)vn;
3293         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3294         *(uint64_t *)vd = l;
3295     } else if ((oprsz & 15) == 0) {
3296         for (i = 0; i < oprsz_2; i += 8) {
3297             intptr_t ih = oprsz - 8 - i;
3298             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3299             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3300             *(uint64_t *)(vd + i) = h;
3301             *(uint64_t *)(vd + ih) = l;
3302         }
3303     } else {
3304         for (i = 0; i < oprsz_2; i += 1) {
3305             intptr_t il = H1(i);
3306             intptr_t ih = H1(oprsz - 1 - i);
3307             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3308             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3309             *(uint8_t *)(vd + il) = h;
3310             *(uint8_t *)(vd + ih) = l;
3311         }
3312     }
3313 }
3314 
3315 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3316 {
3317     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3318     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3319     uint64_t *d = vd;
3320     intptr_t i;
3321 
3322     if (oprsz <= 8) {
3323         uint64_t nn = *(uint64_t *)vn;
3324         int half = 4 * oprsz;
3325 
3326         nn = extract64(nn, high * half, half);
3327         nn = expand_bits(nn, 0);
3328         d[0] = nn;
3329     } else {
3330         ARMPredicateReg tmp_n;
3331 
3332         /* We produce output faster than we consume input.
3333            Therefore we must be mindful of possible overlap.  */
3334         if ((vn - vd) < (uintptr_t)oprsz) {
3335             vn = memcpy(&tmp_n, vn, oprsz);
3336         }
3337         if (high) {
3338             high = oprsz >> 1;
3339         }
3340 
3341         if ((oprsz & 7) == 0) {
3342             uint32_t *n = vn;
3343             high >>= 2;
3344 
3345             for (i = 0; i < oprsz / 8; i++) {
3346                 uint64_t nn = n[H4(high + i)];
3347                 d[i] = expand_bits(nn, 0);
3348             }
3349         } else {
3350             uint16_t *d16 = vd;
3351             uint8_t *n = vn;
3352 
3353             for (i = 0; i < oprsz / 2; i++) {
3354                 uint16_t nn = n[H1(high + i)];
3355                 d16[H2(i)] = expand_bits(nn, 0);
3356             }
3357         }
3358     }
3359 }
3360 
3361 #define DO_ZIP(NAME, TYPE, H) \
3362 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3363 {                                                                    \
3364     intptr_t oprsz = simd_oprsz(desc);                               \
3365     intptr_t odd_ofs = simd_data(desc);                              \
3366     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3367     ARMVectorReg tmp_n, tmp_m;                                       \
3368     /* We produce output faster than we consume input.               \
3369        Therefore we must be mindful of possible overlap.  */         \
3370     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3371         vn = memcpy(&tmp_n, vn, oprsz);                              \
3372     }                                                                \
3373     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3374         vm = memcpy(&tmp_m, vm, oprsz);                              \
3375     }                                                                \
3376     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3377         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3378         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3379             *(TYPE *)(vm + odd_ofs + H(i));                          \
3380     }                                                                \
3381     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3382         memset(vd + oprsz - 16, 0, 16);                              \
3383     }                                                                \
3384 }
3385 
3386 DO_ZIP(sve_zip_b, uint8_t, H1)
3387 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3388 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3389 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3390 DO_ZIP(sve2_zip_q, Int128, )
3391 
/*
 * DO_UZP(NAME, TYPE, H) defines HELPER(NAME), which fills VD with every
 * other TYPE element of VN followed by every other TYPE element of VM.
 * simd_data(desc) is the byte offset of the first element taken, so it
 * selects which of the two interleaved sets is kept.
 *
 * I is the destination byte offset and advances by one element per
 * step; P is the source byte offset and advances by two.  When P runs
 * off the end of VN, subtracting oprsz carries the remainder over as the
 * starting offset within VM.
 *
 * Only VM is copied aside on overlap with the destination.
 * NOTE(review): VN is not copied; this looks safe for vd == vn because
 * the read offset P never falls behind the write offset I and each
 * element is loaded before the store - confirm.
 */
#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
{                                                                      \
    intptr_t oprsz = simd_oprsz(desc);                                 \
    intptr_t odd_ofs = simd_data(desc);                                \
    intptr_t i, p;                                                     \
    ARMVectorReg tmp_m;                                                \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
        vm = memcpy(&tmp_m, vm, oprsz);                                \
    }                                                                  \
    i = 0, p = odd_ofs;                                                \
    /* First part of the result: selected elements of VN.  */          \
    do {                                                               \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
    } while (p < oprsz);                                               \
    /* Carry the overshoot into VM as its starting offset.  */         \
    p -= oprsz;                                                        \
    /* Remainder of the result: selected elements of VM.  */           \
    do {                                                               \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
    } while (p < oprsz);                                               \
    /* The two passes together must fill the destination exactly.  */  \
    tcg_debug_assert(i == oprsz);                                      \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, H1_8)
DO_UZP(sve2_uzp_q, Int128, )
3420 
3421 #define DO_TRN(NAME, TYPE, H) \
3422 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3423 {                                                                      \
3424     intptr_t oprsz = simd_oprsz(desc);                                 \
3425     intptr_t odd_ofs = simd_data(desc);                                \
3426     intptr_t i;                                                        \
3427     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3428         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3429         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3430         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3431         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3432     }                                                                  \
3433     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3434         memset(vd + oprsz - 16, 0, 16);                                \
3435     }                                                                  \
3436 }
3437 
3438 DO_TRN(sve_trn_b, uint8_t, H1)
3439 DO_TRN(sve_trn_h, uint16_t, H1_2)
3440 DO_TRN(sve_trn_s, uint32_t, H1_4)
3441 DO_TRN(sve_trn_d, uint64_t, H1_8)
3442 DO_TRN(sve2_trn_q, Int128, )
3443 
3444 #undef DO_ZIP
3445 #undef DO_UZP
3446 #undef DO_TRN
3447 
3448 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3449 {
3450     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3451     uint32_t *d = vd, *n = vn;
3452     uint8_t *pg = vg;
3453 
3454     for (i = j = 0; i < opr_sz; i++) {
3455         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3456             d[H4(j)] = n[H4(i)];
3457             j++;
3458         }
3459     }
3460     for (; j < opr_sz; j++) {
3461         d[H4(j)] = 0;
3462     }
3463 }
3464 
3465 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3466 {
3467     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3468     uint64_t *d = vd, *n = vn;
3469     uint8_t *pg = vg;
3470 
3471     for (i = j = 0; i < opr_sz; i++) {
3472         if (pg[H1(i)] & 1) {
3473             d[j] = n[i];
3474             j++;
3475         }
3476     }
3477     for (; j < opr_sz; j++) {
3478         d[j] = 0;
3479     }
3480 }
3481 
3482 /* Similar to the ARM LastActiveElement pseudocode function, except the
3483  * result is multiplied by the element size.  This includes the not found
3484  * indication; e.g. not found for esz=3 is -8.
3485  */
3486 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3487 {
3488     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3489     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3490 
3491     return last_active_element(vg, words, esz);
3492 }
3493 
/*
 * SPLICE: copy the span of VN running from the first to the last active
 * element (inclusive) to the bottom of VD, then fill the rest of VD with
 * the low bytes of VM.  With no active elements, VD is a copy of VM.
 *
 * Note the units: OPR_SZ is the vector size divided by 8, i.e. the
 * number of bytes of VG that are scanned, while FIRST_I, LAST_I and LEN
 * end up as byte offsets/lengths within the vector.
 */
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Find the extent of the active elements within VG.  */
    /*
     * Scan 64-bit predicate words from the highest down.  The first
     * nonzero word seen is the last active word; every nonzero word
     * overwrites FIRST_*, so the lowest one wins.
     */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        /*
         * Convert (predicate byte offset, bit within word) to a
         * predicate bit index, which is used below as a byte offset
         * into the vector.
         */
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        /* Inclusive span: add the size of the last active element.  */
        len = last_i - first_i + (1 << esz);
        /* VD is about to be overwritten; preserve VM if it aliases VD.  */
        if (vd == vm) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    /* Fill the remainder of VD from the start of VM.  */
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
3530 
3531 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3532                             void *vg, uint32_t desc)
3533 {
3534     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3535     uint64_t *d = vd, *n = vn, *m = vm;
3536     uint8_t *pg = vg;
3537 
3538     for (i = 0; i < opr_sz; i += 1) {
3539         uint64_t nn = n[i], mm = m[i];
3540         uint64_t pp = expand_pred_b(pg[H1(i)]);
3541         d[i] = (nn & pp) | (mm & ~pp);
3542     }
3543 }
3544 
3545 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3546                             void *vg, uint32_t desc)
3547 {
3548     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3549     uint64_t *d = vd, *n = vn, *m = vm;
3550     uint8_t *pg = vg;
3551 
3552     for (i = 0; i < opr_sz; i += 1) {
3553         uint64_t nn = n[i], mm = m[i];
3554         uint64_t pp = expand_pred_h(pg[H1(i)]);
3555         d[i] = (nn & pp) | (mm & ~pp);
3556     }
3557 }
3558 
3559 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3560                             void *vg, uint32_t desc)
3561 {
3562     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3563     uint64_t *d = vd, *n = vn, *m = vm;
3564     uint8_t *pg = vg;
3565 
3566     for (i = 0; i < opr_sz; i += 1) {
3567         uint64_t nn = n[i], mm = m[i];
3568         uint64_t pp = expand_pred_s(pg[H1(i)]);
3569         d[i] = (nn & pp) | (mm & ~pp);
3570     }
3571 }
3572 
3573 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3574                             void *vg, uint32_t desc)
3575 {
3576     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3577     uint64_t *d = vd, *n = vn, *m = vm;
3578     uint8_t *pg = vg;
3579 
3580     for (i = 0; i < opr_sz; i += 1) {
3581         uint64_t nn = n[i], mm = m[i];
3582         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3583     }
3584 }
3585 
3586 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3587                             void *vg, uint32_t desc)
3588 {
3589     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3590     Int128 *d = vd, *n = vn, *m = vm;
3591     uint16_t *pg = vg;
3592 
3593     for (i = 0; i < opr_sz; i += 1) {
3594         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3595     }
3596 }
3597 
/* Two operand comparison controlled by a predicate.
 * ??? It is very tempting to want to be able to expand this inline
 * with x86 instructions, e.g.
 *
 *    vcmpeqw    zm, zn, %ymm0
 *    vpmovmskb  %ymm0, %eax
 *    and        $0x5555, %eax
 *    and        pg, %eax
 *
 * or even aarch64, e.g.
 *
 *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
 *    cmeq       v0.8h, zn, zm
 *    and        v0.8h, v0.8h, mask
 *    addv       h0, v0.8h
 *    and        v0.8b, pg
 *
 * However, coming up with an abstraction that allows vector inputs and
 * a scalar output, and also handles the byte-ordering of sub-uint64_t
 * scalar outputs, is tricky.
 */
/*
 * DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) defines HELPER(NAME), which
 * compares each TYPE element of VN against the matching element of VM
 * with OP and stores one result bit per element into the predicate VD,
 * gated by the governing predicate VG.
 *
 * The vectors are walked backward from byte offset opr_sz in chunks that
 * end on a 64-byte boundary.  Within a chunk, OUT is shifted left by
 * sizeof(TYPE) for every element, so an element's result ends up at the
 * bit whose number equals the element's byte offset within the chunk.
 * MASK keeps only the governing predicate bits at those positions
 * (every bit for B, every 2nd, 4th or 8th bit for H, S, D).
 *
 * Each 64-bit result word is stored to VD at byte offset i / 8 and is
 * also passed to iter_predtest_bwd(); the flags it accumulates, seeded
 * with PREDTEST_INIT, are the return value.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        /* Gather the results for one chunk, highest element first.  */      \
        do {                                                                 \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
            TYPE nn = *(TYPE *)(vn + H(i));                                  \
            TYPE mm = *(TYPE *)(vm + H(i));                                  \
            out |= nn OP mm;                                                 \
        } while (i & 63);                                                    \
        /* Drop results of inactive elements and unused bit positions.  */   \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}

/* Per element size wrappers: select the address swizzle and predicate mask. */
#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

/* Equality is sign-agnostic; the unsigned types are used.  */
DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

/* Signed orderings.  */
DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

/* Unsigned orderings.  */
DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
#undef DO_CMP_PPZZ
3685 
/* Similar, but the second source is "wide".  */
/*
 * DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) defines HELPER(NAME).
 * The second operand is read as one TYPEW value per 8 bytes of VM, and
 * every TYPE element of VN lying in the same 8 bytes is compared against
 * that value.  The comparison "nn OP mm" is evaluated after the usual C
 * conversions between TYPE and TYPEW, so the signedness of the two
 * types determines how the narrow element is extended.
 *
 * As with the two-vector form, the vectors are walked backward in chunks
 * ending on a 64-byte boundary, one result bit is placed at the bit
 * number equal to the element's byte offset within the chunk, MASK and
 * the governing predicate VG gate the result, each 64-bit result word is
 * stored to VD at byte offset i / 8, and the flags accumulated through
 * iter_predtest_bwd() from PREDTEST_INIT are returned.
 */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            /* I is 8-byte aligned here: fetch the wide element below it. */ \
            TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
            /* Compare every narrow element sharing those 8 bytes.  */       \
            do {                                                             \
                i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
                TYPE nn = *(TYPE *)(vn + H(i));                              \
                out |= nn OP mm;                                             \
            } while (i & 7);                                                 \
        } while (i & 63);                                                    \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}

/* Per element size wrappers: select the address swizzle and predicate mask. */
#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

/*
 * For eq/ne the narrow element is a signed type and the wide element is
 * uint64_t, so the narrow value is sign-extended by the conversion
 * before the 64-bit comparison.
 */
DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

/* Signed orderings: signed narrow element against int64_t.  */
DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)

/* Unsigned orderings: unsigned narrow element against uint64_t.  */
DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

/*
 * The lt/le/lo/ls forms need their own helpers here because the two
 * operands have different widths and cannot simply be swapped.
 */
DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)

DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
3762 
/*
 * Predicated compare of each vector element against an immediate.
 *
 * The immediate comes from simd_data(desc) and is converted to TYPE.
 * The vector is walked from the top down, 64 bytes at a time; each
 * element contributes one result bit, shifted into place so that the
 * accumulated word lines up with one predicate bit per vector byte.
 * The result is masked by the governing predicate (restricted to MASK),
 * stored to VD, and folded into the returned PredTest flags.
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
{                                                                    \
    const TYPE imm = simd_data(desc);                                \
    intptr_t off = simd_oprsz(desc);                                 \
    uint32_t flags = PREDTEST_INIT;                                  \
    do {                                                             \
        uint64_t res = 0;                                            \
        uint64_t guard;                                              \
        /* Gather the compare results for one 64-byte segment.  */   \
        do {                                                         \
            off -= sizeof(TYPE);                                     \
            res <<= sizeof(TYPE);                                    \
            TYPE elt = *(TYPE *)(vn + H(off));                       \
            res |= elt OP imm;                                       \
        } while (off & 63);                                          \
        guard = *(uint64_t *)(vg + (off >> 3)) & MASK;               \
        res &= guard;                                                \
        *(uint64_t *)(vd + (off >> 3)) = res;                        \
        flags = iter_predtest_bwd(res, guard, flags);                \
    } while (off > 0);                                               \
    return flags;                                                    \
}
3785 
/*
 * Element-size wrappers around DO_CMP_PPZI.  Each one supplies the
 * element addressing macro and the predicate mask for its element size:
 * every bit for bytes, every 2nd for halfwords, every 4th for words and
 * every 8th for doublewords.
 */
#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

/*
 * In every instantiation below, TYPE fixes both the element width and the
 * signedness of the comparison; the immediate is converted to TYPE too.
 */

/* Equality, on unsigned element types.  */
DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

/* Inequality, on unsigned element types.  */
DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

/* Signed greater-than.  */
DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

/* Signed greater-than-or-equal.  */
DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

/* Unsigned greater-than.  */
DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

/* Unsigned greater-than-or-equal.  */
DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

/* Signed less-than.  */
DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

/* Signed less-than-or-equal.  */
DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

/* Unsigned less-than.  */
DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

/* Unsigned less-than-or-equal.  */
DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

/* The generator macros are not needed past this point.  */
#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
3850 
/*
 * Similar to the ARM LastActive pseudocode function.
 *
 * Scan the governing predicate VG from its highest 64-bit word downwards.
 * At the first non-zero guard word, report whether VD has the bit selected
 * by pow2floor() of that word set (presumably the most significant set
 * guard bit; pow2floor is defined elsewhere).  Return false when VG has
 * no bits set at all.
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t off = QEMU_ALIGN_UP(oprsz, 8);

    while ((off -= 8) >= 0) {
        uint64_t guard = *(uint64_t *)(vg + off);

        if (guard == 0) {
            continue;
        }
        return (*(uint64_t *)(vd + off) & pow2floor(guard)) != 0;
    }
    return false;
}
3864 
/*
 * Compute one 64-bit word of a break mask.
 *
 * On return *RETB holds:
 *   - 0 if a break was already found in an earlier word (BRK true);
 *   - G if no element is both active in G and set in N;
 *   - otherwise all bits up to and including (AFTER) or strictly below
 *     (!AFTER) the first bit set in G & N.
 *
 * Returns true once a break has been found, in this word or earlier.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t active = g & n;    /* guard true, pred true */
    uint64_t first;

    if (brk) {
        /* The break happened in a lower word: nothing remains.  */
        *retb = 0;
        return true;
    }
    if (active == 0) {
        /* For all G, no N are set; break not found.  */
        *retb = g;
        return false;
    }

    /* Isolate the lowest active true bit: the break position.  */
    first = active & -active;
    *retb = after ? (first | (first - 1)) : (first - 1);
    return true;
}
3894 
/*
 * Compute a zeroing BRK: each word of D receives the break mask for the
 * matching words of N and G, restricted to the active bits of G.  The
 * "break found" state is carried from one word to the next.
 */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    const intptr_t nwords = DIV_ROUND_UP(oprsz, 8);
    bool found = false;
    intptr_t w;

    for (w = 0; w < nwords; w++) {
        uint64_t guard = g[w];
        uint64_t mask;

        found = compute_brk(&mask, n[w], guard, found, after);
        d[w] = mask & guard;
    }
}
3909 
3910 /* Likewise, but also compute flags.  */
3911 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3912                                intptr_t oprsz, bool after)
3913 {
3914     uint32_t flags = PREDTEST_INIT;
3915     bool brk = false;
3916     intptr_t i;
3917 
3918     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3919         uint64_t this_b, this_d, this_g = g[i];
3920 
3921         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3922         d[i] = this_d = this_b & this_g;
3923         flags = iter_predtest_fwd(this_d, this_g, flags);
3924     }
3925     return flags;
3926 }
3927 
/*
 * Compute a merging BRK: bits of D that are active in G receive the
 * break mask, while bits that are inactive keep their previous value.
 */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    const intptr_t nwords = DIV_ROUND_UP(oprsz, 8);
    bool found = false;
    intptr_t w;

    for (w = 0; w < nwords; w++) {
        uint64_t guard = g[w];
        uint64_t mask;

        found = compute_brk(&mask, n[w], guard, found, after);
        d[w] = (d[w] & ~guard) | (mask & guard);
    }
}
3942 
3943 /* Likewise, but also compute flags.  */
3944 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3945                                intptr_t oprsz, bool after)
3946 {
3947     uint32_t flags = PREDTEST_INIT;
3948     bool brk = false;
3949     intptr_t i;
3950 
3951     for (i = 0; i < oprsz / 8; ++i) {
3952         uint64_t this_b, this_d = d[i], this_g = g[i];
3953 
3954         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3955         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3956         flags = iter_predtest_fwd(this_d, this_g, flags);
3957     }
3958     return flags;
3959 }
3960 
3961 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3962 {
3963     /* It is quicker to zero the whole predicate than loop on OPRSZ.
3964      * The compiler should turn this into 4 64-bit integer stores.
3965      */
3966     memset(d, 0, sizeof(ARMPredicateReg));
3967     return PREDTEST_INIT;
3968 }
3969 
3970 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3971                        uint32_t pred_desc)
3972 {
3973     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3974     if (last_active_pred(vn, vg, oprsz)) {
3975         compute_brk_z(vd, vm, vg, oprsz, true);
3976     } else {
3977         do_zero(vd, oprsz);
3978     }
3979 }
3980 
3981 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3982                             uint32_t pred_desc)
3983 {
3984     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3985     if (last_active_pred(vn, vg, oprsz)) {
3986         return compute_brks_z(vd, vm, vg, oprsz, true);
3987     } else {
3988         return do_zero(vd, oprsz);
3989     }
3990 }
3991 
3992 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3993                        uint32_t pred_desc)
3994 {
3995     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3996     if (last_active_pred(vn, vg, oprsz)) {
3997         compute_brk_z(vd, vm, vg, oprsz, false);
3998     } else {
3999         do_zero(vd, oprsz);
4000     }
4001 }
4002 
4003 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4004                             uint32_t pred_desc)
4005 {
4006     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4007     if (last_active_pred(vn, vg, oprsz)) {
4008         return compute_brks_z(vd, vm, vg, oprsz, false);
4009     } else {
4010         return do_zero(vd, oprsz);
4011     }
4012 }
4013 
4014 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4015 {
4016     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4017     compute_brk_z(vd, vn, vg, oprsz, true);
4018 }
4019 
4020 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4021 {
4022     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4023     return compute_brks_z(vd, vn, vg, oprsz, true);
4024 }
4025 
4026 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4027 {
4028     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4029     compute_brk_z(vd, vn, vg, oprsz, false);
4030 }
4031 
4032 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4033 {
4034     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4035     return compute_brks_z(vd, vn, vg, oprsz, false);
4036 }
4037 
4038 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4039 {
4040     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4041     compute_brk_m(vd, vn, vg, oprsz, true);
4042 }
4043 
4044 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4045 {
4046     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4047     return compute_brks_m(vd, vn, vg, oprsz, true);
4048 }
4049 
4050 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4051 {
4052     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4053     compute_brk_m(vd, vn, vg, oprsz, false);
4054 }
4055 
4056 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4057 {
4058     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4059     return compute_brks_m(vd, vn, vg, oprsz, false);
4060 }
4061 
4062 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4063 {
4064     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4065     if (!last_active_pred(vn, vg, oprsz)) {
4066         do_zero(vd, oprsz);
4067     }
4068 }
4069 
4070 /* As if PredTest(Ones(PL), D, esz).  */
4071 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4072                               uint64_t esz_mask)
4073 {
4074     uint32_t flags = PREDTEST_INIT;
4075     intptr_t i;
4076 
4077     for (i = 0; i < oprsz / 8; i++) {
4078         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4079     }
4080     if (oprsz & 7) {
4081         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4082         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4083     }
4084     return flags;
4085 }
4086 
4087 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4088 {
4089     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4090     if (last_active_pred(vn, vg, oprsz)) {
4091         return predtest_ones(vd, oprsz, -1);
4092     } else {
4093         return do_zero(vd, oprsz);
4094     }
4095 }
4096 
4097 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4098 {
4099     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4100     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4101     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4102     intptr_t i;
4103 
4104     for (i = 0; i < words; ++i) {
4105         uint64_t t = n[i] & g[i] & mask;
4106         sum += ctpop64(t);
4107     }
4108     return sum;
4109 }
4110 
4111 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4112 {
4113     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4114     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4115     uint64_t esz_mask = pred_esz_masks[esz];
4116     ARMPredicateReg *d = vd;
4117     uint32_t flags;
4118     intptr_t i;
4119 
4120     /* Begin with a zero predicate register.  */
4121     flags = do_zero(d, oprsz);
4122     if (count == 0) {
4123         return flags;
4124     }
4125 
4126     /* Set all of the requested bits.  */
4127     for (i = 0; i < count / 64; ++i) {
4128         d->p[i] = esz_mask;
4129     }
4130     if (count & 63) {
4131         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4132     }
4133 
4134     return predtest_ones(d, oprsz, esz_mask);
4135 }
4136 
4137 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4138 {
4139     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4140     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4141     uint64_t esz_mask = pred_esz_masks[esz];
4142     ARMPredicateReg *d = vd;
4143     intptr_t i, invcount, oprbits;
4144     uint64_t bits;
4145 
4146     if (count == 0) {
4147         return do_zero(d, oprsz);
4148     }
4149 
4150     oprbits = oprsz * 8;
4151     tcg_debug_assert(count <= oprbits);
4152 
4153     bits = esz_mask;
4154     if (oprbits & 63) {
4155         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4156     }
4157 
4158     invcount = oprbits - count;
4159     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4160         d->p[i] = bits;
4161         bits = esz_mask;
4162     }
4163 
4164     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4165 
4166     while (--i >= 0) {
4167         d->p[i] = 0;
4168     }
4169 
4170     return predtest_ones(d, oprsz, esz_mask);
4171 }
4172 
/* Recursive reduction on a function;
 * C.f. the ARM ARM function ReducePredicated.
 *
 * While it would be possible to write this without the DATA temporary,
 * it is much simpler to process the predicate register this way.
 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
 * little to gain with a more complex non-recursive form.
 *
 * DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) defines two functions:
 *
 *   NAME##_reduce splits DATA[0..n) in half, reduces each half
 *   recursively and combines the two partial results with TYPE##_##FUNC.
 *   Both halves use n / 2 elements, so N is assumed to be a power of two
 *   (NOTE(review): presumably guaranteed by whoever builds DESC; confirm).
 *
 *   HELPER(NAME) copies the elements of VN into the DATA temporary,
 *   substituting IDENT for each element whose governing predicate bit in
 *   VG is clear, pads DATA with IDENT from oprsz up to maxsz (taken from
 *   simd_data(desc)), and returns the reduction of maxsz / sizeof(TYPE)
 *   elements.  VS is passed through as the float_status pointer.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                     \
    if (n == 1) {                                                     \
        return *data;                                                 \
    } else {                                                          \
        uintptr_t half = n / 2;                                       \
        TYPE lo = NAME##_reduce(data, status, half);                  \
        TYPE hi = NAME##_reduce(data + half, status, half);           \
        return TYPE##_##FUNC(lo, hi, status);                         \
    }                                                                 \
}                                                                     \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
{                                                                     \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
    for (i = 0; i < oprsz; ) {                                        \
        /* 16 predicate bits govern the next 16 bytes of vector.  */  \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
        do {                                                          \
            TYPE nn = *(TYPE *)(vn + H(i));                           \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
        } while (i & 15);                                             \
    }                                                                 \
    /* Pad the tail so that inactive lanes do not affect the result.  */ \
    for (; i < maxsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)((void *)data + i) = IDENT;                          \
    }                                                                 \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
}
4210 
/* Sum reduction; inactive lanes contribute floatN_zero.  */
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)

/* Identity is floatN_default_nan, without the function call.  */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)

/* maxnum reduction; same NaN bit patterns as the minnum identities.  */
DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)

/* Minimum reduction; inactive lanes contribute floatN_infinity.  */
DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)

/* Maximum reduction; inactive lanes contribute the sign-flipped infinity.  */
DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))

#undef DO_REDUCE
4233 
4234 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4235                              void *status, uint32_t desc)
4236 {
4237     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4238     float16 result = nn;
4239 
4240     do {
4241         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4242         do {
4243             if (pg & 1) {
4244                 float16 mm = *(float16 *)(vm + H1_2(i));
4245                 result = float16_add(result, mm, status);
4246             }
4247             i += sizeof(float16), pg >>= sizeof(float16);
4248         } while (i & 15);
4249     } while (i < opr_sz);
4250 
4251     return result;
4252 }
4253 
4254 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4255                              void *status, uint32_t desc)
4256 {
4257     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4258     float32 result = nn;
4259 
4260     do {
4261         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4262         do {
4263             if (pg & 1) {
4264                 float32 mm = *(float32 *)(vm + H1_2(i));
4265                 result = float32_add(result, mm, status);
4266             }
4267             i += sizeof(float32), pg >>= sizeof(float32);
4268         } while (i & 15);
4269     } while (i < opr_sz);
4270 
4271     return result;
4272 }
4273 
4274 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4275                              void *status, uint32_t desc)
4276 {
4277     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4278     uint64_t *m = vm;
4279     uint8_t *pg = vg;
4280 
4281     for (i = 0; i < opr_sz; i++) {
4282         if (pg[H1(i)] & 1) {
4283             nn = float64_add(nn, m[i], status);
4284         }
4285     }
4286 
4287     return nn;
4288 }
4289 
/*
 * Fully general three-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 *
 * The vector is walked from the top down.  For every element whose
 * governing predicate bit is set, VD receives OP(VN, VM, status);
 * inactive elements of VD are left untouched.
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    uint64_t *pred = vg;                                        \
    intptr_t off = simd_oprsz(desc);                            \
    do {                                                        \
        /* Predicate word for the 64 bytes that end at OFF.  */ \
        uint64_t guard = pred[(off - 1) >> 6];                  \
        do {                                                    \
            off -= sizeof(TYPE);                                \
            if (likely((guard >> (off & 63)) & 1)) {            \
                TYPE lhs = *(TYPE *)(vn + H(off));              \
                TYPE rhs = *(TYPE *)(vm + H(off));              \
                *(TYPE *)(vd + H(off)) = OP(lhs, rhs, status);  \
            }                                                   \
        } while (off & 63);                                     \
    } while (off != 0);                                         \
}
4311 
/*
 * Predicated element-wise binary operations.  TYPE is the unsigned
 * integer container of the element width; OP is the softfloat routine.
 */

/* Addition.  */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)

/* Subtraction.  */
DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)

/* Multiplication.  */
DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)

/* Division.  */
DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)

/* Minimum (floatN_min).  */
DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)

/* Maximum (floatN_max).  */
DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)

/* Minimum number (floatN_minnum).  */
DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)

/* Maximum number (floatN_maxnum).  */
DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4343 
4344 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4345 {
4346     return float16_abs(float16_sub(a, b, s));
4347 }
4348 
4349 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4350 {
4351     return float32_abs(float32_sub(a, b, s));
4352 }
4353 
4354 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4355 {
4356     return float64_abs(float64_sub(a, b, s));
4357 }
4358 
/* Predicated element-wise absolute difference, via the abd_* helpers.  */
DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4362 
4363 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4364 {
4365     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4366     return float64_scalbn(a, b_int, s);
4367 }
4368 
/*
 * Predicated scalbn.  TYPE is a signed integer here, so the second
 * operand is loaded as a signed value; the 64-bit form goes through
 * scalbn_d to clamp it to an int.
 */
DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)

/* Predicated mulx, reusing helpers that are defined elsewhere.  */
DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)

#undef DO_ZPZZ_FP
4378 
/*
 * Three-operand expander, with one scalar operand, controlled by
 * a predicate, with the extra float_status parameter.
 *
 * SCALAR is converted to TYPE once, up front.  The vector is walked from
 * the top down; for every element whose governing predicate bit is set,
 * VD receives OP(VN, scalar, status).  Inactive elements of VD are left
 * untouched.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                    \
{                                                                 \
    uint64_t *pred = vg;                                          \
    TYPE rhs = scalar;                                            \
    intptr_t off = simd_oprsz(desc);                              \
    do {                                                          \
        /* Predicate word for the 64 bytes that end at OFF.  */   \
        uint64_t guard = pred[(off - 1) >> 6];                    \
        do {                                                      \
            off -= sizeof(TYPE);                                  \
            if (likely((guard >> (off & 63)) & 1)) {              \
                TYPE lhs = *(TYPE *)(vn + H(off));                \
                *(TYPE *)(vd + H(off)) = OP(lhs, rhs, status);    \
            }                                                     \
        } while (off & 63);                                       \
    } while (off != 0);                                           \
}
4400 
/* Predicated vector-by-scalar addition.  */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)

/* Predicated vector-by-scalar subtraction (element - scalar).  */
DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)

/* Predicated vector-by-scalar multiplication.  */
DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4412 
4413 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4414 {
4415     return float16_sub(b, a, s);
4416 }
4417 
4418 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4419 {
4420     return float32_sub(b, a, s);
4421 }
4422 
4423 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4424 {
4425     return float64_sub(b, a, s);
4426 }
4427 
/* Predicated reversed subtraction (scalar - element), via subr_*.  */
DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)

/* Predicated vector-by-scalar maximum number (floatN_maxnum).  */
DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)

/* Predicated vector-by-scalar minimum number (floatN_minnum).  */
DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)

/* Predicated vector-by-scalar maximum (floatN_max).  */
DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)

/* Predicated vector-by-scalar minimum (floatN_min).  */
DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4447 
/*
 * Fully general two-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 *
 * The vector is walked from the top down.  For every element whose
 * governing predicate bit is set, VD receives OP(VN, status); inactive
 * elements of VD are left untouched.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                     \
    uint64_t *pred = vg;                                              \
    intptr_t off = simd_oprsz(desc);                                  \
    do {                                                              \
        /* Predicate word for the 64 bytes that end at OFF.  */       \
        uint64_t guard = pred[(off - 1) >> 6];                        \
        do {                                                          \
            off -= sizeof(TYPE);                                      \
            if (likely((guard >> (off & 63)) & 1)) {                  \
                TYPE src = *(TYPE *)(vn + H(off));                    \
                *(TYPE *)(vd + H(off)) = OP(src, status);             \
            }                                                         \
        } while (off & 63);                                           \
    } while (off != 0);                                               \
}
4467 
4468 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4469  * FZ16.  When converting from fp16, this affects flushing input denormals;
4470  * when converting to fp16, this affects flushing output denormals.
4471  */
4472 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4473 {
4474     bool save = get_flush_inputs_to_zero(fpst);
4475     float32 ret;
4476 
4477     set_flush_inputs_to_zero(false, fpst);
4478     ret = float16_to_float32(f, true, fpst);
4479     set_flush_inputs_to_zero(save, fpst);
4480     return ret;
4481 }
4482 
4483 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4484 {
4485     bool save = get_flush_inputs_to_zero(fpst);
4486     float64 ret;
4487 
4488     set_flush_inputs_to_zero(false, fpst);
4489     ret = float16_to_float64(f, true, fpst);
4490     set_flush_inputs_to_zero(save, fpst);
4491     return ret;
4492 }
4493 
4494 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4495 {
4496     bool save = get_flush_to_zero(fpst);
4497     float16 ret;
4498 
4499     set_flush_to_zero(false, fpst);
4500     ret = float32_to_float16(f, true, fpst);
4501     set_flush_to_zero(save, fpst);
4502     return ret;
4503 }
4504 
4505 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4506 {
4507     bool save = get_flush_to_zero(fpst);
4508     float16 ret;
4509 
4510     set_flush_to_zero(false, fpst);
4511     ret = float64_to_float16(f, true, fpst);
4512     set_flush_to_zero(save, fpst);
4513     return ret;
4514 }
4515 
4516 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4517 {
4518     if (float16_is_any_nan(f)) {
4519         float_raise(float_flag_invalid, s);
4520         return 0;
4521     }
4522     return float16_to_int16_round_to_zero(f, s);
4523 }
4524 
4525 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4526 {
4527     if (float16_is_any_nan(f)) {
4528         float_raise(float_flag_invalid, s);
4529         return 0;
4530     }
4531     return float16_to_int64_round_to_zero(f, s);
4532 }
4533 
4534 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4535 {
4536     if (float32_is_any_nan(f)) {
4537         float_raise(float_flag_invalid, s);
4538         return 0;
4539     }
4540     return float32_to_int64_round_to_zero(f, s);
4541 }
4542 
4543 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4544 {
4545     if (float64_is_any_nan(f)) {
4546         float_raise(float_flag_invalid, s);
4547         return 0;
4548     }
4549     return float64_to_int64_round_to_zero(f, s);
4550 }
4551 
4552 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4553 {
4554     if (float16_is_any_nan(f)) {
4555         float_raise(float_flag_invalid, s);
4556         return 0;
4557     }
4558     return float16_to_uint16_round_to_zero(f, s);
4559 }
4560 
4561 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4562 {
4563     if (float16_is_any_nan(f)) {
4564         float_raise(float_flag_invalid, s);
4565         return 0;
4566     }
4567     return float16_to_uint64_round_to_zero(f, s);
4568 }
4569 
4570 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4571 {
4572     if (float32_is_any_nan(f)) {
4573         float_raise(float_flag_invalid, s);
4574         return 0;
4575     }
4576     return float32_to_uint64_round_to_zero(f, s);
4577 }
4578 
4579 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4580 {
4581     if (float64_is_any_nan(f)) {
4582         float_raise(float_flag_invalid, s);
4583         return 0;
4584     }
4585     return float64_to_uint64_round_to_zero(f, s);
4586 }
4587 
/*
 * Instantiate one helper per DO_ZPZ_FP line below.
 * NOTE(review): DO_ZPZ_FP is defined earlier in the file; its arguments
 * appear to be (helper name, element container type, host-endian index
 * macro, per-element operation) -- confirm against the macro definition.
 */

/* Conversions between FP formats (the sve_f* wrappers are defined above). */
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)

/* FP to signed integer conversions. */
DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)

/* FP to unsigned integer conversions. */
DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)

/* Round to integral, via the shared helper_*rint* helpers. */
DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)

/* Round to integral, via softfloat's floatN_round_to_int directly. */
DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)

/* FRECPX, via the shared helper_frecpx_* helpers. */
DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)

/* Square root. */
DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)

/* Signed integer to FP conversions. */
DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)

/* Unsigned integer to FP conversions. */
DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4643 
4644 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4645 {
4646     /* Extract frac to the top of the uint32_t. */
4647     uint32_t frac = (uint32_t)a << (16 + 6);
4648     int16_t exp = extract32(a, 10, 5);
4649 
4650     if (unlikely(exp == 0)) {
4651         if (frac != 0) {
4652             if (!get_flush_inputs_to_zero(s)) {
4653                 /* denormal: bias - fractional_zeros */
4654                 return -15 - clz32(frac);
4655             }
4656             /* flush to zero */
4657             float_raise(float_flag_input_denormal, s);
4658         }
4659     } else if (unlikely(exp == 0x1f)) {
4660         if (frac == 0) {
4661             return INT16_MAX; /* infinity */
4662         }
4663     } else {
4664         /* normal: exp - bias */
4665         return exp - 15;
4666     }
4667     /* nan or zero */
4668     float_raise(float_flag_invalid, s);
4669     return INT16_MIN;
4670 }
4671 
4672 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4673 {
4674     /* Extract frac to the top of the uint32_t. */
4675     uint32_t frac = a << 9;
4676     int32_t exp = extract32(a, 23, 8);
4677 
4678     if (unlikely(exp == 0)) {
4679         if (frac != 0) {
4680             if (!get_flush_inputs_to_zero(s)) {
4681                 /* denormal: bias - fractional_zeros */
4682                 return -127 - clz32(frac);
4683             }
4684             /* flush to zero */
4685             float_raise(float_flag_input_denormal, s);
4686         }
4687     } else if (unlikely(exp == 0xff)) {
4688         if (frac == 0) {
4689             return INT32_MAX; /* infinity */
4690         }
4691     } else {
4692         /* normal: exp - bias */
4693         return exp - 127;
4694     }
4695     /* nan or zero */
4696     float_raise(float_flag_invalid, s);
4697     return INT32_MIN;
4698 }
4699 
4700 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4701 {
4702     /* Extract frac to the top of the uint64_t. */
4703     uint64_t frac = a << 12;
4704     int64_t exp = extract64(a, 52, 11);
4705 
4706     if (unlikely(exp == 0)) {
4707         if (frac != 0) {
4708             if (!get_flush_inputs_to_zero(s)) {
4709                 /* denormal: bias - fractional_zeros */
4710                 return -1023 - clz64(frac);
4711             }
4712             /* flush to zero */
4713             float_raise(float_flag_input_denormal, s);
4714         }
4715     } else if (unlikely(exp == 0x7ff)) {
4716         if (frac == 0) {
4717             return INT64_MAX; /* infinity */
4718         }
4719     } else {
4720         /* normal: exp - bias */
4721         return exp - 1023;
4722     }
4723     /* nan or zero */
4724     float_raise(float_flag_invalid, s);
4725     return INT64_MIN;
4726 }
4727 
/*
 * FLOGB helpers for each element size, built on the
 * do_floatN_logb_as_int routines.
 */
DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)

/* Retire the generator macro once the last instantiation is done. */
#undef DO_ZPZ_FP
4733 
4734 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4735                             float_status *status, uint32_t desc,
4736                             uint16_t neg1, uint16_t neg3)
4737 {
4738     intptr_t i = simd_oprsz(desc);
4739     uint64_t *g = vg;
4740 
4741     do {
4742         uint64_t pg = g[(i - 1) >> 6];
4743         do {
4744             i -= 2;
4745             if (likely((pg >> (i & 63)) & 1)) {
4746                 float16 e1, e2, e3, r;
4747 
4748                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4749                 e2 = *(uint16_t *)(vm + H1_2(i));
4750                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4751                 r = float16_muladd(e1, e2, e3, 0, status);
4752                 *(uint16_t *)(vd + H1_2(i)) = r;
4753             }
4754         } while (i & 63);
4755     } while (i != 0);
4756 }
4757 
4758 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4759                               void *vg, void *status, uint32_t desc)
4760 {
4761     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4762 }
4763 
4764 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4765                               void *vg, void *status, uint32_t desc)
4766 {
4767     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4768 }
4769 
4770 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4771                                void *vg, void *status, uint32_t desc)
4772 {
4773     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4774 }
4775 
4776 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4777                                void *vg, void *status, uint32_t desc)
4778 {
4779     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4780 }
4781 
4782 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4783                             float_status *status, uint32_t desc,
4784                             uint32_t neg1, uint32_t neg3)
4785 {
4786     intptr_t i = simd_oprsz(desc);
4787     uint64_t *g = vg;
4788 
4789     do {
4790         uint64_t pg = g[(i - 1) >> 6];
4791         do {
4792             i -= 4;
4793             if (likely((pg >> (i & 63)) & 1)) {
4794                 float32 e1, e2, e3, r;
4795 
4796                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4797                 e2 = *(uint32_t *)(vm + H1_4(i));
4798                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4799                 r = float32_muladd(e1, e2, e3, 0, status);
4800                 *(uint32_t *)(vd + H1_4(i)) = r;
4801             }
4802         } while (i & 63);
4803     } while (i != 0);
4804 }
4805 
4806 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4807                               void *vg, void *status, uint32_t desc)
4808 {
4809     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4810 }
4811 
4812 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4813                               void *vg, void *status, uint32_t desc)
4814 {
4815     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4816 }
4817 
4818 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4819                                void *vg, void *status, uint32_t desc)
4820 {
4821     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4822 }
4823 
4824 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4825                                void *vg, void *status, uint32_t desc)
4826 {
4827     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4828 }
4829 
4830 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4831                             float_status *status, uint32_t desc,
4832                             uint64_t neg1, uint64_t neg3)
4833 {
4834     intptr_t i = simd_oprsz(desc);
4835     uint64_t *g = vg;
4836 
4837     do {
4838         uint64_t pg = g[(i - 1) >> 6];
4839         do {
4840             i -= 8;
4841             if (likely((pg >> (i & 63)) & 1)) {
4842                 float64 e1, e2, e3, r;
4843 
4844                 e1 = *(uint64_t *)(vn + i) ^ neg1;
4845                 e2 = *(uint64_t *)(vm + i);
4846                 e3 = *(uint64_t *)(va + i) ^ neg3;
4847                 r = float64_muladd(e1, e2, e3, 0, status);
4848                 *(uint64_t *)(vd + i) = r;
4849             }
4850         } while (i & 63);
4851     } while (i != 0);
4852 }
4853 
4854 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4855                               void *vg, void *status, uint32_t desc)
4856 {
4857     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4858 }
4859 
4860 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4861                               void *vg, void *status, uint32_t desc)
4862 {
4863     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4864 }
4865 
4866 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4867                                void *vg, void *status, uint32_t desc)
4868 {
4869     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4870 }
4871 
4872 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4873                                void *vg, void *status, uint32_t desc)
4874 {
4875     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4876 }
4877 
/* Two operand floating-point comparison controlled by a predicate.
 * Unlike the integer version, we are not allowed to optimistically
 * compare operands, since the comparison may have side effects wrt
 * the FPSR.
 *
 * The vector is walked from the highest byte offset down to 0.  For
 * each element OUT is first shifted left by sizeof(TYPE), then the
 * result of OP is OR'd into bit 0 if the element is active; inactive
 * elements contribute a zero and OP is never evaluated for them.  One
 * 64-bit word of the destination predicate is stored for every 64
 * bytes of vector processed.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}
4902 
/*
 * Per-size wrappers: append the _h/_s/_d suffix to NAME and select the
 * matching softfloat type and host-endian index macro.
 */
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)

/* Emit all three element sizes of one comparison in a single line. */
#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)
4914 
/*
 * Per-element comparison predicates, used as the OP argument of the
 * DO_FPCMP_* generators.  Each yields 0 or 1.
 *
 *   GE/GT/LE/LT   use TYPE##_compare; GE and GT are expressed by
 *                 swapping the operands of a <= / < test.
 *   EQ/NE/UO      use TYPE##_compare_quiet.
 *   FACGE/FACGT   compare the absolute values of both operands.
 *
 * Every expansion is fully parenthesized so that the macro behaves as
 * a single operand wherever it is used (e.g. under '!' or '==').
 */
#define DO_FCMGE(TYPE, X, Y, ST)  (TYPE##_compare(Y, X, ST) <= 0)
#define DO_FCMGT(TYPE, X, Y, ST)  (TYPE##_compare(Y, X, ST) < 0)
#define DO_FCMLE(TYPE, X, Y, ST)  (TYPE##_compare(X, Y, ST) <= 0)
#define DO_FCMLT(TYPE, X, Y, ST)  (TYPE##_compare(X, Y, ST) < 0)
#define DO_FCMEQ(TYPE, X, Y, ST)  (TYPE##_compare_quiet(X, Y, ST) == 0)
#define DO_FCMNE(TYPE, X, Y, ST)  (TYPE##_compare_quiet(X, Y, ST) != 0)
#define DO_FCMUO(TYPE, X, Y, ST)  \
    (TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered)
#define DO_FACGE(TYPE, X, Y, ST)  \
    (TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0)
#define DO_FACGT(TYPE, X, Y, ST)  \
    (TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0)
4927 
/* Two-vector comparisons, each emitted for the h, s and d element sizes. */
DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

/* The two-operand generators are finished with; the OP macros stay. */
#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
4941 
/* One operand floating-point comparison against zero, controlled
 * by a predicate.
 *
 * Same walk as the two-operand form: elements are visited from the
 * highest byte offset down to 0, OUT is shifted left by sizeof(TYPE)
 * per element, and OP is evaluated only for active elements, with a
 * literal 0 as its second operand.  One 64-bit destination predicate
 * word is stored per 64 bytes of vector.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
void HELPER(NAME)(void *vd, void *vn, void *vg,            \
                  void *status, uint32_t desc)             \
{                                                          \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
    uint64_t *d = vd, *g = vg;                             \
    do {                                                   \
        uint64_t out = 0, pg = g[j];                       \
        do {                                               \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
            if ((pg >> (i & 63)) & 1) {                    \
                TYPE nn = *(TYPE *)(vn + H(i));            \
                out |= OP(TYPE, nn, 0, status);            \
            }                                              \
        } while (i & 63);                                  \
        d[j--] = out;                                      \
    } while (i > 0);                                       \
}
4963 
/*
 * Per-size wrappers: append the _h/_s/_d suffix to NAME and select the
 * matching softfloat type and host-endian index macro.
 */
#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)

/* Emit all three element sizes of one comparison in a single line. */
#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)
4975 
/* Compare-with-zero helpers, each emitted for the h, s and d sizes. */
DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4982 
4983 /* FP Trig Multiply-Add. */
4984 
4985 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4986 {
4987     static const float16 coeff[16] = {
4988         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4989         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4990     };
4991     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4992     intptr_t x = simd_data(desc);
4993     float16 *d = vd, *n = vn, *m = vm;
4994     for (i = 0; i < opr_sz; i++) {
4995         float16 mm = m[i];
4996         intptr_t xx = x;
4997         if (float16_is_neg(mm)) {
4998             mm = float16_abs(mm);
4999             xx += 8;
5000         }
5001         d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5002     }
5003 }
5004 
5005 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5006 {
5007     static const float32 coeff[16] = {
5008         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5009         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5010         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5011         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5012     };
5013     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5014     intptr_t x = simd_data(desc);
5015     float32 *d = vd, *n = vn, *m = vm;
5016     for (i = 0; i < opr_sz; i++) {
5017         float32 mm = m[i];
5018         intptr_t xx = x;
5019         if (float32_is_neg(mm)) {
5020             mm = float32_abs(mm);
5021             xx += 8;
5022         }
5023         d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5024     }
5025 }
5026 
5027 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5028 {
5029     static const float64 coeff[16] = {
5030         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5031         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5032         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5033         0x3de5d8408868552full, 0x0000000000000000ull,
5034         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5035         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5036         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5037         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5038     };
5039     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5040     intptr_t x = simd_data(desc);
5041     float64 *d = vd, *n = vn, *m = vm;
5042     for (i = 0; i < opr_sz; i++) {
5043         float64 mm = m[i];
5044         intptr_t xx = x;
5045         if (float64_is_neg(mm)) {
5046             mm = float64_abs(mm);
5047             xx += 8;
5048         }
5049         d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5050     }
5051 }
5052 
5053 /*
5054  * FP Complex Add
5055  */
5056 
5057 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5058                          void *vs, uint32_t desc)
5059 {
5060     intptr_t j, i = simd_oprsz(desc);
5061     uint64_t *g = vg;
5062     float16 neg_imag = float16_set_sign(0, simd_data(desc));
5063     float16 neg_real = float16_chs(neg_imag);
5064 
5065     do {
5066         uint64_t pg = g[(i - 1) >> 6];
5067         do {
5068             float16 e0, e1, e2, e3;
5069 
5070             /* I holds the real index; J holds the imag index.  */
5071             j = i - sizeof(float16);
5072             i -= 2 * sizeof(float16);
5073 
5074             e0 = *(float16 *)(vn + H1_2(i));
5075             e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5076             e2 = *(float16 *)(vn + H1_2(j));
5077             e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5078 
5079             if (likely((pg >> (i & 63)) & 1)) {
5080                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5081             }
5082             if (likely((pg >> (j & 63)) & 1)) {
5083                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5084             }
5085         } while (i & 63);
5086     } while (i != 0);
5087 }
5088 
5089 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5090                          void *vs, uint32_t desc)
5091 {
5092     intptr_t j, i = simd_oprsz(desc);
5093     uint64_t *g = vg;
5094     float32 neg_imag = float32_set_sign(0, simd_data(desc));
5095     float32 neg_real = float32_chs(neg_imag);
5096 
5097     do {
5098         uint64_t pg = g[(i - 1) >> 6];
5099         do {
5100             float32 e0, e1, e2, e3;
5101 
5102             /* I holds the real index; J holds the imag index.  */
5103             j = i - sizeof(float32);
5104             i -= 2 * sizeof(float32);
5105 
5106             e0 = *(float32 *)(vn + H1_2(i));
5107             e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5108             e2 = *(float32 *)(vn + H1_2(j));
5109             e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5110 
5111             if (likely((pg >> (i & 63)) & 1)) {
5112                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5113             }
5114             if (likely((pg >> (j & 63)) & 1)) {
5115                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5116             }
5117         } while (i & 63);
5118     } while (i != 0);
5119 }
5120 
5121 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5122                          void *vs, uint32_t desc)
5123 {
5124     intptr_t j, i = simd_oprsz(desc);
5125     uint64_t *g = vg;
5126     float64 neg_imag = float64_set_sign(0, simd_data(desc));
5127     float64 neg_real = float64_chs(neg_imag);
5128 
5129     do {
5130         uint64_t pg = g[(i - 1) >> 6];
5131         do {
5132             float64 e0, e1, e2, e3;
5133 
5134             /* I holds the real index; J holds the imag index.  */
5135             j = i - sizeof(float64);
5136             i -= 2 * sizeof(float64);
5137 
5138             e0 = *(float64 *)(vn + H1_2(i));
5139             e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5140             e2 = *(float64 *)(vn + H1_2(j));
5141             e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5142 
5143             if (likely((pg >> (i & 63)) & 1)) {
5144                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5145             }
5146             if (likely((pg >> (j & 63)) & 1)) {
5147                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5148             }
5149         } while (i & 63);
5150     } while (i != 0);
5151 }
5152 
5153 /*
5154  * FP Complex Multiply
5155  */
5156 
5157 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5158                                void *vg, void *status, uint32_t desc)
5159 {
5160     intptr_t j, i = simd_oprsz(desc);
5161     unsigned rot = simd_data(desc);
5162     bool flip = rot & 1;
5163     float16 neg_imag, neg_real;
5164     uint64_t *g = vg;
5165 
5166     neg_imag = float16_set_sign(0, (rot & 2) != 0);
5167     neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5168 
5169     do {
5170         uint64_t pg = g[(i - 1) >> 6];
5171         do {
5172             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5173 
5174             /* I holds the real index; J holds the imag index.  */
5175             j = i - sizeof(float16);
5176             i -= 2 * sizeof(float16);
5177 
5178             nr = *(float16 *)(vn + H1_2(i));
5179             ni = *(float16 *)(vn + H1_2(j));
5180             mr = *(float16 *)(vm + H1_2(i));
5181             mi = *(float16 *)(vm + H1_2(j));
5182 
5183             e2 = (flip ? ni : nr);
5184             e1 = (flip ? mi : mr) ^ neg_real;
5185             e4 = e2;
5186             e3 = (flip ? mr : mi) ^ neg_imag;
5187 
5188             if (likely((pg >> (i & 63)) & 1)) {
5189                 d = *(float16 *)(va + H1_2(i));
5190                 d = float16_muladd(e2, e1, d, 0, status);
5191                 *(float16 *)(vd + H1_2(i)) = d;
5192             }
5193             if (likely((pg >> (j & 63)) & 1)) {
5194                 d = *(float16 *)(va + H1_2(j));
5195                 d = float16_muladd(e4, e3, d, 0, status);
5196                 *(float16 *)(vd + H1_2(j)) = d;
5197             }
5198         } while (i & 63);
5199     } while (i != 0);
5200 }
5201 
5202 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5203                                void *vg, void *status, uint32_t desc)
5204 {
5205     intptr_t j, i = simd_oprsz(desc);
5206     unsigned rot = simd_data(desc);
5207     bool flip = rot & 1;
5208     float32 neg_imag, neg_real;
5209     uint64_t *g = vg;
5210 
5211     neg_imag = float32_set_sign(0, (rot & 2) != 0);
5212     neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5213 
5214     do {
5215         uint64_t pg = g[(i - 1) >> 6];
5216         do {
5217             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5218 
5219             /* I holds the real index; J holds the imag index.  */
5220             j = i - sizeof(float32);
5221             i -= 2 * sizeof(float32);
5222 
5223             nr = *(float32 *)(vn + H1_2(i));
5224             ni = *(float32 *)(vn + H1_2(j));
5225             mr = *(float32 *)(vm + H1_2(i));
5226             mi = *(float32 *)(vm + H1_2(j));
5227 
5228             e2 = (flip ? ni : nr);
5229             e1 = (flip ? mi : mr) ^ neg_real;
5230             e4 = e2;
5231             e3 = (flip ? mr : mi) ^ neg_imag;
5232 
5233             if (likely((pg >> (i & 63)) & 1)) {
5234                 d = *(float32 *)(va + H1_2(i));
5235                 d = float32_muladd(e2, e1, d, 0, status);
5236                 *(float32 *)(vd + H1_2(i)) = d;
5237             }
5238             if (likely((pg >> (j & 63)) & 1)) {
5239                 d = *(float32 *)(va + H1_2(j));
5240                 d = float32_muladd(e4, e3, d, 0, status);
5241                 *(float32 *)(vd + H1_2(j)) = d;
5242             }
5243         } while (i & 63);
5244     } while (i != 0);
5245 }
5246 
5247 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5248                                void *vg, void *status, uint32_t desc)
5249 {
5250     intptr_t j, i = simd_oprsz(desc);
5251     unsigned rot = simd_data(desc);
5252     bool flip = rot & 1;
5253     float64 neg_imag, neg_real;
5254     uint64_t *g = vg;
5255 
5256     neg_imag = float64_set_sign(0, (rot & 2) != 0);
5257     neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5258 
5259     do {
5260         uint64_t pg = g[(i - 1) >> 6];
5261         do {
5262             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5263 
5264             /* I holds the real index; J holds the imag index.  */
5265             j = i - sizeof(float64);
5266             i -= 2 * sizeof(float64);
5267 
5268             nr = *(float64 *)(vn + H1_2(i));
5269             ni = *(float64 *)(vn + H1_2(j));
5270             mr = *(float64 *)(vm + H1_2(i));
5271             mi = *(float64 *)(vm + H1_2(j));
5272 
5273             e2 = (flip ? ni : nr);
5274             e1 = (flip ? mi : mr) ^ neg_real;
5275             e4 = e2;
5276             e3 = (flip ? mr : mi) ^ neg_imag;
5277 
5278             if (likely((pg >> (i & 63)) & 1)) {
5279                 d = *(float64 *)(va + H1_2(i));
5280                 d = float64_muladd(e2, e1, d, 0, status);
5281                 *(float64 *)(vd + H1_2(i)) = d;
5282             }
5283             if (likely((pg >> (j & 63)) & 1)) {
5284                 d = *(float64 *)(va + H1_2(j));
5285                 d = float64_muladd(e4, e3, d, 0, status);
5286                 *(float64 *)(vd + H1_2(j)) = d;
5287             }
5288         } while (i & 63);
5289     } while (i != 0);
5290 }
5291 
5292 /*
5293  * Load contiguous data, protected by a governing predicate.
5294  */
5295 
5296 /*
5297  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5298  * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
5299  * element >= @reg_off, or @reg_max if there were no active elements at all.
5300  */
5301 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5302                                  intptr_t reg_max, int esz)
5303 {
5304     uint64_t pg_mask = pred_esz_masks[esz];
5305     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5306 
5307     /* In normal usage, the first element is active.  */
5308     if (likely(pg & 1)) {
5309         return reg_off;
5310     }
5311 
5312     if (pg == 0) {
5313         reg_off &= -64;
5314         do {
5315             reg_off += 64;
5316             if (unlikely(reg_off >= reg_max)) {
5317                 /* The entire predicate was false.  */
5318                 return reg_max;
5319             }
5320             pg = vg[reg_off >> 6] & pg_mask;
5321         } while (pg == 0);
5322     }
5323     reg_off += ctz64(pg);
5324 
5325     /* We should never see an out of range predicate bit set.  */
5326     tcg_debug_assert(reg_off < reg_max);
5327     return reg_off;
5328 }
5329 
5330 /*
5331  * Resolve the guest virtual address to info->host and info->flags.
5332  * If @nofault, return false if the page is invalid, otherwise
5333  * exit via page fault exception.
5334  */
5335 
5336 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5337                     target_ulong addr, int mem_off, MMUAccessType access_type,
5338                     int mmu_idx, uintptr_t retaddr)
5339 {
5340     int flags;
5341 
5342     addr += mem_off;
5343 
5344     /*
5345      * User-only currently always issues with TBI.  See the comment
5346      * above useronly_clean_ptr.  Usually we clean this top byte away
5347      * during translation, but we can't do that for e.g. vector + imm
5348      * addressing modes.
5349      *
5350      * We currently always enable TBI for user-only, and do not provide
5351      * a way to turn it off.  So clean the pointer unconditionally here,
5352      * rather than look it up here, or pass it down from above.
5353      */
5354     addr = useronly_clean_ptr(addr);
5355 
5356 #ifdef CONFIG_USER_ONLY
5357     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5358                                &info->host, retaddr);
5359 #else
5360     CPUTLBEntryFull *full;
5361     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5362                               &info->host, &full, retaddr);
5363 #endif
5364     info->flags = flags;
5365 
5366     if (flags & TLB_INVALID_MASK) {
5367         g_assert(nofault);
5368         return false;
5369     }
5370 
5371 #ifdef CONFIG_USER_ONLY
5372     memset(&info->attrs, 0, sizeof(info->attrs));
5373     /* Require both ANON and MTE; see allocation_tag_mem(). */
5374     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5375 #else
5376     info->attrs = full->attrs;
5377     info->tagged = full->extra.arm.pte_attrs == 0xf0;
5378 #endif
5379 
5380     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5381     info->host -= mem_off;
5382     return true;
5383 }
5384 
5385 /*
5386  * Find first active element on each page, and a loose bound for the
5387  * final element on each page.  Identify any single element that spans
5388  * the page boundary.  Return true if there are any active elements.
5389  */
5390 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5391                             intptr_t reg_max, int esz, int msize)
5392 {
5393     const int esize = 1 << esz;
5394     const uint64_t pg_mask = pred_esz_masks[esz];
5395     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5396     intptr_t mem_off_last, mem_off_split;
5397     intptr_t page_split, elt_split;
5398     intptr_t i;
5399 
5400     /* Set all of the element indices to -1, and the TLB data to 0. */
5401     memset(info, -1, offsetof(SVEContLdSt, page));
5402     memset(info->page, 0, sizeof(info->page));
5403 
5404     /* Gross scan over the entire predicate to find bounds. */
5405     i = 0;
5406     do {
5407         uint64_t pg = vg[i] & pg_mask;
5408         if (pg) {
5409             reg_off_last = i * 64 + 63 - clz64(pg);
5410             if (reg_off_first < 0) {
5411                 reg_off_first = i * 64 + ctz64(pg);
5412             }
5413         }
5414     } while (++i * 64 < reg_max);
5415 
5416     if (unlikely(reg_off_first < 0)) {
5417         /* No active elements, no pages touched. */
5418         return false;
5419     }
5420     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5421 
5422     info->reg_off_first[0] = reg_off_first;
5423     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5424     mem_off_last = (reg_off_last >> esz) * msize;
5425 
5426     page_split = -(addr | TARGET_PAGE_MASK);
5427     if (likely(mem_off_last + msize <= page_split)) {
5428         /* The entire operation fits within a single page. */
5429         info->reg_off_last[0] = reg_off_last;
5430         return true;
5431     }
5432 
5433     info->page_split = page_split;
5434     elt_split = page_split / msize;
5435     reg_off_split = elt_split << esz;
5436     mem_off_split = elt_split * msize;
5437 
5438     /*
5439      * This is the last full element on the first page, but it is not
5440      * necessarily active.  If there is no full element, i.e. the first
5441      * active element is the one that's split, this value remains -1.
5442      * It is useful as iteration bounds.
5443      */
5444     if (elt_split != 0) {
5445         info->reg_off_last[0] = reg_off_split - esize;
5446     }
5447 
5448     /* Determine if an unaligned element spans the pages.  */
5449     if (page_split % msize != 0) {
5450         /* It is helpful to know if the split element is active. */
5451         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5452             info->reg_off_split = reg_off_split;
5453             info->mem_off_split = mem_off_split;
5454 
5455             if (reg_off_split == reg_off_last) {
5456                 /* The page crossing element is last. */
5457                 return true;
5458             }
5459         }
5460         reg_off_split += esize;
5461         mem_off_split += msize;
5462     }
5463 
5464     /*
5465      * We do want the first active element on the second page, because
5466      * this may affect the address reported in an exception.
5467      */
5468     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5469     tcg_debug_assert(reg_off_split <= reg_off_last);
5470     info->reg_off_first[1] = reg_off_split;
5471     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5472     info->reg_off_last[1] = reg_off_last;
5473     return true;
5474 }
5475 
5476 /*
5477  * Resolve the guest virtual addresses to info->page[].
5478  * Control the generation of page faults with @fault.  Return false if
5479  * there is no work to do, which can only happen with @fault == FAULT_NO.
5480  */
5481 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5482                          CPUARMState *env, target_ulong addr,
5483                          MMUAccessType access_type, uintptr_t retaddr)
5484 {
5485     int mmu_idx = arm_env_mmu_index(env);
5486     int mem_off = info->mem_off_first[0];
5487     bool nofault = fault == FAULT_NO;
5488     bool have_work = true;
5489 
5490     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5491                         access_type, mmu_idx, retaddr)) {
5492         /* No work to be done. */
5493         return false;
5494     }
5495 
5496     if (likely(info->page_split < 0)) {
5497         /* The entire operation was on the one page. */
5498         return true;
5499     }
5500 
5501     /*
5502      * If the second page is invalid, then we want the fault address to be
5503      * the first byte on that page which is accessed.
5504      */
5505     if (info->mem_off_split >= 0) {
5506         /*
5507          * There is an element split across the pages.  The fault address
5508          * should be the first byte of the second page.
5509          */
5510         mem_off = info->page_split;
5511         /*
5512          * If the split element is also the first active element
5513          * of the vector, then:  For first-fault we should continue
5514          * to generate faults for the second page.  For no-fault,
5515          * we have work only if the second page is valid.
5516          */
5517         if (info->mem_off_first[0] < info->mem_off_split) {
5518             nofault = FAULT_FIRST;
5519             have_work = false;
5520         }
5521     } else {
5522         /*
5523          * There is no element split across the pages.  The fault address
5524          * should be the first active element on the second page.
5525          */
5526         mem_off = info->mem_off_first[1];
5527         /*
5528          * There must have been one active element on the first page,
5529          * so we're out of first-fault territory.
5530          */
5531         nofault = fault != FAULT_ALL;
5532     }
5533 
5534     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5535                                 access_type, mmu_idx, retaddr);
5536     return have_work;
5537 }
5538 
5539 #ifndef CONFIG_USER_ONLY
5540 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5541                                uint64_t *vg, target_ulong addr,
5542                                int esize, int msize, int wp_access,
5543                                uintptr_t retaddr)
5544 {
5545     intptr_t mem_off, reg_off, reg_last;
5546     int flags0 = info->page[0].flags;
5547     int flags1 = info->page[1].flags;
5548 
5549     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5550         return;
5551     }
5552 
5553     /* Indicate that watchpoints are handled. */
5554     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5555     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5556 
5557     if (flags0 & TLB_WATCHPOINT) {
5558         mem_off = info->mem_off_first[0];
5559         reg_off = info->reg_off_first[0];
5560         reg_last = info->reg_off_last[0];
5561 
5562         while (reg_off <= reg_last) {
5563             uint64_t pg = vg[reg_off >> 6];
5564             do {
5565                 if ((pg >> (reg_off & 63)) & 1) {
5566                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5567                                          msize, info->page[0].attrs,
5568                                          wp_access, retaddr);
5569                 }
5570                 reg_off += esize;
5571                 mem_off += msize;
5572             } while (reg_off <= reg_last && (reg_off & 63));
5573         }
5574     }
5575 
5576     mem_off = info->mem_off_split;
5577     if (mem_off >= 0) {
5578         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5579                              info->page[0].attrs, wp_access, retaddr);
5580     }
5581 
5582     mem_off = info->mem_off_first[1];
5583     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5584         reg_off = info->reg_off_first[1];
5585         reg_last = info->reg_off_last[1];
5586 
5587         do {
5588             uint64_t pg = vg[reg_off >> 6];
5589             do {
5590                 if ((pg >> (reg_off & 63)) & 1) {
5591                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5592                                          msize, info->page[1].attrs,
5593                                          wp_access, retaddr);
5594                 }
5595                 reg_off += esize;
5596                 mem_off += msize;
5597             } while (reg_off & 63);
5598         } while (reg_off <= reg_last);
5599     }
5600 }
5601 #endif
5602 
5603 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5604                              uint64_t *vg, target_ulong addr, int esize,
5605                              int msize, uint32_t mtedesc, uintptr_t ra)
5606 {
5607     intptr_t mem_off, reg_off, reg_last;
5608 
5609     /* Process the page only if MemAttr == Tagged. */
5610     if (info->page[0].tagged) {
5611         mem_off = info->mem_off_first[0];
5612         reg_off = info->reg_off_first[0];
5613         reg_last = info->reg_off_split;
5614         if (reg_last < 0) {
5615             reg_last = info->reg_off_last[0];
5616         }
5617 
5618         do {
5619             uint64_t pg = vg[reg_off >> 6];
5620             do {
5621                 if ((pg >> (reg_off & 63)) & 1) {
5622                     mte_check(env, mtedesc, addr, ra);
5623                 }
5624                 reg_off += esize;
5625                 mem_off += msize;
5626             } while (reg_off <= reg_last && (reg_off & 63));
5627         } while (reg_off <= reg_last);
5628     }
5629 
5630     mem_off = info->mem_off_first[1];
5631     if (mem_off >= 0 && info->page[1].tagged) {
5632         reg_off = info->reg_off_first[1];
5633         reg_last = info->reg_off_last[1];
5634 
5635         do {
5636             uint64_t pg = vg[reg_off >> 6];
5637             do {
5638                 if ((pg >> (reg_off & 63)) & 1) {
5639                     mte_check(env, mtedesc, addr, ra);
5640                 }
5641                 reg_off += esize;
5642                 mem_off += msize;
5643             } while (reg_off & 63);
5644         } while (reg_off <= reg_last);
5645     }
5646 }
5647 
5648 /*
5649  * Common helper for all contiguous 1,2,3,4-register predicated stores.
5650  */
5651 static inline QEMU_ALWAYS_INLINE
5652 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5653                uint32_t desc, const uintptr_t retaddr,
5654                const int esz, const int msz, const int N, uint32_t mtedesc,
5655                sve_ldst1_host_fn *host_fn,
5656                sve_ldst1_tlb_fn *tlb_fn)
5657 {
5658     const unsigned rd = simd_data(desc);
5659     const intptr_t reg_max = simd_oprsz(desc);
5660     intptr_t reg_off, reg_last, mem_off;
5661     SVEContLdSt info;
5662     void *host;
5663     int flags, i;
5664 
5665     /* Find the active elements.  */
5666     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5667         /* The entire predicate was false; no load occurs.  */
5668         for (i = 0; i < N; ++i) {
5669             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5670         }
5671         return;
5672     }
5673 
5674     /* Probe the page(s).  Exit with exception for any invalid page. */
5675     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5676 
5677     /* Handle watchpoints for all active elements. */
5678     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5679                               BP_MEM_READ, retaddr);
5680 
5681     /*
5682      * Handle mte checks for all active elements.
5683      * Since TBI must be set for MTE, !mtedesc => !mte_active.
5684      */
5685     if (mtedesc) {
5686         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5687                                 mtedesc, retaddr);
5688     }
5689 
5690     flags = info.page[0].flags | info.page[1].flags;
5691     if (unlikely(flags != 0)) {
5692         /*
5693          * At least one page includes MMIO.
5694          * Any bus operation can fail with cpu_transaction_failed,
5695          * which for ARM will raise SyncExternal.  Perform the load
5696          * into scratch memory to preserve register state until the end.
5697          */
5698         ARMVectorReg scratch[4] = { };
5699 
5700         mem_off = info.mem_off_first[0];
5701         reg_off = info.reg_off_first[0];
5702         reg_last = info.reg_off_last[1];
5703         if (reg_last < 0) {
5704             reg_last = info.reg_off_split;
5705             if (reg_last < 0) {
5706                 reg_last = info.reg_off_last[0];
5707             }
5708         }
5709 
5710         do {
5711             uint64_t pg = vg[reg_off >> 6];
5712             do {
5713                 if ((pg >> (reg_off & 63)) & 1) {
5714                     for (i = 0; i < N; ++i) {
5715                         tlb_fn(env, &scratch[i], reg_off,
5716                                addr + mem_off + (i << msz), retaddr);
5717                     }
5718                 }
5719                 reg_off += 1 << esz;
5720                 mem_off += N << msz;
5721             } while (reg_off & 63);
5722         } while (reg_off <= reg_last);
5723 
5724         for (i = 0; i < N; ++i) {
5725             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5726         }
5727         return;
5728     }
5729 
5730     /* The entire operation is in RAM, on valid pages. */
5731 
5732     for (i = 0; i < N; ++i) {
5733         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5734     }
5735 
5736     mem_off = info.mem_off_first[0];
5737     reg_off = info.reg_off_first[0];
5738     reg_last = info.reg_off_last[0];
5739     host = info.page[0].host;
5740 
5741     while (reg_off <= reg_last) {
5742         uint64_t pg = vg[reg_off >> 6];
5743         do {
5744             if ((pg >> (reg_off & 63)) & 1) {
5745                 for (i = 0; i < N; ++i) {
5746                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5747                             host + mem_off + (i << msz));
5748                 }
5749             }
5750             reg_off += 1 << esz;
5751             mem_off += N << msz;
5752         } while (reg_off <= reg_last && (reg_off & 63));
5753     }
5754 
5755     /*
5756      * Use the slow path to manage the cross-page misalignment.
5757      * But we know this is RAM and cannot trap.
5758      */
5759     mem_off = info.mem_off_split;
5760     if (unlikely(mem_off >= 0)) {
5761         reg_off = info.reg_off_split;
5762         for (i = 0; i < N; ++i) {
5763             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5764                    addr + mem_off + (i << msz), retaddr);
5765         }
5766     }
5767 
5768     mem_off = info.mem_off_first[1];
5769     if (unlikely(mem_off >= 0)) {
5770         reg_off = info.reg_off_first[1];
5771         reg_last = info.reg_off_last[1];
5772         host = info.page[1].host;
5773 
5774         do {
5775             uint64_t pg = vg[reg_off >> 6];
5776             do {
5777                 if ((pg >> (reg_off & 63)) & 1) {
5778                     for (i = 0; i < N; ++i) {
5779                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5780                                 host + mem_off + (i << msz));
5781                     }
5782                 }
5783                 reg_off += 1 << esz;
5784                 mem_off += N << msz;
5785             } while (reg_off & 63);
5786         } while (reg_off <= reg_last);
5787     }
5788 }
5789 
5790 static inline QEMU_ALWAYS_INLINE
5791 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5792                    uint32_t desc, const uintptr_t ra,
5793                    const int esz, const int msz, const int N,
5794                    sve_ldst1_host_fn *host_fn,
5795                    sve_ldst1_tlb_fn *tlb_fn)
5796 {
5797     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5798     int bit55 = extract64(addr, 55, 1);
5799 
5800     /* Remove mtedesc from the normal sve descriptor. */
5801     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5802 
5803     /* Perform gross MTE suppression early. */
5804     if (!tbi_check(mtedesc, bit55) ||
5805         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
5806         mtedesc = 0;
5807     }
5808 
5809     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5810 }
5811 
5812 #define DO_LD1_1(NAME, ESZ)                                             \
5813 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
5814                             target_ulong addr, uint32_t desc)           \
5815 {                                                                       \
5816     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
5817               sve_##NAME##_host, sve_##NAME##_tlb);                     \
5818 }                                                                       \
5819 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
5820                                 target_ulong addr, uint32_t desc)       \
5821 {                                                                       \
5822     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
5823                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
5824 }
5825 
5826 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
5827 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
5828                                target_ulong addr, uint32_t desc)        \
5829 {                                                                       \
5830     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5831               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
5832 }                                                                       \
5833 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
5834                                target_ulong addr, uint32_t desc)        \
5835 {                                                                       \
5836     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5837               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
5838 }                                                                       \
5839 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
5840                                    target_ulong addr, uint32_t desc)    \
5841 {                                                                       \
5842     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5843                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
5844 }                                                                       \
5845 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
5846                                    target_ulong addr, uint32_t desc)    \
5847 {                                                                       \
5848     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5849                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
5850 }
5851 
5852 DO_LD1_1(ld1bb,  MO_8)
5853 DO_LD1_1(ld1bhu, MO_16)
5854 DO_LD1_1(ld1bhs, MO_16)
5855 DO_LD1_1(ld1bsu, MO_32)
5856 DO_LD1_1(ld1bss, MO_32)
5857 DO_LD1_1(ld1bdu, MO_64)
5858 DO_LD1_1(ld1bds, MO_64)
5859 
5860 DO_LD1_2(ld1hh,  MO_16, MO_16)
5861 DO_LD1_2(ld1hsu, MO_32, MO_16)
5862 DO_LD1_2(ld1hss, MO_32, MO_16)
5863 DO_LD1_2(ld1hdu, MO_64, MO_16)
5864 DO_LD1_2(ld1hds, MO_64, MO_16)
5865 
5866 DO_LD1_2(ld1ss,  MO_32, MO_32)
5867 DO_LD1_2(ld1sdu, MO_64, MO_32)
5868 DO_LD1_2(ld1sds, MO_64, MO_32)
5869 
5870 DO_LD1_2(ld1dd,  MO_64, MO_64)
5871 
5872 #undef DO_LD1_1
5873 #undef DO_LD1_2
5874 
5875 #define DO_LDN_1(N)                                                     \
5876 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
5877                              target_ulong addr, uint32_t desc)          \
5878 {                                                                       \
5879     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
5880               sve_ld1bb_host, sve_ld1bb_tlb);                           \
5881 }                                                                       \
5882 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
5883                                  target_ulong addr, uint32_t desc)      \
5884 {                                                                       \
5885     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
5886                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
5887 }
5888 
5889 #define DO_LDN_2(N, SUFF, ESZ)                                          \
5890 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
5891                                     target_ulong addr, uint32_t desc)   \
5892 {                                                                       \
5893     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5894               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
5895 }                                                                       \
5896 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
5897                                     target_ulong addr, uint32_t desc)   \
5898 {                                                                       \
5899     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5900               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
5901 }                                                                       \
5902 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
5903                                         target_ulong addr, uint32_t desc) \
5904 {                                                                       \
5905     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5906                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
5907 }                                                                       \
5908 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
5909                                         target_ulong addr, uint32_t desc) \
5910 {                                                                       \
5911     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5912                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
5913 }
5914 
5915 DO_LDN_1(2)
5916 DO_LDN_1(3)
5917 DO_LDN_1(4)
5918 
5919 DO_LDN_2(2, hh, MO_16)
5920 DO_LDN_2(3, hh, MO_16)
5921 DO_LDN_2(4, hh, MO_16)
5922 
5923 DO_LDN_2(2, ss, MO_32)
5924 DO_LDN_2(3, ss, MO_32)
5925 DO_LDN_2(4, ss, MO_32)
5926 
5927 DO_LDN_2(2, dd, MO_64)
5928 DO_LDN_2(3, dd, MO_64)
5929 DO_LDN_2(4, dd, MO_64)
5930 
5931 #undef DO_LDN_1
5932 #undef DO_LDN_2
5933 
5934 /*
5935  * Load contiguous data, first-fault and no-fault.
5936  *
5937  * For user-only, one could argue that we should hold the mmap_lock during
5938  * the operation so that there is no race between page_check_range and the
5939  * load operation.  However, unmapping pages out from under a running thread
5940  * is extraordinarily unlikely.  This theoretical race condition also affects
5941  * linux-user/ in its get_user/put_user macros.
5942  *
5943  * TODO: Construct some helpers, written in assembly, that interact with
5944  * host_signal_handler to produce memory ops which can properly report errors
5945  * without racing.
5946  */
5947 
5948 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
5949  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5950  * option, which leaves subsequent data unchanged.
5951  */
5952 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5953 {
5954     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5955 
5956     if (i & 63) {
5957         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5958         i = ROUND_UP(i, 64);
5959     }
5960     for (; i < oprsz; i += 64) {
5961         ffr[i / 64] = 0;
5962     }
5963 }
5964 
5965 /*
5966  * Common helper for all contiguous no-fault and first-fault loads.
5967  */
5968 static inline QEMU_ALWAYS_INLINE
5969 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5970                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5971                    const int esz, const int msz, const SVEContFault fault,
5972                    sve_ldst1_host_fn *host_fn,
5973                    sve_ldst1_tlb_fn *tlb_fn)
5974 {
5975     const unsigned rd = simd_data(desc);
5976     void *vd = &env->vfp.zregs[rd];
5977     const intptr_t reg_max = simd_oprsz(desc);
5978     intptr_t reg_off, mem_off, reg_last;
5979     SVEContLdSt info;
5980     int flags;
5981     void *host;
5982 
5983     /* Find the active elements.  */
5984     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5985         /* The entire predicate was false; no load occurs.  */
5986         memset(vd, 0, reg_max);
5987         return;
5988     }
5989     reg_off = info.reg_off_first[0];
5990 
5991     /* Probe the page(s). */
5992     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5993         /* Fault on first element. */
5994         tcg_debug_assert(fault == FAULT_NO);
5995         memset(vd, 0, reg_max);
5996         goto do_fault;
5997     }
5998 
5999     mem_off = info.mem_off_first[0];
6000     flags = info.page[0].flags;
6001 
6002     /*
6003      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6004      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6005      */
6006     if (!info.page[0].tagged) {
6007         mtedesc = 0;
6008     }
6009 
6010     if (fault == FAULT_FIRST) {
6011         /* Trapping mte check for the first-fault element.  */
6012         if (mtedesc) {
6013             mte_check(env, mtedesc, addr + mem_off, retaddr);
6014         }
6015 
6016         /*
6017          * Special handling of the first active element,
6018          * if it crosses a page boundary or is MMIO.
6019          */
6020         bool is_split = mem_off == info.mem_off_split;
6021         if (unlikely(flags != 0) || unlikely(is_split)) {
6022             /*
6023              * Use the slow path for cross-page handling.
6024              * Might trap for MMIO or watchpoints.
6025              */
6026             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6027 
6028             /* After any fault, zero the other elements. */
6029             swap_memzero(vd, reg_off);
6030             reg_off += 1 << esz;
6031             mem_off += 1 << msz;
6032             swap_memzero(vd + reg_off, reg_max - reg_off);
6033 
6034             if (is_split) {
6035                 goto second_page;
6036             }
6037         } else {
6038             memset(vd, 0, reg_max);
6039         }
6040     } else {
6041         memset(vd, 0, reg_max);
6042         if (unlikely(mem_off == info.mem_off_split)) {
6043             /* The first active element crosses a page boundary. */
6044             flags |= info.page[1].flags;
6045             if (unlikely(flags & TLB_MMIO)) {
6046                 /* Some page is MMIO, see below. */
6047                 goto do_fault;
6048             }
6049             if (unlikely(flags & TLB_WATCHPOINT) &&
6050                 (cpu_watchpoint_address_matches
6051                  (env_cpu(env), addr + mem_off, 1 << msz)
6052                  & BP_MEM_READ)) {
6053                 /* Watchpoint hit, see below. */
6054                 goto do_fault;
6055             }
6056             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6057                 goto do_fault;
6058             }
6059             /*
6060              * Use the slow path for cross-page handling.
6061              * This is RAM, without a watchpoint, and will not trap.
6062              */
6063             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6064             goto second_page;
6065         }
6066     }
6067 
6068     /*
6069      * From this point on, all memory operations are MemSingleNF.
6070      *
6071      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6072      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6073      *
6074      * Unfortuately we do not have access to the memory attributes from the
6075      * PTE to tell Device memory from Normal memory.  So we make a mostly
6076      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6077      * This gives the right answer for the common cases of "Normal memory,
6078      * backed by host RAM" and "Device memory, backed by MMIO".
6079      * The architecture allows us to suppress an NF load and return
6080      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6081      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6082      * get wrong is "Device memory, backed by host RAM", for which we
6083      * should return (UNKNOWN, FAULT) for but do not.
6084      *
6085      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6086      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6087      * architectural breakpoints the same.
6088      */
6089     if (unlikely(flags & TLB_MMIO)) {
6090         goto do_fault;
6091     }
6092 
6093     reg_last = info.reg_off_last[0];
6094     host = info.page[0].host;
6095 
6096     do {
6097         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6098         do {
6099             if ((pg >> (reg_off & 63)) & 1) {
6100                 if (unlikely(flags & TLB_WATCHPOINT) &&
6101                     (cpu_watchpoint_address_matches
6102                      (env_cpu(env), addr + mem_off, 1 << msz)
6103                      & BP_MEM_READ)) {
6104                     goto do_fault;
6105                 }
6106                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6107                     goto do_fault;
6108                 }
6109                 host_fn(vd, reg_off, host + mem_off);
6110             }
6111             reg_off += 1 << esz;
6112             mem_off += 1 << msz;
6113         } while (reg_off <= reg_last && (reg_off & 63));
6114     } while (reg_off <= reg_last);
6115 
6116     /*
6117      * MemSingleNF is allowed to fail for any reason.  We have special
6118      * code above to handle the first element crossing a page boundary.
6119      * As an implementation choice, decline to handle a cross-page element
6120      * in any other position.
6121      */
6122     reg_off = info.reg_off_split;
6123     if (reg_off >= 0) {
6124         goto do_fault;
6125     }
6126 
6127  second_page:
6128     reg_off = info.reg_off_first[1];
6129     if (likely(reg_off < 0)) {
6130         /* No active elements on the second page.  All done. */
6131         return;
6132     }
6133 
6134     /*
6135      * MemSingleNF is allowed to fail for any reason.  As an implementation
6136      * choice, decline to handle elements on the second page.  This should
6137      * be low frequency as the guest walks through memory -- the next
6138      * iteration of the guest's loop should be aligned on the page boundary,
6139      * and then all following iterations will stay aligned.
6140      */
6141 
6142  do_fault:
6143     record_fault(env, reg_off, reg_max);
6144 }
6145 
6146 static inline QEMU_ALWAYS_INLINE
6147 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6148                        uint32_t desc, const uintptr_t retaddr,
6149                        const int esz, const int msz, const SVEContFault fault,
6150                        sve_ldst1_host_fn *host_fn,
6151                        sve_ldst1_tlb_fn *tlb_fn)
6152 {
6153     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6154     int bit55 = extract64(addr, 55, 1);
6155 
6156     /* Remove mtedesc from the normal sve descriptor. */
6157     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6158 
6159     /* Perform gross MTE suppression early. */
6160     if (!tbi_check(mtedesc, bit55) ||
6161         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6162         mtedesc = 0;
6163     }
6164 
6165     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6166                   esz, msz, fault, host_fn, tlb_fn);
6167 }
6168 
/*
 * Emit the four TCG helpers for a contiguous first-fault (ldff1) and
 * no-fault (ldnf1) load whose memory element size is MO_8, each in a
 * plain and an _mte flavour.  SFX selects the sve_ld1<SFX>_host and
 * sve_ld1<SFX>_tlb element accessors; ESZ is passed through as the
 * register element size argument.
 */
#define DO_LDFF1_LDNF1_1(SFX, ESZ)                                      \
void HELPER(sve_ldff1##SFX##_r)(CPUARMState *env, void *vg,             \
                                target_ulong addr, uint32_t desc)       \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r(env, vg, addr, desc, ra, 0, ESZ, MO_8, FAULT_FIRST,   \
                  sve_ld1##SFX##_host, sve_ld1##SFX##_tlb);             \
}                                                                       \
void HELPER(sve_ldnf1##SFX##_r)(CPUARMState *env, void *vg,             \
                                target_ulong addr, uint32_t desc)       \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r(env, vg, addr, desc, ra, 0, ESZ, MO_8, FAULT_NO,      \
                  sve_ld1##SFX##_host, sve_ld1##SFX##_tlb);             \
}                                                                       \
void HELPER(sve_ldff1##SFX##_r_mte)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, ra, ESZ, MO_8, FAULT_FIRST,  \
                      sve_ld1##SFX##_host, sve_ld1##SFX##_tlb);         \
}                                                                       \
void HELPER(sve_ldnf1##SFX##_r_mte)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, ra, ESZ, MO_8, FAULT_NO,     \
                      sve_ld1##SFX##_host, sve_ld1##SFX##_tlb);         \
}
6194 
/*
 * Emit the eight TCG helpers for a contiguous first-fault (ldff1) and
 * no-fault (ldnf1) load with a multi-byte memory element: little- and
 * big-endian, each in a plain and an _mte flavour.  SFX selects the
 * sve_ld1<SFX>_{le,be}_host/_tlb element accessors; ESZ and MSZ are
 * passed through as the register and memory element size arguments.
 */
#define DO_LDFF1_LDNF1_2(SFX, ESZ, MSZ)                                 \
void HELPER(sve_ldff1##SFX##_le_r)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r(env, vg, addr, desc, ra, 0, ESZ, MSZ, FAULT_FIRST,    \
                  sve_ld1##SFX##_le_host, sve_ld1##SFX##_le_tlb);       \
}                                                                       \
void HELPER(sve_ldnf1##SFX##_le_r)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r(env, vg, addr, desc, ra, 0, ESZ, MSZ, FAULT_NO,       \
                  sve_ld1##SFX##_le_host, sve_ld1##SFX##_le_tlb);       \
}                                                                       \
void HELPER(sve_ldff1##SFX##_be_r)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r(env, vg, addr, desc, ra, 0, ESZ, MSZ, FAULT_FIRST,    \
                  sve_ld1##SFX##_be_host, sve_ld1##SFX##_be_tlb);       \
}                                                                       \
void HELPER(sve_ldnf1##SFX##_be_r)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r(env, vg, addr, desc, ra, 0, ESZ, MSZ, FAULT_NO,       \
                  sve_ld1##SFX##_be_host, sve_ld1##SFX##_be_tlb);       \
}                                                                       \
void HELPER(sve_ldff1##SFX##_le_r_mte)(CPUARMState *env, void *vg,      \
                                       target_ulong addr, uint32_t desc) \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, ra, ESZ, MSZ, FAULT_FIRST,   \
                      sve_ld1##SFX##_le_host, sve_ld1##SFX##_le_tlb);   \
}                                                                       \
void HELPER(sve_ldnf1##SFX##_le_r_mte)(CPUARMState *env, void *vg,      \
                                       target_ulong addr, uint32_t desc) \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, ra, ESZ, MSZ, FAULT_NO,      \
                      sve_ld1##SFX##_le_host, sve_ld1##SFX##_le_tlb);   \
}                                                                       \
void HELPER(sve_ldff1##SFX##_be_r_mte)(CPUARMState *env, void *vg,      \
                                       target_ulong addr, uint32_t desc) \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, ra, ESZ, MSZ, FAULT_FIRST,   \
                      sve_ld1##SFX##_be_host, sve_ld1##SFX##_be_tlb);   \
}                                                                       \
void HELPER(sve_ldnf1##SFX##_be_r_mte)(CPUARMState *env, void *vg,      \
                                       target_ulong addr, uint32_t desc) \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, ra, ESZ, MSZ, FAULT_NO,      \
                      sve_ld1##SFX##_be_host, sve_ld1##SFX##_be_tlb);   \
}
6244 
6245 DO_LDFF1_LDNF1_1(bb,  MO_8)
6246 DO_LDFF1_LDNF1_1(bhu, MO_16)
6247 DO_LDFF1_LDNF1_1(bhs, MO_16)
6248 DO_LDFF1_LDNF1_1(bsu, MO_32)
6249 DO_LDFF1_LDNF1_1(bss, MO_32)
6250 DO_LDFF1_LDNF1_1(bdu, MO_64)
6251 DO_LDFF1_LDNF1_1(bds, MO_64)
6252 
6253 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6254 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6255 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6256 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6257 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6258 
6259 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6260 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6261 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6262 
6263 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6264 
6265 #undef DO_LDFF1_LDNF1_1
6266 #undef DO_LDFF1_LDNF1_2
6267 
6268 /*
6269  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6270  */
6271 
6272 static inline QEMU_ALWAYS_INLINE
6273 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6274                uint32_t desc, const uintptr_t retaddr,
6275                const int esz, const int msz, const int N, uint32_t mtedesc,
6276                sve_ldst1_host_fn *host_fn,
6277                sve_ldst1_tlb_fn *tlb_fn)
6278 {
6279     const unsigned rd = simd_data(desc);
6280     const intptr_t reg_max = simd_oprsz(desc);
6281     intptr_t reg_off, reg_last, mem_off;
6282     SVEContLdSt info;
6283     void *host;
6284     int i, flags;
6285 
6286     /* Find the active elements.  */
6287     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6288         /* The entire predicate was false; no store occurs.  */
6289         return;
6290     }
6291 
6292     /* Probe the page(s).  Exit with exception for any invalid page. */
6293     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6294 
6295     /* Handle watchpoints for all active elements. */
6296     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6297                               BP_MEM_WRITE, retaddr);
6298 
6299     /*
6300      * Handle mte checks for all active elements.
6301      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6302      */
6303     if (mtedesc) {
6304         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6305                                 mtedesc, retaddr);
6306     }
6307 
6308     flags = info.page[0].flags | info.page[1].flags;
6309     if (unlikely(flags != 0)) {
6310 #ifdef CONFIG_USER_ONLY
6311         g_assert_not_reached();
6312 #else
6313         /*
6314          * At least one page includes MMIO.
6315          * Any bus operation can fail with cpu_transaction_failed,
6316          * which for ARM will raise SyncExternal.  We cannot avoid
6317          * this fault and will leave with the store incomplete.
6318          */
6319         mem_off = info.mem_off_first[0];
6320         reg_off = info.reg_off_first[0];
6321         reg_last = info.reg_off_last[1];
6322         if (reg_last < 0) {
6323             reg_last = info.reg_off_split;
6324             if (reg_last < 0) {
6325                 reg_last = info.reg_off_last[0];
6326             }
6327         }
6328 
6329         do {
6330             uint64_t pg = vg[reg_off >> 6];
6331             do {
6332                 if ((pg >> (reg_off & 63)) & 1) {
6333                     for (i = 0; i < N; ++i) {
6334                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6335                                addr + mem_off + (i << msz), retaddr);
6336                     }
6337                 }
6338                 reg_off += 1 << esz;
6339                 mem_off += N << msz;
6340             } while (reg_off & 63);
6341         } while (reg_off <= reg_last);
6342         return;
6343 #endif
6344     }
6345 
6346     mem_off = info.mem_off_first[0];
6347     reg_off = info.reg_off_first[0];
6348     reg_last = info.reg_off_last[0];
6349     host = info.page[0].host;
6350 
6351     while (reg_off <= reg_last) {
6352         uint64_t pg = vg[reg_off >> 6];
6353         do {
6354             if ((pg >> (reg_off & 63)) & 1) {
6355                 for (i = 0; i < N; ++i) {
6356                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6357                             host + mem_off + (i << msz));
6358                 }
6359             }
6360             reg_off += 1 << esz;
6361             mem_off += N << msz;
6362         } while (reg_off <= reg_last && (reg_off & 63));
6363     }
6364 
6365     /*
6366      * Use the slow path to manage the cross-page misalignment.
6367      * But we know this is RAM and cannot trap.
6368      */
6369     mem_off = info.mem_off_split;
6370     if (unlikely(mem_off >= 0)) {
6371         reg_off = info.reg_off_split;
6372         for (i = 0; i < N; ++i) {
6373             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6374                    addr + mem_off + (i << msz), retaddr);
6375         }
6376     }
6377 
6378     mem_off = info.mem_off_first[1];
6379     if (unlikely(mem_off >= 0)) {
6380         reg_off = info.reg_off_first[1];
6381         reg_last = info.reg_off_last[1];
6382         host = info.page[1].host;
6383 
6384         do {
6385             uint64_t pg = vg[reg_off >> 6];
6386             do {
6387                 if ((pg >> (reg_off & 63)) & 1) {
6388                     for (i = 0; i < N; ++i) {
6389                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6390                                 host + mem_off + (i << msz));
6391                     }
6392                 }
6393                 reg_off += 1 << esz;
6394                 mem_off += N << msz;
6395             } while (reg_off & 63);
6396         } while (reg_off <= reg_last);
6397     }
6398 }
6399 
6400 static inline QEMU_ALWAYS_INLINE
6401 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6402                    uint32_t desc, const uintptr_t ra,
6403                    const int esz, const int msz, const int N,
6404                    sve_ldst1_host_fn *host_fn,
6405                    sve_ldst1_tlb_fn *tlb_fn)
6406 {
6407     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6408     int bit55 = extract64(addr, 55, 1);
6409 
6410     /* Remove mtedesc from the normal sve descriptor. */
6411     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6412 
6413     /* Perform gross MTE suppression early. */
6414     if (!tbi_check(mtedesc, bit55) ||
6415         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6416         mtedesc = 0;
6417     }
6418 
6419     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6420 }
6421 
/*
 * Emit the plain and _mte TCG helpers for an N-register contiguous store
 * whose memory element size is MO_8.  NAME selects the
 * sve_st1<NAME>_host/_tlb element accessors; ESZ is passed through as
 * the register element size argument.
 */
#define DO_STN_1(N, NAME, ESZ)                                          \
void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_stN_r(env, vg, addr, desc, ra, ESZ, MO_8, N, 0,                 \
              sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
}                                                                       \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_stN_r_mte(env, vg, addr, desc, ra, ESZ, MO_8, N,                \
                  sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
}
6435 
/*
 * Emit the four TCG helpers for an N-register contiguous store with a
 * multi-byte memory element: little- and big-endian, each in a plain
 * and an _mte flavour.  NAME selects the sve_st1<NAME>_{le,be}_host/_tlb
 * element accessors; ESZ and MSZ are passed through as the register and
 * memory element size arguments.
 */
#define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_stN_r(env, vg, addr, desc, ra, ESZ, MSZ, N, 0,                  \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_stN_r(env, vg, addr, desc, ra, ESZ, MSZ, N, 0,                  \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_stN_r_mte(env, vg, addr, desc, ra, ESZ, MSZ, N,                 \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_stN_r_mte(env, vg, addr, desc, ra, ESZ, MSZ, N,                 \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
}
6461 
6462 DO_STN_1(1, bb, MO_8)
6463 DO_STN_1(1, bh, MO_16)
6464 DO_STN_1(1, bs, MO_32)
6465 DO_STN_1(1, bd, MO_64)
6466 DO_STN_1(2, bb, MO_8)
6467 DO_STN_1(3, bb, MO_8)
6468 DO_STN_1(4, bb, MO_8)
6469 
6470 DO_STN_2(1, hh, MO_16, MO_16)
6471 DO_STN_2(1, hs, MO_32, MO_16)
6472 DO_STN_2(1, hd, MO_64, MO_16)
6473 DO_STN_2(2, hh, MO_16, MO_16)
6474 DO_STN_2(3, hh, MO_16, MO_16)
6475 DO_STN_2(4, hh, MO_16, MO_16)
6476 
6477 DO_STN_2(1, ss, MO_32, MO_32)
6478 DO_STN_2(1, sd, MO_64, MO_32)
6479 DO_STN_2(2, ss, MO_32, MO_32)
6480 DO_STN_2(3, ss, MO_32, MO_32)
6481 DO_STN_2(4, ss, MO_32, MO_32)
6482 
6483 DO_STN_2(1, dd, MO_64, MO_64)
6484 DO_STN_2(2, dd, MO_64, MO_64)
6485 DO_STN_2(3, dd, MO_64, MO_64)
6486 DO_STN_2(4, dd, MO_64, MO_64)
6487 
6488 #undef DO_STN_1
6489 #undef DO_STN_2
6490 
6491 /*
6492  * Loads with a vector index.
6493  */
6494 
6495 /*
6496  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6497  */
6498 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6499 
6500 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6501 {
6502     return *(uint32_t *)(reg + H1_4(reg_ofs));
6503 }
6504 
6505 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6506 {
6507     return *(int32_t *)(reg + H1_4(reg_ofs));
6508 }
6509 
6510 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6511 {
6512     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6513 }
6514 
6515 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6516 {
6517     return (int32_t)*(uint64_t *)(reg + reg_ofs);
6518 }
6519 
6520 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6521 {
6522     return *(uint64_t *)(reg + reg_ofs);
6523 }
6524 
6525 static inline QEMU_ALWAYS_INLINE
6526 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6527                target_ulong base, uint32_t desc, uintptr_t retaddr,
6528                uint32_t mtedesc, int esize, int msize,
6529                zreg_off_fn *off_fn,
6530                sve_ldst1_host_fn *host_fn,
6531                sve_ldst1_tlb_fn *tlb_fn)
6532 {
6533     const int mmu_idx = arm_env_mmu_index(env);
6534     const intptr_t reg_max = simd_oprsz(desc);
6535     const int scale = simd_data(desc);
6536     ARMVectorReg scratch;
6537     intptr_t reg_off;
6538     SVEHostPage info, info2;
6539 
6540     memset(&scratch, 0, reg_max);
6541     reg_off = 0;
6542     do {
6543         uint64_t pg = vg[reg_off >> 6];
6544         do {
6545             if (likely(pg & 1)) {
6546                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6547                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6548 
6549                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6550                                mmu_idx, retaddr);
6551 
6552                 if (likely(in_page >= msize)) {
6553                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
6554                         cpu_check_watchpoint(env_cpu(env), addr, msize,
6555                                              info.attrs, BP_MEM_READ, retaddr);
6556                     }
6557                     if (mtedesc && info.tagged) {
6558                         mte_check(env, mtedesc, addr, retaddr);
6559                     }
6560                     if (unlikely(info.flags & TLB_MMIO)) {
6561                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
6562                     } else {
6563                         host_fn(&scratch, reg_off, info.host);
6564                     }
6565                 } else {
6566                     /* Element crosses the page boundary. */
6567                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6568                                    MMU_DATA_LOAD, mmu_idx, retaddr);
6569                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6570                         cpu_check_watchpoint(env_cpu(env), addr,
6571                                              msize, info.attrs,
6572                                              BP_MEM_READ, retaddr);
6573                     }
6574                     if (mtedesc && info.tagged) {
6575                         mte_check(env, mtedesc, addr, retaddr);
6576                     }
6577                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
6578                 }
6579             }
6580             reg_off += esize;
6581             pg >>= esize;
6582         } while (reg_off & 63);
6583     } while (reg_off < reg_max);
6584 
6585     /* Wait until all exceptions have been raised to write back.  */
6586     memcpy(vd, &scratch, reg_max);
6587 }
6588 
6589 static inline QEMU_ALWAYS_INLINE
6590 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6591                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6592                    int esize, int msize, zreg_off_fn *off_fn,
6593                    sve_ldst1_host_fn *host_fn,
6594                    sve_ldst1_tlb_fn *tlb_fn)
6595 {
6596     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6597     /* Remove mtedesc from the normal sve descriptor. */
6598     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6599 
6600     /*
6601      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6602      * offset base entirely over the address space hole to change the
6603      * pointer tag, or change the bit55 selector.  So we could here
6604      * examine TBI + TCMA like we do for sve_ldN_r_mte().
6605      */
6606     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6607               esize, msize, off_fn, host_fn, tlb_fn);
6608 }
6609 
/*
 * Emit the plain and _mte TCG helpers for a vector-index load that steps
 * through the register in 4-byte elements.  MEM selects the
 * sve_ld1<MEM>_host/_tlb accessors, OFS the off_<OFS>_s offset
 * extractor, and 1 << MSZ is the memory access size in bytes.
 */
#define DO_LD1_ZPZ_S(MEM, OFS, MSZ)                                          \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    const uintptr_t ra = GETPC();                                            \
    sve_ld1_z(env, vd, vg, vm, base, desc, ra, 0, 4, 1 << MSZ,               \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
}                                                                            \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base,          \
                                       uint32_t desc)                        \
{                                                                            \
    const uintptr_t ra = GETPC();                                            \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, ra, 4, 1 << MSZ,              \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
}
6623 
/*
 * Emit the plain and _mte TCG helpers for a vector-index load that steps
 * through the register in 8-byte elements.  MEM selects the
 * sve_ld1<MEM>_host/_tlb accessors, OFS the off_<OFS>_d offset
 * extractor, and 1 << MSZ is the memory access size in bytes.
 */
#define DO_LD1_ZPZ_D(MEM, OFS, MSZ)                                          \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    const uintptr_t ra = GETPC();                                            \
    sve_ld1_z(env, vd, vg, vm, base, desc, ra, 0, 8, 1 << MSZ,               \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
}                                                                            \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base,          \
                                       uint32_t desc)                        \
{                                                                            \
    const uintptr_t ra = GETPC();                                            \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, ra, 8, 1 << MSZ,              \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
}
6637 
6638 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6639 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6640 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6641 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6642 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6643 
6644 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6645 DO_LD1_ZPZ_S(bss, zss, MO_8)
6646 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6647 DO_LD1_ZPZ_D(bds, zss, MO_8)
6648 DO_LD1_ZPZ_D(bds, zd, MO_8)
6649 
6650 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6651 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6652 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6653 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6654 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6655 
6656 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6657 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6658 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6659 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6660 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6661 
6662 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6663 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6664 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6665 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6666 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6667 
6668 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6669 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6670 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6671 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6672 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6673 
6674 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6675 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6676 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6677 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6678 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6679 
6680 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6681 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6682 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6683 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6684 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6685 
6686 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6687 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6688 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6689 
6690 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6691 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6692 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6693 
6694 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6695 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6696 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6697 
6698 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6699 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6700 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6701 
6702 #undef DO_LD1_ZPZ_S
6703 #undef DO_LD1_ZPZ_D
6704 
6705 /* First fault loads with a vector index.  */
6706 
6707 /*
6708  * Common helpers for all gather first-faulting loads.
6709  */
6710 
6711 static inline QEMU_ALWAYS_INLINE
6712 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6713                  target_ulong base, uint32_t desc, uintptr_t retaddr,
6714                  uint32_t mtedesc, const int esz, const int msz,
6715                  zreg_off_fn *off_fn,
6716                  sve_ldst1_host_fn *host_fn,
6717                  sve_ldst1_tlb_fn *tlb_fn)
6718 {
6719     const int mmu_idx = arm_env_mmu_index(env);
6720     const intptr_t reg_max = simd_oprsz(desc);
6721     const int scale = simd_data(desc);
6722     const int esize = 1 << esz;
6723     const int msize = 1 << msz;
6724     intptr_t reg_off;
6725     SVEHostPage info;
6726     target_ulong addr, in_page;
6727     ARMVectorReg scratch;
6728 
6729     /* Skip to the first true predicate.  */
6730     reg_off = find_next_active(vg, 0, reg_max, esz);
6731     if (unlikely(reg_off >= reg_max)) {
6732         /* The entire predicate was false; no load occurs.  */
6733         memset(vd, 0, reg_max);
6734         return;
6735     }
6736 
6737     /* Protect against overlap between vd and vm. */
6738     if (unlikely(vd == vm)) {
6739         vm = memcpy(&scratch, vm, reg_max);
6740     }
6741 
6742     /*
6743      * Probe the first element, allowing faults.
6744      */
6745     addr = base + (off_fn(vm, reg_off) << scale);
6746     if (mtedesc) {
6747         mte_check(env, mtedesc, addr, retaddr);
6748     }
6749     tlb_fn(env, vd, reg_off, addr, retaddr);
6750 
6751     /* After any fault, zero the other elements. */
6752     swap_memzero(vd, reg_off);
6753     reg_off += esize;
6754     swap_memzero(vd + reg_off, reg_max - reg_off);
6755 
6756     /*
6757      * Probe the remaining elements, not allowing faults.
6758      */
6759     while (reg_off < reg_max) {
6760         uint64_t pg = vg[reg_off >> 6];
6761         do {
6762             if (likely((pg >> (reg_off & 63)) & 1)) {
6763                 addr = base + (off_fn(vm, reg_off) << scale);
6764                 in_page = -(addr | TARGET_PAGE_MASK);
6765 
6766                 if (unlikely(in_page < msize)) {
6767                     /* Stop if the element crosses a page boundary. */
6768                     goto fault;
6769                 }
6770 
6771                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6772                                mmu_idx, retaddr);
6773                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6774                     goto fault;
6775                 }
6776                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6777                     (cpu_watchpoint_address_matches
6778                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6779                     goto fault;
6780                 }
6781                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
6782                     goto fault;
6783                 }
6784 
6785                 host_fn(vd, reg_off, info.host);
6786             }
6787             reg_off += esize;
6788         } while (reg_off & 63);
6789     }
6790     return;
6791 
6792  fault:
6793     record_fault(env, reg_off, reg_max);
6794 }
6795 
6796 static inline QEMU_ALWAYS_INLINE
6797 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6798                      target_ulong base, uint32_t desc, uintptr_t retaddr,
6799                      const int esz, const int msz,
6800                      zreg_off_fn *off_fn,
6801                      sve_ldst1_host_fn *host_fn,
6802                      sve_ldst1_tlb_fn *tlb_fn)
6803 {
6804     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6805     /* Remove mtedesc from the normal sve descriptor. */
6806     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6807 
6808     /*
6809      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6810      * offset base entirely over the address space hole to change the
6811      * pointer tag, or change the bit55 selector.  So we could here
6812      * examine TBI + TCMA like we do for sve_ldN_r_mte().
6813      */
6814     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6815                 esz, msz, off_fn, host_fn, tlb_fn);
6816 }
6817 
/*
 * Emit the plain and _mte TCG helpers for a first-fault vector-index
 * load with MO_32 register elements.  MEM selects the
 * sve_ld1<MEM>_host/_tlb accessors, OFS the off_<OFS>_s offset
 * extractor, and MSZ is the memory element size.
 */
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, ra, 0, MO_32, MSZ,         \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, ra, MO_32, MSZ,        \
                    off_##OFS##_s, sve_ld1##MEM##_host,                 \
                    sve_ld1##MEM##_tlb);                                \
}
6833 
/*
 * Define HELPER(sve_ldff<MEM>_<OFS>) and its _mte twin with an element
 * size of MO_64.  MEM picks the sve_ld1<MEM>_host/_tlb accessors, OFS the
 * off_<OFS>_d offset extractor, and MSZ is forwarded as the msz argument.
 * The plain helper passes 0 as the MTE descriptor.
 */
#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)(CPUARMState *env, void *vd,          \
                                   void *vg, void *vm,                  \
                                   target_ulong base, uint32_t desc)    \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, ra, 0, MO_64, MSZ,         \
                off_##OFS##_d,                                          \
                sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);               \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)(CPUARMState *env, void *vd,    \
                                         void *vg, void *vm,            \
                                         target_ulong base,             \
                                         uint32_t desc)                 \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, ra, MO_64, MSZ,        \
                    off_##OFS##_d,                                      \
                    sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);           \
}
6849 
6850 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6851 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6852 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6853 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6854 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6855 
6856 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6857 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6858 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6859 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6860 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6861 
6862 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6863 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6864 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6865 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6866 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6867 
6868 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6869 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6870 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6871 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6872 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6873 
6874 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6875 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6876 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6877 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6878 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6879 
6880 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6881 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6882 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6883 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6884 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6885 
6886 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
6887 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
6888 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6889 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6890 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6891 
6892 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
6893 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
6894 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6895 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6896 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6897 
6898 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6899 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6900 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6901 
6902 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6903 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6904 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6905 
6906 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6907 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6908 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6909 
6910 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6911 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6912 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6913 
6914 /* Stores with a vector index.  */
6915 
6916 static inline QEMU_ALWAYS_INLINE
6917 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6918                target_ulong base, uint32_t desc, uintptr_t retaddr,
6919                uint32_t mtedesc, int esize, int msize,
6920                zreg_off_fn *off_fn,
6921                sve_ldst1_host_fn *host_fn,
6922                sve_ldst1_tlb_fn *tlb_fn)
6923 {
6924     const int mmu_idx = arm_env_mmu_index(env);
6925     const intptr_t reg_max = simd_oprsz(desc);
6926     const int scale = simd_data(desc);
6927     void *host[ARM_MAX_VQ * 4];
6928     intptr_t reg_off, i;
6929     SVEHostPage info, info2;
6930 
6931     /*
6932      * Probe all of the elements for host addresses and flags.
6933      */
6934     i = reg_off = 0;
6935     do {
6936         uint64_t pg = vg[reg_off >> 6];
6937         do {
6938             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6939             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6940 
6941             host[i] = NULL;
6942             if (likely((pg >> (reg_off & 63)) & 1)) {
6943                 if (likely(in_page >= msize)) {
6944                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6945                                    mmu_idx, retaddr);
6946                     if (!(info.flags & TLB_MMIO)) {
6947                         host[i] = info.host;
6948                     }
6949                 } else {
6950                     /*
6951                      * Element crosses the page boundary.
6952                      * Probe both pages, but do not record the host address,
6953                      * so that we use the slow path.
6954                      */
6955                     sve_probe_page(&info, false, env, addr, 0,
6956                                    MMU_DATA_STORE, mmu_idx, retaddr);
6957                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6958                                    MMU_DATA_STORE, mmu_idx, retaddr);
6959                     info.flags |= info2.flags;
6960                 }
6961 
6962                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6963                     cpu_check_watchpoint(env_cpu(env), addr, msize,
6964                                          info.attrs, BP_MEM_WRITE, retaddr);
6965                 }
6966 
6967                 if (mtedesc && info.tagged) {
6968                     mte_check(env, mtedesc, addr, retaddr);
6969                 }
6970             }
6971             i += 1;
6972             reg_off += esize;
6973         } while (reg_off & 63);
6974     } while (reg_off < reg_max);
6975 
6976     /*
6977      * Now that we have recognized all exceptions except SyncExternal
6978      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6979      *
6980      * Note for the common case of an element in RAM, not crossing a page
6981      * boundary, we have stored the host address in host[].  This doubles
6982      * as a first-level check against the predicate, since only enabled
6983      * elements have non-null host addresses.
6984      */
6985     i = reg_off = 0;
6986     do {
6987         void *h = host[i];
6988         if (likely(h != NULL)) {
6989             host_fn(vd, reg_off, h);
6990         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6991             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6992             tlb_fn(env, vd, reg_off, addr, retaddr);
6993         }
6994         i += 1;
6995         reg_off += esize;
6996     } while (reg_off < reg_max);
6997 }
6998 
6999 static inline QEMU_ALWAYS_INLINE
7000 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7001                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7002                    int esize, int msize, zreg_off_fn *off_fn,
7003                    sve_ldst1_host_fn *host_fn,
7004                    sve_ldst1_tlb_fn *tlb_fn)
7005 {
7006     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7007     /* Remove mtedesc from the normal sve descriptor. */
7008     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7009 
7010     /*
7011      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7012      * offset base entirely over the address space hole to change the
7013      * pointer tag, or change the bit55 selector.  So we could here
7014      * examine TBI + TCMA like we do for sve_ldN_r_mte().
7015      */
7016     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7017               esize, msize, off_fn, host_fn, tlb_fn);
7018 }
7019 
/*
 * Define HELPER(sve_st<MEM>_<OFS>) and its _mte twin for 4-byte register
 * elements.  MEM picks the sve_st1<MEM>_host/_tlb accessors, OFS the
 * off_<OFS>_s offset extractor; the memory access size is 1 << MSZ bytes.
 * The plain helper passes 0 as the MTE descriptor.
 */
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd,            \
                                 void *vg, void *vm,                    \
                                 target_ulong base, uint32_t desc)      \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, ra, 0, 4, 1 << MSZ,          \
              off_##OFS##_s,                                            \
              sve_st1##MEM##_host, sve_st1##MEM##_tlb);                 \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd,      \
                                       void *vg, void *vm,              \
                                       target_ulong base,               \
                                       uint32_t desc)                   \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, ra, 4, 1 << MSZ,         \
                  off_##OFS##_s,                                        \
                  sve_st1##MEM##_host, sve_st1##MEM##_tlb);             \
}
7033 
/*
 * Define HELPER(sve_st<MEM>_<OFS>) and its _mte twin for 8-byte register
 * elements.  MEM picks the sve_st1<MEM>_host/_tlb accessors, OFS the
 * off_<OFS>_d offset extractor; the memory access size is 1 << MSZ bytes.
 * The plain helper passes 0 as the MTE descriptor.
 */
#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd,            \
                                 void *vg, void *vm,                    \
                                 target_ulong base, uint32_t desc)      \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, ra, 0, 8, 1 << MSZ,          \
              off_##OFS##_d,                                            \
              sve_st1##MEM##_host, sve_st1##MEM##_tlb);                 \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd,      \
                                       void *vg, void *vm,              \
                                       target_ulong base,               \
                                       uint32_t desc)                   \
{                                                                       \
    const uintptr_t ra = GETPC();                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, ra, 8, 1 << MSZ,         \
                  off_##OFS##_d,                                        \
                  sve_st1##MEM##_host, sve_st1##MEM##_tlb);             \
}
7047 
7048 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7049 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7050 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7051 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7052 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7053 
7054 DO_ST1_ZPZ_S(bs, zss, MO_8)
7055 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7056 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7057 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7058 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7059 
7060 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7061 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7062 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7063 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7064 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7065 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7066 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7067 
7068 DO_ST1_ZPZ_D(bd, zss, MO_8)
7069 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7070 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7071 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7072 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7073 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7074 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7075 
7076 DO_ST1_ZPZ_D(bd, zd, MO_8)
7077 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7078 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7079 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7080 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7081 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7082 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7083 
7084 #undef DO_ST1_ZPZ_S
7085 #undef DO_ST1_ZPZ_D
7086 
7087 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7088 {
7089     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7090     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7091 
7092     for (i = 0; i < opr_sz; ++i) {
7093         d[i] = n[i] ^ m[i] ^ k[i];
7094     }
7095 }
7096 
7097 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7098 {
7099     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7100     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7101 
7102     for (i = 0; i < opr_sz; ++i) {
7103         d[i] = n[i] ^ (m[i] & ~k[i]);
7104     }
7105 }
7106 
7107 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7108 {
7109     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7110     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7111 
7112     for (i = 0; i < opr_sz; ++i) {
7113         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7114     }
7115 }
7116 
7117 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7118 {
7119     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7120     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7121 
7122     for (i = 0; i < opr_sz; ++i) {
7123         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7124     }
7125 }
7126 
7127 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7128 {
7129     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7130     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7131 
7132     for (i = 0; i < opr_sz; ++i) {
7133         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7134     }
7135 }
7136 
/*
 * Test whether the low element of n (uint8_t for MO_8, uint16_t for
 * MO_16) occurs anywhere in the 16 bytes formed by m0 and m1.
 *
 * The element is replicated across a word and XORed with each of m0/m1,
 * so a matching lane becomes zero; zero lanes are then detected with the
 * hasless(v,1) trick from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    const uint64_t lane_lsb = dup_const(esz, 1);
    const uint64_t lane_msb = lane_lsb << ((8 << esz) - 1);
    const uint64_t needle = dup_const(esz, n);
    const uint64_t diff0 = needle ^ m0;
    const uint64_t diff1 = needle ^ m1;
    /* The lane msb is set in zeroN where a lane of diffN is zero. */
    const uint64_t zero0 = (diff0 - lane_lsb) & ~diff0;
    const uint64_t zero1 = (diff1 - lane_lsb) & ~diff1;

    return ((zero0 | zero1) & lane_msb) != 0;
}
7156 
7157 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7158                                 uint32_t desc, int esz, bool nmatch)
7159 {
7160     uint16_t esz_mask = pred_esz_masks[esz];
7161     intptr_t opr_sz = simd_oprsz(desc);
7162     uint32_t flags = PREDTEST_INIT;
7163     intptr_t i, j, k;
7164 
7165     for (i = 0; i < opr_sz; i += 16) {
7166         uint64_t m0 = *(uint64_t *)(vm + i);
7167         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7168         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7169         uint16_t out = 0;
7170 
7171         for (j = 0; j < 16; j += 8) {
7172             uint64_t n = *(uint64_t *)(vn + i + j);
7173 
7174             for (k = 0; k < 8; k += 1 << esz) {
7175                 if (pg & (1 << (j + k))) {
7176                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
7177                     out |= (o ^ nmatch) << (j + k);
7178                 }
7179             }
7180         }
7181         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7182         flags = iter_predtest_fwd(out, pg, flags);
7183     }
7184     return flags;
7185 }
7186 
7187 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7188 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7189 {                                                                             \
7190     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7191 }
7192 
7193 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7194 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7195 
7196 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7197 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7198 
7199 #undef DO_PPZZ_MATCH
7200 
7201 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7202                             uint32_t desc)
7203 {
7204     ARMVectorReg scratch;
7205     intptr_t i, j;
7206     intptr_t opr_sz = simd_oprsz(desc);
7207     uint32_t *d = vd, *n = vn, *m = vm;
7208     uint8_t *pg = vg;
7209 
7210     if (d == n) {
7211         n = memcpy(&scratch, n, opr_sz);
7212         if (d == m) {
7213             m = n;
7214         }
7215     } else if (d == m) {
7216         m = memcpy(&scratch, m, opr_sz);
7217     }
7218 
7219     for (i = 0; i < opr_sz; i += 4) {
7220         uint64_t count = 0;
7221         uint8_t pred;
7222 
7223         pred = pg[H1(i >> 3)] >> (i & 7);
7224         if (pred & 1) {
7225             uint32_t nn = n[H4(i >> 2)];
7226 
7227             for (j = 0; j <= i; j += 4) {
7228                 pred = pg[H1(j >> 3)] >> (j & 7);
7229                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7230                     ++count;
7231                 }
7232             }
7233         }
7234         d[H4(i >> 2)] = count;
7235     }
7236 }
7237 
7238 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7239                             uint32_t desc)
7240 {
7241     ARMVectorReg scratch;
7242     intptr_t i, j;
7243     intptr_t opr_sz = simd_oprsz(desc);
7244     uint64_t *d = vd, *n = vn, *m = vm;
7245     uint8_t *pg = vg;
7246 
7247     if (d == n) {
7248         n = memcpy(&scratch, n, opr_sz);
7249         if (d == m) {
7250             m = n;
7251         }
7252     } else if (d == m) {
7253         m = memcpy(&scratch, m, opr_sz);
7254     }
7255 
7256     for (i = 0; i < opr_sz / 8; ++i) {
7257         uint64_t count = 0;
7258         if (pg[H1(i)] & 1) {
7259             uint64_t nn = n[i];
7260             for (j = 0; j <= i; ++j) {
7261                 if ((pg[H1(j)] & 1) && nn == m[j]) {
7262                     ++count;
7263                 }
7264             }
7265         }
7266         d[i] = count;
7267     }
7268 }
7269 
7270 /*
7271  * Returns the number of bytes in m0 and m1 that match n.
7272  * Unlike do_match2 we don't just need true/false, we need an exact count.
7273  * This requires two extra logical operations.
7274  */
7275 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7276 {
7277     const uint64_t mask = dup_const(MO_8, 0x7f);
7278     uint64_t cmp0, cmp1;
7279 
7280     cmp1 = dup_const(MO_8, n);
7281     cmp0 = cmp1 ^ m0;
7282     cmp1 = cmp1 ^ m1;
7283 
7284     /*
7285      * 1: clear msb of each byte to avoid carry to next byte (& mask)
7286      * 2: carry in to msb if byte != 0 (+ mask)
7287      * 3: set msb if cmp has msb set (| cmp)
7288      * 4: set ~msb to ignore them (| mask)
7289      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7290      * 5: invert, resulting in 0x80 if and only if byte == 0.
7291      */
7292     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7293     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7294 
7295     /*
7296      * Combine the two compares in a way that the bits do
7297      * not overlap, and so preserves the count of set bits.
7298      * If the host has an efficient instruction for ctpop,
7299      * then ctpop(x) + ctpop(y) has the same number of
7300      * operations as ctpop(x | (y >> 1)).  If the host does
7301      * not have an efficient ctpop, then we only want to
7302      * use it once.
7303      */
7304     return ctpop64(cmp0 | (cmp1 >> 1));
7305 }
7306 
7307 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7308 {
7309     intptr_t i, j;
7310     intptr_t opr_sz = simd_oprsz(desc);
7311 
7312     for (i = 0; i < opr_sz; i += 16) {
7313         uint64_t n0 = *(uint64_t *)(vn + i);
7314         uint64_t m0 = *(uint64_t *)(vm + i);
7315         uint64_t n1 = *(uint64_t *)(vn + i + 8);
7316         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7317         uint64_t out0 = 0;
7318         uint64_t out1 = 0;
7319 
7320         for (j = 0; j < 64; j += 8) {
7321             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7322             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7323             out0 |= cnt0 << j;
7324             out1 |= cnt1 << j;
7325         }
7326 
7327         *(uint64_t *)(vd + i) = out0;
7328         *(uint64_t *)(vd + i + 8) = out1;
7329     }
7330 }
7331 
7332 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7333 {
7334     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7335     int shr = simd_data(desc);
7336     int shl = 8 - shr;
7337     uint64_t mask = dup_const(MO_8, 0xff >> shr);
7338     uint64_t *d = vd, *n = vn, *m = vm;
7339 
7340     for (i = 0; i < opr_sz; ++i) {
7341         uint64_t t = n[i] ^ m[i];
7342         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7343     }
7344 }
7345 
7346 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7347 {
7348     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7349     int shr = simd_data(desc);
7350     int shl = 16 - shr;
7351     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7352     uint64_t *d = vd, *n = vn, *m = vm;
7353 
7354     for (i = 0; i < opr_sz; ++i) {
7355         uint64_t t = n[i] ^ m[i];
7356         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7357     }
7358 }
7359 
7360 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7361 {
7362     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7363     int shr = simd_data(desc);
7364     uint32_t *d = vd, *n = vn, *m = vm;
7365 
7366     for (i = 0; i < opr_sz; ++i) {
7367         d[i] = ror32(n[i] ^ m[i], shr);
7368     }
7369 }
7370 
7371 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7372                      void *status, uint32_t desc)
7373 {
7374     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7375 
7376     for (s = 0; s < opr_sz; ++s) {
7377         float32 *n = vn + s * sizeof(float32) * 4;
7378         float32 *m = vm + s * sizeof(float32) * 4;
7379         float32 *a = va + s * sizeof(float32) * 4;
7380         float32 *d = vd + s * sizeof(float32) * 4;
7381         float32 n00 = n[H4(0)], n01 = n[H4(1)];
7382         float32 n10 = n[H4(2)], n11 = n[H4(3)];
7383         float32 m00 = m[H4(0)], m01 = m[H4(1)];
7384         float32 m10 = m[H4(2)], m11 = m[H4(3)];
7385         float32 p0, p1;
7386 
7387         /* i = 0, j = 0 */
7388         p0 = float32_mul(n00, m00, status);
7389         p1 = float32_mul(n01, m01, status);
7390         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7391 
7392         /* i = 0, j = 1 */
7393         p0 = float32_mul(n00, m10, status);
7394         p1 = float32_mul(n01, m11, status);
7395         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7396 
7397         /* i = 1, j = 0 */
7398         p0 = float32_mul(n10, m00, status);
7399         p1 = float32_mul(n11, m01, status);
7400         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7401 
7402         /* i = 1, j = 1 */
7403         p0 = float32_mul(n10, m10, status);
7404         p1 = float32_mul(n11, m11, status);
7405         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7406     }
7407 }
7408 
7409 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7410                      void *status, uint32_t desc)
7411 {
7412     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7413 
7414     for (s = 0; s < opr_sz; ++s) {
7415         float64 *n = vn + s * sizeof(float64) * 4;
7416         float64 *m = vm + s * sizeof(float64) * 4;
7417         float64 *a = va + s * sizeof(float64) * 4;
7418         float64 *d = vd + s * sizeof(float64) * 4;
7419         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7420         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7421         float64 p0, p1;
7422 
7423         /* i = 0, j = 0 */
7424         p0 = float64_mul(n00, m00, status);
7425         p1 = float64_mul(n01, m01, status);
7426         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7427 
7428         /* i = 0, j = 1 */
7429         p0 = float64_mul(n00, m10, status);
7430         p1 = float64_mul(n01, m11, status);
7431         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7432 
7433         /* i = 1, j = 0 */
7434         p0 = float64_mul(n10, m00, status);
7435         p1 = float64_mul(n11, m01, status);
7436         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7437 
7438         /* i = 1, j = 1 */
7439         p0 = float64_mul(n10, m10, status);
7440         p1 = float64_mul(n11, m11, status);
7441         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7442     }
7443 }
7444 
7445 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7446 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7447 {                                                                             \
7448     intptr_t i = simd_oprsz(desc);                                            \
7449     uint64_t *g = vg;                                                         \
7450     do {                                                                      \
7451         uint64_t pg = g[(i - 1) >> 6];                                        \
7452         do {                                                                  \
7453             i -= sizeof(TYPEW);                                               \
7454             if (likely((pg >> (i & 63)) & 1)) {                               \
7455                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7456                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7457             }                                                                 \
7458         } while (i & 63);                                                     \
7459     } while (i != 0);                                                         \
7460 }
7461 
7462 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7463 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7464 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7465 
7466 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7467 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7468 {                                                                             \
7469     intptr_t i = simd_oprsz(desc);                                            \
7470     uint64_t *g = vg;                                                         \
7471     do {                                                                      \
7472         uint64_t pg = g[(i - 1) >> 6];                                        \
7473         do {                                                                  \
7474             i -= sizeof(TYPEW);                                               \
7475             if (likely((pg >> (i & 63)) & 1)) {                               \
7476                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7477                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7478             }                                                                 \
7479         } while (i & 63);                                                     \
7480     } while (i != 0);                                                         \
7481 }
7482 
7483 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7484 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7485 
7486 #undef DO_FCVTLT
7487 #undef DO_FCVTNT
7488