/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/page-protection.h"
#include "exec/helper-proto.h"
#include "exec/target_page.h"
#include "exec/tlb-flags.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"
#include "accel/tcg/cpu-ops.h"
#ifdef CONFIG_USER_ONLY
#include "user/page-protection.h"
#endif


/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
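
/*
 * A minimal sketch (hypothetical helper, not used elsewhere in this file)
 * of how the PredTest result above can be unpacked into individual
 * condition flags, using the bit positions documented above.  Bit 2 is
 * internal bookkeeping from the iterators and may remain set in the
 * returned value; it is ignored here.
 */
static inline void example_unpack_predtest(uint32_t flags,
                                           bool *n, bool *z, bool *c)
{
    *n = (flags >> 31) & 1;   /* first active element was true */
    *z = !(flags & 2);        /* no active element was true */
    *c = flags & 1;           /* last active element was false */
}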

/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}
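
/*
 * As a concrete illustration of the table above, with the predicate byte
 * already masked down to the two bits that govern 32-bit elements
 * (bits 0 and 4):
 *
 *   expand_pred_s(0x01) == 0x00000000ffffffffull   (low element active)
 *   expand_pred_s(0x10) == 0xffffffff00000000ull   (high element active)
 *   expand_pred_s(0x11) == 0xffffffffffffffffull   (both elements active)
 *   expand_pred_s(0x00) == 0                       (neither element active)
 */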

#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i], mm = m[i];                          \
            d[i] = OP(nn, mm);                                  \
        }                                                       \
    }                                                           \
}
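
/*
 * The governing predicate has one bit for every byte of the vector, and
 * an element of size sizeof(TYPE) owns sizeof(TYPE) of those bits, of
 * which only the lowest is significant.  The expanders above therefore
 * fetch the 16 predicate bits that govern one 16-byte granule, test bit 0
 * before each element, and shift right by the element size to advance.
 * For 4-byte elements, for example, the bits tested are predicate bits
 * 0, 4, 8 and 12 of each granule.  A minimal sketch (hypothetical helper,
 * not used elsewhere) of the same rule for a single 4-byte element:
 */
static inline bool example_pred_elt_active_s(const uint8_t *pg, intptr_t elt)
{
    intptr_t bit = elt * 4;                  /* one predicate bit per byte */
    return (pg[H1(bit / 8)] >> (bit % 8)) & 1;
}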

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)


/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
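
/*
 * For reference, the architected corner cases these definitions produce
 * (shown for the 32-bit helpers): any division by zero yields 0, and
 * INT32_MIN / -1 is computed as -INT32_MIN, which wraps back to
 * INT32_MIN when the result is truncated to the element type, matching
 * the Arm SDIV overflow result.
 */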

DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
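
/*
 * A quick illustration of why the widened computation type lets the
 * do_mulh_* helpers serve both signednesses: the instantiations below
 * pass either int8_t or uint8_t element values, and both ranges are
 * exactly representable in int32_t, so (n * m) >> 8 is the correct high
 * half either way.  For example:
 *
 *   signed:    do_mulh_b(-128, 127) -> -16256 >> 8 = -64  -> 0xc0
 *   unsigned:  do_mulh_b(128, 255)  -> 32640 >> 8  = 127  -> 0x7f
 */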

DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
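
/*
 * Concrete examples of the out-of-range behaviour for 8-bit elements:
 *
 *   DO_ASR((int8_t)-4, 200)   -> -4 >> 7 = -1   (sign bit replicated)
 *   DO_LSR((uint8_t)0x80, 9)  -> 0              (all bits shifted out)
 *   DO_LSL((uint8_t)0x80, 9)  -> 0
 *
 * The stores in the expanders then truncate the result back to the
 * element type.
 */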

DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)

static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)

static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

#define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

#define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)

/*
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
 */
#define do_sqshl_b(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
   ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)

#define do_uqshl_b(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
   ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)

#define do_sqrshl_b(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
   ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)

#undef do_sqrshl_d

#define do_uqrshl_b(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
   ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)

#undef do_uqrshl_d

#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
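
/*
 * The BHS forms can simply widen to int64_t; the _D forms cannot, so they
 * use the usual carry-save identities instead:
 *
 *   (n + m) >> 1  ==  (n >> 1) + (m >> 1) + (n & m & 1)
 *   (n - m) >> 1  ==  (n >> 1) - (m >> 1) - (~n & m & 1)
 *
 * For example DO_HADD_D(UINT64_MAX, UINT64_MAX) evaluates to UINT64_MAX
 * without the intermediate sum ever overflowing 64 bits.
 */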

static inline int64_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    return val >= max ? max : val <= min ? min : val;
}

#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
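
/*
 * The overflow test above reads: the result differs in sign from n
 * (r ^ n is negative) even though n and m had the same sign (n ^ m is
 * non-negative).  As a cross-check, a minimal sketch (hypothetical, not
 * used by the helpers) of the same condition written with explicit
 * range comparisons:
 */
static inline bool example_sadd_overflows(int64_t n, int64_t m)
{
    if (m > 0) {
        return n > INT64_MAX - m;   /* would wrap past INT64_MAX */
    }
    return n < INT64_MIN - m;       /* would wrap past INT64_MIN */
}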

DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t r = n + m;
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}
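
/*
 * A minimal sketch (hypothetical, not used by the helpers) of the same
 * saturation rule as do_usqadd_d() written with explicit range checks,
 * as a cross-check of the wrapping form above.  For example
 * do_usqadd_d(5, -9) is 0 and do_usqadd_d(UINT64_MAX, 1) is UINT64_MAX.
 */
static inline uint64_t example_usqadd_d(uint64_t n, int64_t m)
{
    if (m < 0) {
        uint64_t dec = -(uint64_t)m;   /* well defined even for INT64_MIN */
        return dec > n ? 0 : n - dec;
    }
    return n > UINT64_MAX - (uint64_t)m ? UINT64_MAX : n + (uint64_t)m;
}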

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D

/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements are from VN {I, I+1}.
 * If the slot I is odd, the elements are from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            TYPE n0 = *(TYPE *)(vn + H(i));                     \
            TYPE m0 = *(TYPE *)(vm + H(i));                     \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 2) {                           \
        TYPE n0 = n[i], n1 = n[i + 1];                          \
        TYPE m0 = m[i], m1 = m[i + 1];                          \
        if (pg[H1(i)] & 1) {                                    \
            d[i] = OP(n0, n1);                                  \
        }                                                       \
        if (pg[H1(i + 1)] & 1) {                                \
            d[i + 1] = OP(m0, m1);                              \
        }                                                       \
    }                                                           \
}
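
/*
 * Laid out for the first four 32-bit elements, the pairing above gives
 * (writing "op" for the expanded operation):
 *
 *   d[0] = op(n[0], n[1])      d[2] = op(n[2], n[3])
 *   d[1] = op(m[0], m[1])      d[3] = op(m[2], m[3])
 *
 * Both inputs of a pair are read before either output element of that
 * pair is written, which is what allows D to overlap N or M.
 */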

DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D

#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  float_status *status, uint32_t desc)                  \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            TYPE n0 = *(TYPE *)(vn + H(i));                             \
            TYPE m0 = *(TYPE *)(vm + H(i));                             \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}

DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
        TYPEW mm = *(TYPEW *)(vm + i);                                  \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 7);                                                \
    }                                                                   \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW

/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}

#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N))
#define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N))
#define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N))

DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
#define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
#define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N))

DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 2) {
        if (pg[H1(i)] & 1) {
            uint64_t n0 = n[i + 0];
            uint64_t n1 = n[i + 1];
            d[i + 0] = n1;
            d[i + 1] = n0;
        }
    }
}

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)

/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; ) {                                \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
            i += sizeof(TYPE);                                 \
        } while (i & 7);                                       \
    }                                                          \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D

/*
 * Three-operand expander, unpredicated, in which the two inputs are
 * selected from the top or bottom half of the wide column.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
    }                                                                   \
}
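
/*
 * The desc bits give sel1/sel2 as 0 or sizeof(TYPEN), so for output
 * element k the two inputs are the narrow elements at byte offsets
 * k * sizeof(TYPEW) + sel1 and + sel2.  For the instantiations below,
 * the "B" (bottom) forms use the even-numbered narrow elements
 * (sel == 0) and the "T" (top) forms use the odd-numbered ones
 * (sel == sizeof(TYPEN)).
 */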

DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

/* Note that the multiply cannot overflow, but the doubling can. */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}

static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}
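
/*
 * Worked example for do_sqdmull_h(): the inputs are really int8_t values
 * widened by DO_ZZZ_TB, so with the extreme case n = m = INT8_MIN the
 * product fits and only the doubling saturates:
 *
 *   val = -128 * -128 = 16384
 *   DO_SQADD_H(16384, 16384) -> 32767 = INT16_MAX
 *
 * which is the architected SQDMULL result for that input pair.
 */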

DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB

#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
    }                                                          \
}

DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB

#define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
        TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
        TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
    }                                                                   \
}

DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB

#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
        TYPEW aa = *(TYPEW *)(va + HW(i));                      \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
    }                                                           \
}

DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#define DO_NMUL(N, M)  -(N * M)

DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC

#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
        TYPE nn = *(TYPE *)(vn + i);                         \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
        *(TYPE *)(vd + i) = nn;                              \
    }                                                        \
}

#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE nn = *(TYPE *)(vn + i);                                    \
        *(TYPEN *)(vd + i + odd) = OP(nn);                              \
    }                                                                   \
}

#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT

void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);
        /* Compute and store the entire 33-bit result at once. */
        d[i] = c + e1 + e2;
    }
}

void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
}
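
/*
 * In both ADCL helpers the carry-in for a lane is taken from the odd
 * half of M, and the widened sum is stored back whole so that the new
 * carry lands in the odd half of D.  For sve2_adcl_s, for example, with
 * e1 = 0xffffffff, e2 = 0 and carry-in 1, d[i] becomes 0x100000000:
 * result word 0 in the low half and carry-out 1 in the high half.
 */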
1295 
1296 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1297 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1298 {                                                                       \
1299     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1300     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1301     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1302     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1303         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1304         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1305         TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1306         *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1307     }                                                                   \
1308 }
1309 
1310 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1311            do_sqdmull_h, DO_SQADD_H)
1312 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1313            do_sqdmull_s, DO_SQADD_S)
1314 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1315            do_sqdmull_d, do_sqadd_d)
1316 
1317 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1318            do_sqdmull_h, DO_SQSUB_H)
1319 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1320            do_sqdmull_s, DO_SQSUB_S)
1321 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1322            do_sqdmull_d, do_sqsub_d)
1323 
1324 #undef DO_SQDMLAL
1325 
1326 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1327 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1328 {                                                               \
1329     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
1330     int rot = simd_data(desc);                                  \
1331     int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
1332     bool sub_r = rot == 1 || rot == 2;                          \
1333     bool sub_i = rot >= 2;                                      \
1334     TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
1335     for (i = 0; i < opr_sz; i += 2) {                           \
1336         TYPE elt1_a = n[H(i + sel_a)];                          \
1337         TYPE elt2_a = m[H(i + sel_a)];                          \
1338         TYPE elt2_b = m[H(i + sel_b)];                          \
1339         d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
1340         d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
1341     }                                                           \
1342 }
1343 
1344 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
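/*
 * For reference, the rot encoding above expands as follows (writing the
 * complex pairs as (r, i); derived directly from sel_a/sub_r/sub_i):
 *   rot=0:  d_r = a_r + n_r*m_r;   d_i = a_i + n_r*m_i
 *   rot=1:  d_r = a_r - n_i*m_i;   d_i = a_i + n_i*m_r
 *   rot=2:  d_r = a_r - n_r*m_r;   d_i = a_i - n_r*m_i
 *   rot=3:  d_r = a_r + n_i*m_i;   d_i = a_i - n_i*m_r
 * Each invocation therefore accumulates half of a complex multiply;
 * issuing the rot=0 and rot=1 forms back to back yields the full
 * complex multiply-accumulate.
 */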
1345 
1346 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1347 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1348 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1349 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1350 
1351 #define DO_SQRDMLAH_B(N, M, A, S) \
1352     do_sqrdmlah_b(N, M, A, S, true)
1353 #define DO_SQRDMLAH_H(N, M, A, S) \
1354     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1355 #define DO_SQRDMLAH_S(N, M, A, S) \
1356     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1357 #define DO_SQRDMLAH_D(N, M, A, S) \
1358     do_sqrdmlah_d(N, M, A, S, true)
1359 
1360 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1361 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1362 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1363 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1364 
1365 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1366 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1367 {                                                                           \
1368     intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1369     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1370     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1371     int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1372     bool sub_r = rot == 1 || rot == 2;                                      \
1373     bool sub_i = rot >= 2;                                                  \
1374     TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1375     for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1376         TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1377         TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1378         for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1379             TYPE elt1_a = n[H(i + j + sel_a)];                              \
1380             d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);          \
1381             d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);  \
1382         }                                                                   \
1383     }                                                                       \
1384 }
1385 
1386 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1387 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1388 
1389 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1390 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1391 
1392 #undef DO_CMLA
1393 #undef DO_CMLA_FUNC
1394 #undef DO_CMLA_IDX_FUNC
1395 #undef DO_SQRDMLAH_B
1396 #undef DO_SQRDMLAH_H
1397 #undef DO_SQRDMLAH_S
1398 #undef DO_SQRDMLAH_D
1399 
1400 /* Note N and M are 4 elements bundled into one unit. */
1401 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1402                          int sel_a, int sel_b, int sub_i)
1403 {
1404     for (int i = 0; i <= 1; i++) {
1405         int32_t elt1_r = (int8_t)(n >> (16 * i));
1406         int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1407         int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1408         int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1409 
1410         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1411     }
1412     return a;
1413 }
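/*
 * For the two rotations most easily read off the code above (writing each
 * complex pair of N as (n_r, n_i) and of M as (m_r, m_i)):
 *   rot=0 (sel_a=0, sub_i=-1):  a += n_r*m_r - n_i*m_i
 *   rot=1 (sel_a=1, sub_i=+1):  a += n_r*m_i + n_i*m_r
 * i.e. the real and imaginary parts of the complex product; rot=2 and
 * rot=3 select the remaining sign/operand combinations in the same way.
 */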
1414 
1415 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1416                          int sel_a, int sel_b, int sub_i)
1417 {
1418     for (int i = 0; i <= 1; i++) {
1419         int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1420         int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1421         int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1422         int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1423 
1424         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1425     }
1426     return a;
1427 }
1428 
1429 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1430                               void *va, uint32_t desc)
1431 {
1432     int opr_sz = simd_oprsz(desc);
1433     int rot = simd_data(desc);
1434     int sel_a = rot & 1;
1435     int sel_b = sel_a ^ 1;
1436     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1437     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1438 
1439     for (int e = 0; e < opr_sz / 4; e++) {
1440         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1441     }
1442 }
1443 
1444 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1445                               void *va, uint32_t desc)
1446 {
1447     int opr_sz = simd_oprsz(desc);
1448     int rot = simd_data(desc);
1449     int sel_a = rot & 1;
1450     int sel_b = sel_a ^ 1;
1451     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1452     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1453 
1454     for (int e = 0; e < opr_sz / 8; e++) {
1455         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1456     }
1457 }
1458 
1459 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1460                              void *va, uint32_t desc)
1461 {
1462     int opr_sz = simd_oprsz(desc);
1463     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1464     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1465     int sel_a = rot & 1;
1466     int sel_b = sel_a ^ 1;
1467     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1468     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1469 
1470     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1471         uint32_t seg_m = m[seg + idx];
1472         for (int e = 0; e < 4; e++) {
1473             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1474                                    sel_a, sel_b, sub_i);
1475         }
1476     }
1477 }
1478 
1479 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1480                              void *va, uint32_t desc)
1481 {
1482     int seg, opr_sz = simd_oprsz(desc);
1483     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1484     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1485     int sel_a = rot & 1;
1486     int sel_b = sel_a ^ 1;
1487     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1488     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1489 
1490     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1491         uint64_t seg_m = m[seg + idx];
1492         for (int e = 0; e < 2; e++) {
1493             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1494                                    sel_a, sel_b, sub_i);
1495         }
1496     }
1497 }
1498 
1499 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1500 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1501 {                                                                       \
1502     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1503     intptr_t i, j, idx = simd_data(desc);                               \
1504     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1505     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1506         TYPE mm = m[i];                                                 \
1507         for (j = 0; j < segment; j++) {                                 \
1508             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1509         }                                                               \
1510     }                                                                   \
1511 }
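/*
 * Note on the indexed (by-element) expanders such as DO_ZZXZ: idx selects
 * one element within each 128-bit segment of the M operand, and that
 * element is broadcast across the segment.  E.g. with 16-bit elements
 * (segment == 8) and idx == 3, destination elements 0..7 all use M
 * element 3 and elements 8..15 use M element 11.  This only restates the
 * loop above.
 */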
1512 
1513 #define DO_SQRDMLAH_H(N, M, A) \
1514     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1515 #define DO_SQRDMLAH_S(N, M, A) \
1516     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1517 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1518 
1519 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1520 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1521 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1522 
1523 #define DO_SQRDMLSH_H(N, M, A) \
1524     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1525 #define DO_SQRDMLSH_S(N, M, A) \
1526     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1527 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1528 
1529 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1530 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1531 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1532 
1533 #undef DO_ZZXZ
1534 
1535 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1536 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1537 {                                                                         \
1538     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1539     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1540     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1541     for (i = 0; i < oprsz; i += 16) {                                     \
1542         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1543         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1544             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1545             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1546             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1547         }                                                                 \
1548     }                                                                     \
1549 }
1550 
1551 #define DO_MLA(N, M, A)  (A + N * M)
1552 
1553 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1554 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1555 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1556 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1557 
1558 #define DO_MLS(N, M, A)  (A - N * M)
1559 
1560 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1561 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1562 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1563 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1564 
1565 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1566 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1567 
1568 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1569 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1570 
1571 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1572 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1573 
1574 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1575 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1576 
1577 #undef DO_MLA
1578 #undef DO_MLS
1579 #undef DO_ZZXW
1580 
1581 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1582 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1583 {                                                                         \
1584     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1585     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1586     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1587     for (i = 0; i < oprsz; i += 16) {                                     \
1588         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1589         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1590             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1591             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1592         }                                                                 \
1593     }                                                                     \
1594 }
1595 
1596 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1597 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1598 
1599 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1600 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1601 
1602 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1603 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1604 
1605 #undef DO_ZZX
1606 
1607 #define DO_BITPERM(NAME, TYPE, OP) \
1608 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1609 {                                                              \
1610     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1611     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1612         TYPE nn = *(TYPE *)(vn + i);                           \
1613         TYPE mm = *(TYPE *)(vm + i);                           \
1614         *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1615     }                                                          \
1616 }
1617 
1618 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1619 {
1620     uint64_t res = 0;
1621     int db, rb = 0;
1622 
1623     for (db = 0; db < n; ++db) {
1624         if ((mask >> db) & 1) {
1625             res |= ((data >> db) & 1) << rb;
1626             ++rb;
1627         }
1628     }
1629     return res;
1630 }
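/*
 * bitextract implements BEXT: the DATA bits selected by MASK are packed
 * into the low bits of the result.  Small 8-bit example:
 *   data = 0b10110010, mask = 0b01010101   (bits 0,2,4,6 selected)
 *   selected bits, low to high: 0, 0, 1, 0
 *   result = 0b00000100
 */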
1631 
1632 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1633 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1634 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1635 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1636 
1637 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1638 {
1639     uint64_t res = 0;
1640     int rb, db = 0;
1641 
1642     for (rb = 0; rb < n; ++rb) {
1643         if ((mask >> rb) & 1) {
1644             res |= ((data >> db) & 1) << rb;
1645             ++db;
1646         }
1647     }
1648     return res;
1649 }
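/*
 * bitdeposit implements BDEP, the inverse scatter: the low bits of DATA
 * are deposited into the bit positions selected by MASK.  With
 * data = 0b00000100 and mask = 0b01010101, the low data bits 0,0,1,0 go
 * to positions 0,2,4,6, giving 0b00010000.
 */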
1650 
1651 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1652 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1653 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1654 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1655 
1656 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1657 {
1658     uint64_t resm = 0, resu = 0;
1659     int db, rbm = 0, rbu = 0;
1660 
1661     for (db = 0; db < n; ++db) {
1662         uint64_t val = (data >> db) & 1;
1663         if ((mask >> db) & 1) {
1664             resm |= val << rbm++;
1665         } else {
1666             resu |= val << rbu++;
1667         }
1668     }
1669 
1670     return resm | (resu << rbm);
1671 }
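/*
 * bitgroup implements BGRP: the bits selected by MASK are gathered at the
 * bottom of the result and the remaining bits above them, each group
 * keeping its original order.  Reusing the example above:
 *   data = 0b10110010, mask = 0b01010101
 *   masked bits   (0,2,4,6): 0,0,1,0 -> 0b0100
 *   unmasked bits (1,3,5,7): 1,0,1,1 -> 0b1101
 *   result = 0b11010100
 */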
1672 
1673 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1674 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1675 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1676 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1677 
1678 #undef DO_BITPERM
1679 
1680 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1681 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1682 {                                                               \
1683     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1684     int sub_r = simd_data(desc);                                \
1685     if (sub_r) {                                                \
1686         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1687             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1688             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1689             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1690             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1691             acc_r = ADD_OP(acc_r, el2_i);                       \
1692             acc_i = SUB_OP(acc_i, el2_r);                       \
1693             *(TYPE *)(vd + H(i)) = acc_r;                       \
1694             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1695         }                                                       \
1696     } else {                                                    \
1697         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1698             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1699             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1700             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1701             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1702             acc_r = SUB_OP(acc_r, el2_i);                       \
1703             acc_i = ADD_OP(acc_i, el2_r);                       \
1704             *(TYPE *)(vd + H(i)) = acc_r;                       \
1705             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1706         }                                                       \
1707     }                                                           \
1708 }
1709 
1710 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1711 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1712 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1713 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1714 
1715 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1716 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1717 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1718 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1719 
1720 #undef DO_CADD
1721 
1722 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1723 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1724 {                                                              \
1725     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1726     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1727     int shift = simd_data(desc) >> 1;                          \
1728     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1729         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1730         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1731     }                                                          \
1732 }
1733 
1734 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1735 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1736 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1737 
1738 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1739 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1740 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1741 
1742 #undef DO_ZZI_SHLL
1743 
1744 /* Two-operand reduction expander, controlled by a predicate.
1745  * The difference between TYPERED and TYPERET has to do with
1746  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1747  * but TYPERET must be unsigned so that e.g. a 32-bit value
1748  * is not sign-extended to the ABI uint64_t return type.
1749  */
1750 /* ??? If we were to vectorize this by hand the reduction ordering
1751  * would change.  For integer operands, this is perfectly fine.
1752  */
1753 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1754 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1755 {                                                          \
1756     intptr_t i, opr_sz = simd_oprsz(desc);                 \
1757     TYPERED ret = INIT;                                    \
1758     for (i = 0; i < opr_sz; ) {                            \
1759         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1760         do {                                               \
1761             if (pg & 1) {                                  \
1762                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1763                 ret = OP(ret, nn);                         \
1764             }                                              \
1765             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1766         } while (i & 15);                                  \
1767     }                                                      \
1768     return (TYPERET)ret;                                   \
1769 }
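/*
 * A note on the predicate walk used here and in the expanders below: the
 * governing predicate holds one bit per byte of the vector, and an
 * element is active if the bit for its least significant byte is set.
 * The loop loads 16 predicate bits at a time, tests bit 0, and advances
 * with
 *     i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);
 * so that bit 0 of pg always corresponds to the element at byte offset i,
 * while "while (i & 15)" confines the inner loop to the 16-byte chunk
 * covered by that predicate load.
 */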
1770 
1771 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1772 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1773 {                                                          \
1774     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1775     TYPEE *n = vn;                                         \
1776     uint8_t *pg = vg;                                      \
1777     TYPER ret = INIT;                                      \
1778     for (i = 0; i < opr_sz; i += 1) {                      \
1779         if (pg[H1(i)] & 1) {                               \
1780             TYPEE nn = n[i];                               \
1781             ret = OP(ret, nn);                             \
1782         }                                                  \
1783     }                                                      \
1784     return ret;                                            \
1785 }
1786 
1787 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1788 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1789 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1790 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1791 
1792 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1793 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1794 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1795 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1796 
1797 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1798 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1799 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1800 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1801 
1802 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1803 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1804 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1805 
1806 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1807 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1808 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1809 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1810 
1811 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1812 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1813 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1814 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1815 
1816 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1817 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1818 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1819 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1820 
1821 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1822 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1823 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1824 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1825 
1826 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1827 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1828 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1829 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1830 
1831 #undef DO_VPZ
1832 #undef DO_VPZ_D
1833 
1834 /* Two vector operand, one scalar operand, unpredicated.  */
1835 #define DO_ZZI(NAME, TYPE, OP)                                       \
1836 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1837 {                                                                    \
1838     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1839     TYPE s = s64, *d = vd, *n = vn;                                  \
1840     for (i = 0; i < opr_sz; ++i) {                                   \
1841         d[i] = OP(n[i], s);                                          \
1842     }                                                                \
1843 }
1844 
1845 #define DO_SUBR(X, Y)   (Y - X)
1846 
1847 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1848 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1849 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1850 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1851 
1852 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1853 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1854 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1855 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1856 
1857 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1858 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1859 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1860 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1861 
1862 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1863 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1864 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1865 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1866 
1867 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1868 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1869 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1870 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1871 
1872 #undef DO_ZZI
1873 
1874 #undef DO_AND
1875 #undef DO_ORR
1876 #undef DO_EOR
1877 #undef DO_BIC
1878 #undef DO_ADD
1879 #undef DO_SUB
1880 #undef DO_MAX
1881 #undef DO_MIN
1882 #undef DO_ABD
1883 #undef DO_MUL
1884 #undef DO_DIV
1885 #undef DO_ASR
1886 #undef DO_LSR
1887 #undef DO_LSL
1888 #undef DO_SUBR
1889 
1890 /* Similar to the ARM LastActiveElement pseudocode function, except the
1891    result is multiplied by the element size.  This includes the not found
1892    indication; e.g. not found for esz=3 is -8.  */
1893 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1894 {
1895     uint64_t mask = pred_esz_masks[esz];
1896     intptr_t i = words;
1897 
1898     do {
1899         uint64_t this_g = g[--i] & mask;
1900         if (this_g) {
1901             return i * 64 + (63 - clz64(this_g));
1902         }
1903     } while (i > 0);
1904     return (intptr_t)-1 << esz;
1905 }
1906 
1907 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1908 {
1909     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1910     uint32_t flags = PREDTEST_INIT;
1911     uint64_t *d = vd, *g = vg;
1912     intptr_t i = 0;
1913 
1914     do {
1915         uint64_t this_d = d[i];
1916         uint64_t this_g = g[i];
1917 
1918         if (this_g) {
1919             if (!(flags & 4)) {
1920                 /* Set in D the first bit of G.  */
1921                 this_d |= this_g & -this_g;
1922                 d[i] = this_d;
1923             }
1924             flags = iter_predtest_fwd(this_d, this_g, flags);
1925         }
1926     } while (++i < words);
1927 
1928     return flags;
1929 }
1930 
1931 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1932 {
1933     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1934     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1935     uint32_t flags = PREDTEST_INIT;
1936     uint64_t *d = vd, *g = vg, esz_mask;
1937     intptr_t i, next;
1938 
1939     next = last_active_element(vd, words, esz) + (1 << esz);
1940     esz_mask = pred_esz_masks[esz];
1941 
1942     /* Similar to the pseudocode for pnext, but scaled by ESZ
1943        so that we find the correct bit.  */
1944     if (next < words * 64) {
1945         uint64_t mask = -1;
1946 
1947         if (next & 63) {
1948             mask = ~((1ull << (next & 63)) - 1);
1949             next &= -64;
1950         }
1951         do {
1952             uint64_t this_g = g[next / 64] & esz_mask & mask;
1953             if (this_g != 0) {
1954                 next = (next & -64) + ctz64(this_g);
1955                 break;
1956             }
1957             next += 64;
1958             mask = -1;
1959         } while (next < words * 64);
1960     }
1961 
1962     i = 0;
1963     do {
1964         uint64_t this_d = 0;
1965         if (i == next / 64) {
1966             this_d = 1ull << (next & 63);
1967         }
1968         d[i] = this_d;
1969         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1970     } while (++i < words);
1971 
1972     return flags;
1973 }
1974 
1975 /*
1976  * Copy Zn into Zd, and store zero into inactive elements.
1977  * If inv, store zeros into the active elements.
1978  */
1979 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1980 {
1981     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1982     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1983     uint64_t *d = vd, *n = vn;
1984     uint8_t *pg = vg;
1985 
1986     for (i = 0; i < opr_sz; i += 1) {
1987         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1988     }
1989 }
1990 
1991 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1992 {
1993     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1994     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1995     uint64_t *d = vd, *n = vn;
1996     uint8_t *pg = vg;
1997 
1998     for (i = 0; i < opr_sz; i += 1) {
1999         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
2000     }
2001 }
2002 
2003 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2004 {
2005     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2006     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2007     uint64_t *d = vd, *n = vn;
2008     uint8_t *pg = vg;
2009 
2010     for (i = 0; i < opr_sz; i += 1) {
2011         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2012     }
2013 }
2014 
2015 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2016 {
2017     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2018     uint64_t *d = vd, *n = vn;
2019     uint8_t *pg = vg;
2020     uint8_t inv = simd_data(desc);
2021 
2022     for (i = 0; i < opr_sz; i += 1) {
2023         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2024     }
2025 }
2026 
2027 /* Three-operand expander, immediate operand, controlled by a predicate.
2028  */
2029 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
2030 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2031 {                                                               \
2032     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2033     TYPE imm = simd_data(desc);                                 \
2034     for (i = 0; i < opr_sz; ) {                                 \
2035         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2036         do {                                                    \
2037             if (pg & 1) {                                       \
2038                 TYPE nn = *(TYPE *)(vn + H(i));                 \
2039                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2040             }                                                   \
2041             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2042         } while (i & 15);                                       \
2043     }                                                           \
2044 }
2045 
2046 /* Similarly, specialized for 64-bit operands.  */
2047 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
2048 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2049 {                                                               \
2050     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2051     TYPE *d = vd, *n = vn;                                      \
2052     TYPE imm = simd_data(desc);                                 \
2053     uint8_t *pg = vg;                                           \
2054     for (i = 0; i < opr_sz; i += 1) {                           \
2055         if (pg[H1(i)] & 1) {                                    \
2056             TYPE nn = n[i];                                     \
2057             d[i] = OP(nn, imm);                                 \
2058         }                                                       \
2059     }                                                           \
2060 }
2061 
2062 #define DO_SHR(N, M)  (N >> M)
2063 #define DO_SHL(N, M)  (N << M)
2064 
2065 /* Arithmetic shift right for division.  This rounds negative numbers
2066    toward zero as per signed division.  Therefore before shifting,
2067    when N is negative, add 2**M-1.  */
2068 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
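/*
 * Worked example of the DO_ASRD bias: for N = -7, M = 2 the bias is
 * (1 << 2) - 1 = 3, so (-7 + 3) >> 2 = -1, matching -7 / 4 truncated
 * toward zero, where a plain arithmetic shift would give -2.
 * Non-negative N is unchanged, e.g. 7 >> 2 = 1.
 */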
2069 
2070 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2071 {
2072     if (likely(sh < 64)) {
2073         return (x >> sh) + ((x >> (sh - 1)) & 1);
2074     } else if (sh == 64) {
2075         return x >> 63;
2076     } else {
2077         return 0;
2078     }
2079 }
2080 
2081 static inline int64_t do_srshr(int64_t x, unsigned sh)
2082 {
2083     if (likely(sh < 64)) {
2084         return (x >> sh) + ((x >> (sh - 1)) & 1);
2085     } else {
2086         /* Rounding the sign bit always produces 0. */
2087         return 0;
2088     }
2089 }
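/*
 * Both helpers add back the last bit shifted out, i.e. they round to
 * nearest with ties rounding up: do_urshr(11, 2) = (11 >> 2) +
 * ((11 >> 1) & 1) = 2 + 1 = 3, whereas a plain shift of 11/4 = 2.75
 * would give 2.
 */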
2090 
2091 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2092 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2093 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2094 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2095 
2096 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2097 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2098 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2099 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2100 
2101 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2102 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2103 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2104 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2105 
2106 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2107 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2108 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2109 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2110 
2111 /* SVE2 bitwise shift by immediate */
2112 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2113 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2114 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2115 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2116 
2117 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2118 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2119 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2120 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2121 
2122 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2123 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2124 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2125 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2126 
2127 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2128 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2129 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2130 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2131 
2132 #define do_suqrshl_b(n, m) \
2133    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2134 #define do_suqrshl_h(n, m) \
2135    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2136 #define do_suqrshl_s(n, m) \
2137    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2138 #define do_suqrshl_d(n, m) \
2139    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2140 
2141 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2142 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2143 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2144 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2145 
2146 #undef DO_ASRD
2147 #undef DO_ZPZI
2148 #undef DO_ZPZI_D
2149 
2150 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2151 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2152 {                                                            \
2153     intptr_t i, opr_sz = simd_oprsz(desc);                   \
2154     int shift = simd_data(desc);                             \
2155     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2156         TYPEW nn = *(TYPEW *)(vn + i);                       \
2157         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2158     }                                                        \
2159 }
2160 
2161 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2162 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2163 {                                                                 \
2164     intptr_t i, opr_sz = simd_oprsz(desc);                        \
2165     int shift = simd_data(desc);                                  \
2166     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2167         TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2168         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2169     }                                                             \
2170 }
2171 
2172 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2173 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2174 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2175 
2176 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2177 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2178 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2179 
2180 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2181 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2182 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2183 
2184 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2185 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2186 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2187 
2188 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2189 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2190 #define DO_SQSHRUN_D(x, sh) \
2191     do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2192 
2193 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2194 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2195 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2196 
2197 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2198 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2199 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2200 
2201 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2202 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2203 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2204 
2205 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2206 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2207 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2208 
2209 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2210 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2211 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2212 
2213 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2214 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2215 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2216 
2217 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2218 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2219 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2220 
2221 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2222 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2223 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2224 
2225 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2226 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2227 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2228 
2229 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2230 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2231 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2232 
2233 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2234 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2235 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2236 
2237 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2238 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2239 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2240 
2241 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2242 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2243 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2244 
2245 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2246 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2247 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2248 
2249 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2250 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2251 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2252 
2253 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2254 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2255 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2256 
2257 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2258 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2259 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2260 
2261 #undef DO_SHRNB
2262 #undef DO_SHRNT
2263 
2264 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2265 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2266 {                                                                           \
2267     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2268     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2269         TYPEW nn = *(TYPEW *)(vn + i);                                      \
2270         TYPEW mm = *(TYPEW *)(vm + i);                                      \
2271         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2272     }                                                                       \
2273 }
2274 
2275 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2276 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2277 {                                                                           \
2278     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2279     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2280         TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2281         TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2282         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2283     }                                                                       \
2284 }
2285 
2286 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2287 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2288 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2289 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
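/*
 * Worked example for the 16 -> 8 bit case (SH == 8):
 *   DO_ADDHN(0x12c0, 0, 8)  == 0x12c0 >> 8          == 0x12
 *   DO_RADDHN(0x12c0, 0, 8) == (0x12c0 + 0x80) >> 8 == 0x13
 * i.e. the R forms add half of the discarded low part before taking the
 * high half of the sum or difference.
 */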
2290 
2291 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2292 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2293 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2294 
2295 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2296 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2297 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2298 
2299 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2300 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2301 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2302 
2303 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2304 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2305 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2306 
2307 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2308 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2309 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2310 
2311 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2312 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2313 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2314 
2315 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2316 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2317 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2318 
2319 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2320 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2321 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2322 
2323 #undef DO_RSUBHN
2324 #undef DO_SUBHN
2325 #undef DO_RADDHN
2326 #undef DO_ADDHN
2327 
2328 #undef DO_BINOPNB
2329 
2330 /* Fully general four-operand expander, controlled by a predicate.
2331  */
2332 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2333 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2334                   void *vg, uint32_t desc)                    \
2335 {                                                             \
2336     intptr_t i, opr_sz = simd_oprsz(desc);                    \
2337     for (i = 0; i < opr_sz; ) {                               \
2338         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2339         do {                                                  \
2340             if (pg & 1) {                                     \
2341                 TYPE nn = *(TYPE *)(vn + H(i));               \
2342                 TYPE mm = *(TYPE *)(vm + H(i));               \
2343                 TYPE aa = *(TYPE *)(va + H(i));               \
2344                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2345             }                                                 \
2346             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2347         } while (i & 15);                                     \
2348     }                                                         \
2349 }
2350 
2351 /* Similarly, specialized for 64-bit operands.  */
2352 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2353 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2354                   void *vg, uint32_t desc)                    \
2355 {                                                             \
2356     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2357     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2358     uint8_t *pg = vg;                                         \
2359     for (i = 0; i < opr_sz; i += 1) {                         \
2360         if (pg[H1(i)] & 1) {                                  \
2361             TYPE aa = a[i], nn = n[i], mm = m[i];             \
2362             d[i] = OP(aa, nn, mm);                            \
2363         }                                                     \
2364     }                                                         \
2365 }
2366 
2367 #define DO_MLA(A, N, M)  (A + N * M)
2368 #define DO_MLS(A, N, M)  (A - N * M)
2369 
2370 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2371 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2372 
2373 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2374 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2375 
2376 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2377 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2378 
2379 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2380 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2381 
2382 #undef DO_MLA
2383 #undef DO_MLS
2384 #undef DO_ZPZZZ
2385 #undef DO_ZPZZZ_D
2386 
2387 void HELPER(sve_index_b)(void *vd, uint32_t start,
2388                          uint32_t incr, uint32_t desc)
2389 {
2390     intptr_t i, opr_sz = simd_oprsz(desc);
2391     uint8_t *d = vd;
2392     for (i = 0; i < opr_sz; i += 1) {
2393         d[H1(i)] = start + i * incr;
2394     }
2395 }
2396 
2397 void HELPER(sve_index_h)(void *vd, uint32_t start,
2398                          uint32_t incr, uint32_t desc)
2399 {
2400     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2401     uint16_t *d = vd;
2402     for (i = 0; i < opr_sz; i += 1) {
2403         d[H2(i)] = start + i * incr;
2404     }
2405 }
2406 
2407 void HELPER(sve_index_s)(void *vd, uint32_t start,
2408                          uint32_t incr, uint32_t desc)
2409 {
2410     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2411     uint32_t *d = vd;
2412     for (i = 0; i < opr_sz; i += 1) {
2413         d[H4(i)] = start + i * incr;
2414     }
2415 }
2416 
2417 void HELPER(sve_index_d)(void *vd, uint64_t start,
2418                          uint64_t incr, uint32_t desc)
2419 {
2420     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2421     uint64_t *d = vd;
2422     for (i = 0; i < opr_sz; i += 1) {
2423         d[i] = start + i * incr;
2424     }
2425 }
2426 
2427 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2428 {
2429     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2430     uint32_t sh = simd_data(desc);
2431     uint32_t *d = vd, *n = vn, *m = vm;
2432     for (i = 0; i < opr_sz; i += 1) {
2433         d[i] = n[i] + (m[i] << sh);
2434     }
2435 }
2436 
2437 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2438 {
2439     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2440     uint64_t sh = simd_data(desc);
2441     uint64_t *d = vd, *n = vn, *m = vm;
2442     for (i = 0; i < opr_sz; i += 1) {
2443         d[i] = n[i] + (m[i] << sh);
2444     }
2445 }
2446 
2447 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2448 {
2449     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2450     uint64_t sh = simd_data(desc);
2451     uint64_t *d = vd, *n = vn, *m = vm;
2452     for (i = 0; i < opr_sz; i += 1) {
2453         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2454     }
2455 }
2456 
2457 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2458 {
2459     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2460     uint64_t sh = simd_data(desc);
2461     uint64_t *d = vd, *n = vn, *m = vm;
2462     for (i = 0; i < opr_sz; i += 1) {
2463         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2464     }
2465 }
2466 
2467 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2468 {
2469     /* These constants are cut and pasted directly from the ARM pseudocode.  */
2470     static const uint16_t coeff[] = {
2471         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2472         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2473         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2474         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2475     };
2476     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2477     uint16_t *d = vd, *n = vn;
2478 
2479     for (i = 0; i < opr_sz; i++) {
2480         uint16_t nn = n[i];
2481         intptr_t idx = extract32(nn, 0, 5);
2482         uint16_t exp = extract32(nn, 5, 5);
2483         d[i] = coeff[idx] | (exp << 10);
2484     }
2485 }
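/*
 * Informally, and deferring to the pseudocode as the authority: each
 * coeff[] entry is the fraction field of 2^(idx/32) in the target format,
 * and the remaining input bits are placed directly in the exponent field,
 * so the result is 2^(idx/32) scaled by the power of two those bits
 * select.  E.g. 2^(1/32) ~= 1.0219 and (1.0219 - 1) * 1024 ~= 22 = 0x016,
 * the second half-precision entry above.
 */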
2486 
2487 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2488 {
2489     /* These constants are cut and pasted directly from the ARM pseudocode.  */
2490     static const uint32_t coeff[] = {
2491         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2492         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2493         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2494         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2495         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2496         0x1ef532, 0x20b051, 0x227043, 0x243516,
2497         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2498         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2499         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2500         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2501         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2502         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2503         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2504         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2505         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2506         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2507     };
2508     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2509     uint32_t *d = vd, *n = vn;
2510 
2511     for (i = 0; i < opr_sz; i++) {
2512         uint32_t nn = n[i];
2513         intptr_t idx = extract32(nn, 0, 6);
2514         uint32_t exp = extract32(nn, 6, 8);
2515         d[i] = coeff[idx] | (exp << 23);
2516     }
2517 }
2518 
2519 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2520 {
2521     /* These constants are cut and pasted directly from the ARM pseudocode.  */
2522     static const uint64_t coeff[] = {
2523         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2524         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2525         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2526         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2527         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2528         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2529         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2530         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2531         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2532         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2533         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2534         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2535         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2536         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2537         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2538         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2539         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2540         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2541         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2542         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2543         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2544         0xFA7C1819E90D8ull,
2545     };
2546     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2547     uint64_t *d = vd, *n = vn;
2548 
2549     for (i = 0; i < opr_sz; i++) {
2550         uint64_t nn = n[i];
2551         intptr_t idx = extract32(nn, 0, 6);
2552         uint64_t exp = extract32(nn, 6, 11);
2553         d[i] = coeff[idx] | (exp << 52);
2554     }
2555 }
2556 
2557 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2558 {
2559     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2560     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2561     uint16_t *d = vd, *n = vn, *m = vm;
2562     for (i = 0; i < opr_sz; i += 1) {
2563         uint16_t nn = n[i];
2564         uint16_t mm = m[i];
2565         if (mm & 1) {
2566             nn = float16_one;
2567         }
2568         if (mm & 2) {
2569             nn = float16_maybe_ah_chs(nn, fpcr_ah);
2570         }
2571         d[i] = nn;
2572     }
2573 }
2574 
2575 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2576 {
2577     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2578     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2579     uint32_t *d = vd, *n = vn, *m = vm;
2580     for (i = 0; i < opr_sz; i += 1) {
2581         uint32_t nn = n[i];
2582         uint32_t mm = m[i];
2583         if (mm & 1) {
2584             nn = float32_one;
2585         }
2586         if (mm & 2) {
2587             nn = float32_maybe_ah_chs(nn, fpcr_ah);
2588         }
2589         d[i] = nn;
2590     }
2591 }
2592 
2593 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2594 {
2595     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2596     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2597     uint64_t *d = vd, *n = vn, *m = vm;
2598     for (i = 0; i < opr_sz; i += 1) {
2599         uint64_t nn = n[i];
2600         uint64_t mm = m[i];
2601         if (mm & 1) {
2602             nn = float64_one;
2603         }
2604         if (mm & 2) {
2605             nn = float64_maybe_ah_chs(nn, fpcr_ah);
2606         }
2607         d[i] = nn;
2608     }
2609 }
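
/* In the FTSSEL helpers above, bit 0 of each element of VM substitutes
 * 1.0 for the VN element and bit 1 negates the result, with
 * float*_maybe_ah_chs() supplying the FPCR.AH-dependent rules for the
 * sign change.
 */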
2610 
2611 /*
2612  * Signed saturating addition with scalar operand.
2613  */
2614 
2615 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2616 {
2617     intptr_t i, oprsz = simd_oprsz(desc);
2618 
2619     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2620         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2621     }
2622 }
2623 
2624 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2625 {
2626     intptr_t i, oprsz = simd_oprsz(desc);
2627 
2628     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2629         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2630     }
2631 }
2632 
2633 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2634 {
2635     intptr_t i, oprsz = simd_oprsz(desc);
2636 
2637     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2638         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2639     }
2640 }
2641 
2642 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2643 {
2644     intptr_t i, oprsz = simd_oprsz(desc);
2645 
2646     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2647         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2648     }
2649 }
2650 
2651 /*
2652  * Unsigned saturating addition with scalar operand.
2653  */
2654 
2655 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2656 {
2657     intptr_t i, oprsz = simd_oprsz(desc);
2658 
2659     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2660         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2661     }
2662 }
2663 
2664 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2665 {
2666     intptr_t i, oprsz = simd_oprsz(desc);
2667 
2668     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2669         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2670     }
2671 }
2672 
2673 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2674 {
2675     intptr_t i, oprsz = simd_oprsz(desc);
2676 
2677     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2678         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2679     }
2680 }
2681 
2682 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2683 {
2684     intptr_t i, oprsz = simd_oprsz(desc);
2685 
2686     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2687         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2688     }
2689 }
2690 
2691 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2692 {
2693     intptr_t i, oprsz = simd_oprsz(desc);
2694 
2695     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2696         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2697     }
2698 }
2699 
2700 /* Two operand predicated copy immediate with merge.  All valid immediates
2701  * can fit within 17 signed bits in the simd_data field.
2702  */
2703 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2704                          uint64_t mm, uint32_t desc)
2705 {
2706     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2707     uint64_t *d = vd, *n = vn;
2708     uint8_t *pg = vg;
2709 
2710     mm = dup_const(MO_8, mm);
2711     for (i = 0; i < opr_sz; i += 1) {
2712         uint64_t nn = n[i];
2713         uint64_t pp = expand_pred_b(pg[H1(i)]);
2714         d[i] = (mm & pp) | (nn & ~pp);
2715     }
2716 }
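
/* A note on the merge above: expand_pred_b() widens each predicate bit
 * to a full byte of mask, e.g. pg = 0x05 expands to 0x0000000000ff00ff,
 * so active bytes take the immediate and inactive bytes keep their old
 * value without a per-lane branch.  The _h and _s variants below do the
 * same with 16-bit and 32-bit groups.
 */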
2717 
2718 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2719                          uint64_t mm, uint32_t desc)
2720 {
2721     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2722     uint64_t *d = vd, *n = vn;
2723     uint8_t *pg = vg;
2724 
2725     mm = dup_const(MO_16, mm);
2726     for (i = 0; i < opr_sz; i += 1) {
2727         uint64_t nn = n[i];
2728         uint64_t pp = expand_pred_h(pg[H1(i)]);
2729         d[i] = (mm & pp) | (nn & ~pp);
2730     }
2731 }
2732 
2733 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2734                          uint64_t mm, uint32_t desc)
2735 {
2736     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2737     uint64_t *d = vd, *n = vn;
2738     uint8_t *pg = vg;
2739 
2740     mm = dup_const(MO_32, mm);
2741     for (i = 0; i < opr_sz; i += 1) {
2742         uint64_t nn = n[i];
2743         uint64_t pp = expand_pred_s(pg[H1(i)]);
2744         d[i] = (mm & pp) | (nn & ~pp);
2745     }
2746 }
2747 
2748 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2749                          uint64_t mm, uint32_t desc)
2750 {
2751     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2752     uint64_t *d = vd, *n = vn;
2753     uint8_t *pg = vg;
2754 
2755     for (i = 0; i < opr_sz; i += 1) {
2756         uint64_t nn = n[i];
2757         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2758     }
2759 }
2760 
2761 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2762 {
2763     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2764     uint64_t *d = vd;
2765     uint8_t *pg = vg;
2766 
2767     val = dup_const(MO_8, val);
2768     for (i = 0; i < opr_sz; i += 1) {
2769         d[i] = val & expand_pred_b(pg[H1(i)]);
2770     }
2771 }
2772 
2773 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2774 {
2775     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2776     uint64_t *d = vd;
2777     uint8_t *pg = vg;
2778 
2779     val = dup_const(MO_16, val);
2780     for (i = 0; i < opr_sz; i += 1) {
2781         d[i] = val & expand_pred_h(pg[H1(i)]);
2782     }
2783 }
2784 
2785 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2786 {
2787     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2788     uint64_t *d = vd;
2789     uint8_t *pg = vg;
2790 
2791     val = dup_const(MO_32, val);
2792     for (i = 0; i < opr_sz; i += 1) {
2793         d[i] = val & expand_pred_s(pg[H1(i)]);
2794     }
2795 }
2796 
2797 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2798 {
2799     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2800     uint64_t *d = vd;
2801     uint8_t *pg = vg;
2802 
2803     for (i = 0; i < opr_sz; i += 1) {
2804         d[i] = (pg[H1(i)] & 1 ? val : 0);
2805     }
2806 }
2807 
2808 /* Big-endian hosts need to frob the byte indices.  If the copy
2809  * happens to be 8-byte aligned, then no frobbing is necessary.
2810  */
2811 static void swap_memmove(void *vd, void *vs, size_t n)
2812 {
2813     uintptr_t d = (uintptr_t)vd;
2814     uintptr_t s = (uintptr_t)vs;
2815     uintptr_t o = (d | s | n) & 7;
2816     size_t i;
2817 
2818 #if !HOST_BIG_ENDIAN
2819     o = 0;
2820 #endif
2821     switch (o) {
2822     case 0:
2823         memmove(vd, vs, n);
2824         break;
2825 
2826     case 4:
2827         if (d < s || d >= s + n) {
2828             for (i = 0; i < n; i += 4) {
2829                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2830             }
2831         } else {
2832             for (i = n; i > 0; ) {
2833                 i -= 4;
2834                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2835             }
2836         }
2837         break;
2838 
2839     case 2:
2840     case 6:
2841         if (d < s || d >= s + n) {
2842             for (i = 0; i < n; i += 2) {
2843                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2844             }
2845         } else {
2846             for (i = n; i > 0; ) {
2847                 i -= 2;
2848                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2849             }
2850         }
2851         break;
2852 
2853     default:
2854         if (d < s || d >= s + n) {
2855             for (i = 0; i < n; i++) {
2856                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2857             }
2858         } else {
2859             for (i = n; i > 0; ) {
2860                 i -= 1;
2861                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2862             }
2863         }
2864         break;
2865     }
2866 }
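
/* Why the frobbing works: on big-endian hosts the vector bytes are kept
 * swizzled within each aligned 8-byte unit (the H1/H1_2/H1_4 macros XOR
 * the low address bits), so a plain memmove is only correct when source,
 * destination and length are all multiples of 8.  Otherwise the copy is
 * done in the largest unit whose swizzle is self-consistent, hence the
 * 4-, 2- and 1-byte loops above.
 */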
2867 
2868 /* Similarly for memset of 0.  */
2869 static void swap_memzero(void *vd, size_t n)
2870 {
2871     uintptr_t d = (uintptr_t)vd;
2872     uintptr_t o = (d | n) & 7;
2873     size_t i;
2874 
2875     /* Usually, the first bit of a predicate is set, so N is 0.  */
2876     if (likely(n == 0)) {
2877         return;
2878     }
2879 
2880 #if !HOST_BIG_ENDIAN
2881     o = 0;
2882 #endif
2883     switch (o) {
2884     case 0:
2885         memset(vd, 0, n);
2886         break;
2887 
2888     case 4:
2889         for (i = 0; i < n; i += 4) {
2890             *(uint32_t *)H1_4(d + i) = 0;
2891         }
2892         break;
2893 
2894     case 2:
2895     case 6:
2896         for (i = 0; i < n; i += 2) {
2897             *(uint16_t *)H1_2(d + i) = 0;
2898         }
2899         break;
2900 
2901     default:
2902         for (i = 0; i < n; i++) {
2903             *(uint8_t *)H1(d + i) = 0;
2904         }
2905         break;
2906     }
2907 }
2908 
2909 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2910 {
2911     intptr_t opr_sz = simd_oprsz(desc);
2912     size_t n_ofs = simd_data(desc);
2913     size_t n_siz = opr_sz - n_ofs;
2914 
2915     if (vd != vm) {
2916         swap_memmove(vd, vn + n_ofs, n_siz);
2917         swap_memmove(vd + n_siz, vm, n_ofs);
2918     } else if (vd != vn) {
2919         swap_memmove(vd + n_siz, vd, n_ofs);
2920         swap_memmove(vd, vn + n_ofs, n_siz);
2921     } else {
2922         /* vd == vn == vm.  Need temp space.  */
2923         ARMVectorReg tmp;
2924         swap_memmove(&tmp, vm, n_ofs);
2925         swap_memmove(vd, vd + n_ofs, n_siz);
2926         memcpy(vd + n_siz, &tmp, n_ofs);
2927     }
2928 }
2929 
2930 #define DO_INSR(NAME, TYPE, H) \
2931 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2932 {                                                                  \
2933     intptr_t opr_sz = simd_oprsz(desc);                            \
2934     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2935     *(TYPE *)(vd + H(0)) = val;                                    \
2936 }
2937 
2938 DO_INSR(sve_insr_b, uint8_t, H1)
2939 DO_INSR(sve_insr_h, uint16_t, H1_2)
2940 DO_INSR(sve_insr_s, uint32_t, H1_4)
2941 DO_INSR(sve_insr_d, uint64_t, H1_8)
2942 
2943 #undef DO_INSR
2944 
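/* REV (vector): reversing all of the elements is implemented as a swap
 * of the 64-bit words end for end, combined with a reverse of the
 * elements within each word: bswap64 for bytes, hswap64 for halfwords,
 * a 32-bit rotate for words, and nothing extra for doublewords.
 */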
2945 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2946 {
2947     intptr_t i, j, opr_sz = simd_oprsz(desc);
2948     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2949         uint64_t f = *(uint64_t *)(vn + i);
2950         uint64_t b = *(uint64_t *)(vn + j);
2951         *(uint64_t *)(vd + i) = bswap64(b);
2952         *(uint64_t *)(vd + j) = bswap64(f);
2953     }
2954 }
2955 
2956 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2957 {
2958     intptr_t i, j, opr_sz = simd_oprsz(desc);
2959     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2960         uint64_t f = *(uint64_t *)(vn + i);
2961         uint64_t b = *(uint64_t *)(vn + j);
2962         *(uint64_t *)(vd + i) = hswap64(b);
2963         *(uint64_t *)(vd + j) = hswap64(f);
2964     }
2965 }
2966 
2967 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2968 {
2969     intptr_t i, j, opr_sz = simd_oprsz(desc);
2970     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2971         uint64_t f = *(uint64_t *)(vn + i);
2972         uint64_t b = *(uint64_t *)(vn + j);
2973         *(uint64_t *)(vd + i) = rol64(b, 32);
2974         *(uint64_t *)(vd + j) = rol64(f, 32);
2975     }
2976 }
2977 
2978 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2979 {
2980     intptr_t i, j, opr_sz = simd_oprsz(desc);
2981     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2982         uint64_t f = *(uint64_t *)(vn + i);
2983         uint64_t b = *(uint64_t *)(vn + j);
2984         *(uint64_t *)(vd + i) = b;
2985         *(uint64_t *)(vd + j) = f;
2986     }
2987 }
2988 
2989 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2990 
2991 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2992                            bool is_tbx, tb_impl_fn *fn)
2993 {
2994     ARMVectorReg scratch;
2995     uintptr_t oprsz = simd_oprsz(desc);
2996 
2997     if (unlikely(vd == vn)) {
2998         vn = memcpy(&scratch, vn, oprsz);
2999     }
3000 
3001     fn(vd, vn, NULL, vm, oprsz, is_tbx);
3002 }
3003 
3004 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3005                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3006 {
3007     ARMVectorReg scratch;
3008     uintptr_t oprsz = simd_oprsz(desc);
3009 
3010     if (unlikely(vd == vn0)) {
3011         vn0 = memcpy(&scratch, vn0, oprsz);
3012         if (vd == vn1) {
3013             vn1 = vn0;
3014         }
3015     } else if (unlikely(vd == vn1)) {
3016         vn1 = memcpy(&scratch, vn1, oprsz);
3017     }
3018 
3019     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3020 }
3021 
3022 #define DO_TB(SUFF, TYPE, H)                                            \
3023 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
3024                                 void *vm, uintptr_t oprsz, bool is_tbx) \
3025 {                                                                       \
3026     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
3027     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
3028     for (i = 0; i < nelem; ++i) {                                       \
3029         TYPE index = indexes[H1(i)], val = 0;                           \
3030         if (index < nelem) {                                            \
3031             val = tbl0[H(index)];                                       \
3032         } else {                                                        \
3033             index -= nelem;                                             \
3034             if (tbl1 && index < nelem) {                                \
3035                 val = tbl1[H(index)];                                   \
3036             } else if (is_tbx) {                                        \
3037                 continue;                                               \
3038             }                                                           \
3039         }                                                               \
3040         d[H(i)] = val;                                                  \
3041     }                                                                   \
3042 }                                                                       \
3043 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3044 {                                                                       \
3045     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3046 }                                                                       \
3047 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3048                              void *vm, uint32_t desc)                   \
3049 {                                                                       \
3050     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3051 }                                                                       \
3052 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3053 {                                                                       \
3054     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3055 }
3056 
3057 DO_TB(b, uint8_t, H1)
3058 DO_TB(h, uint16_t, H2)
3059 DO_TB(s, uint32_t, H4)
3060 DO_TB(d, uint64_t, H8)
3061 
3062 #undef DO_TB
3063 
3064 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3065 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3066 {                                                              \
3067     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3068     TYPED *d = vd;                                             \
3069     TYPES *n = vn;                                             \
3070     ARMVectorReg tmp;                                          \
3071     if (unlikely(vn - vd < opr_sz)) {                          \
3072         n = memcpy(&tmp, n, opr_sz / 2);                       \
3073     }                                                          \
3074     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3075         d[HD(i)] = n[HS(i)];                                   \
3076     }                                                          \
3077 }
3078 
3079 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3080 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3081 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3082 
3083 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3084 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3085 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3086 
3087 #undef DO_UNPK
3088 
3089 /* Mask of bits included in the even numbered predicates of width esz.
3090  * We also use this for expand_bits/compress_bits, and so extend the
3091  * same pattern out to 16-bit units.
3092  */
3093 static const uint64_t even_bit_esz_masks[5] = {
3094     0x5555555555555555ull,
3095     0x3333333333333333ull,
3096     0x0f0f0f0f0f0f0f0full,
3097     0x00ff00ff00ff00ffull,
3098     0x0000ffff0000ffffull,
3099 };
3100 
3101 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3102  * For N==0, this corresponds to the operation that in qemu/bitops.h
3103  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3104  * section 7-2 Shuffling Bits.
3105  */
3106 static uint64_t expand_bits(uint64_t x, int n)
3107 {
3108     int i;
3109 
3110     x &= 0xffffffffu;
3111     for (i = 4; i >= n; i--) {
3112         int sh = 1 << i;
3113         x = ((x << sh) | x) & even_bit_esz_masks[i];
3114     }
3115     return x;
3116 }
3117 
3118 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3119  * For N==0, this corresponds to the operation that in qemu/bitops.h
3120  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3121  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3122  */
3123 static uint64_t compress_bits(uint64_t x, int n)
3124 {
3125     int i;
3126 
3127     for (i = n; i <= 4; i++) {
3128         int sh = 1 << i;
3129         x &= even_bit_esz_masks[i];
3130         x = (x >> sh) | x;
3131     }
3132     return x & 0xffffffffu;
3133 }
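
/* This undoes expand_bits for the same N, e.g.
 * compress_bits(0b01000101, 0) == 0b1011, keeping every second bit.
 */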
3134 
3135 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3136 {
3137     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3138     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3139     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3140     int esize = 1 << esz;
3141     uint64_t *d = vd;
3142     intptr_t i;
3143 
3144     if (oprsz <= 8) {
3145         uint64_t nn = *(uint64_t *)vn;
3146         uint64_t mm = *(uint64_t *)vm;
3147         int half = 4 * oprsz;  /* predicate bits in each half */
3148 
3149         nn = extract64(nn, high * half, half);
3150         mm = extract64(mm, high * half, half);
3151         nn = expand_bits(nn, esz);
3152         mm = expand_bits(mm, esz);
3153         d[0] = nn | (mm << esize);
3154     } else {
3155         ARMPredicateReg tmp;
3156 
3157         /* We produce output faster than we consume input.
3158            Therefore we must be mindful of possible overlap.  */
3159         if (vd == vn) {
3160             vn = memcpy(&tmp, vn, oprsz);
3161             if (vd == vm) {
3162                 vm = vn;
3163             }
3164         } else if (vd == vm) {
3165             vm = memcpy(&tmp, vm, oprsz);
3166         }
3167         if (high) {
3168             high = oprsz >> 1;
3169         }
3170 
3171         if ((oprsz & 7) == 0) {
3172             uint32_t *n = vn, *m = vm;
3173             high >>= 2;
3174 
3175             for (i = 0; i < oprsz / 8; i++) {
3176                 uint64_t nn = n[H4(high + i)];
3177                 uint64_t mm = m[H4(high + i)];
3178 
3179                 nn = expand_bits(nn, esz);
3180                 mm = expand_bits(mm, esz);
3181                 d[i] = nn | (mm << esize);
3182             }
3183         } else {
3184             uint8_t *n = vn, *m = vm;
3185             uint16_t *d16 = vd;
3186 
3187             for (i = 0; i < oprsz / 2; i++) {
3188                 uint16_t nn = n[H1(high + i)];
3189                 uint16_t mm = m[H1(high + i)];
3190 
3191                 nn = expand_bits(nn, esz);
3192                 mm = expand_bits(mm, esz);
3193                 d16[H2(i)] = nn | (mm << esize);
3194             }
3195         }
3196     }
3197 }
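
/* As an example of the oprsz <= 8 case with esz == 0: nn = 0b0011 and
 * mm = 0b0101 produce d[0] = 0b00100111, i.e. the interleaving
 * n0 m0 n1 m1 n2 m2 n3 m3 reading from bit 0 upward.
 */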
3198 
3199 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3200 {
3201     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3202     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3203     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3204     uint64_t *d = vd, *n = vn, *m = vm;
3205     uint64_t l, h;
3206     intptr_t i;
3207 
3208     if (oprsz <= 8) {
3209         l = compress_bits(n[0] >> odd, esz);
3210         h = compress_bits(m[0] >> odd, esz);
3211         d[0] = l | (h << (4 * oprsz));
3212     } else {
3213         ARMPredicateReg tmp_m;
3214         intptr_t oprsz_16 = oprsz / 16;
3215 
3216         if ((vm - vd) < (uintptr_t)oprsz) {
3217             m = memcpy(&tmp_m, vm, oprsz);
3218         }
3219 
3220         for (i = 0; i < oprsz_16; i++) {
3221             l = n[2 * i + 0];
3222             h = n[2 * i + 1];
3223             l = compress_bits(l >> odd, esz);
3224             h = compress_bits(h >> odd, esz);
3225             d[i] = l | (h << 32);
3226         }
3227 
3228         /*
3229          * When OPRSZ is not a multiple of 16 bytes, the results from M do
3230          * not align nicely with the uint64_t for D.  Put the aligned results
3231          * from M into TMP_M and then copy them into place afterward.
3232          */
3233         if (oprsz & 15) {
3234             int final_shift = (oprsz & 15) * 2;
3235 
3236             l = n[2 * i + 0];
3237             h = n[2 * i + 1];
3238             l = compress_bits(l >> odd, esz);
3239             h = compress_bits(h >> odd, esz);
3240             d[i] = l | (h << final_shift);
3241 
3242             for (i = 0; i < oprsz_16; i++) {
3243                 l = m[2 * i + 0];
3244                 h = m[2 * i + 1];
3245                 l = compress_bits(l >> odd, esz);
3246                 h = compress_bits(h >> odd, esz);
3247                 tmp_m.p[i] = l | (h << 32);
3248             }
3249             l = m[2 * i + 0];
3250             h = m[2 * i + 1];
3251             l = compress_bits(l >> odd, esz);
3252             h = compress_bits(h >> odd, esz);
3253             tmp_m.p[i] = l | (h << final_shift);
3254 
3255             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3256         } else {
3257             for (i = 0; i < oprsz_16; i++) {
3258                 l = m[2 * i + 0];
3259                 h = m[2 * i + 1];
3260                 l = compress_bits(l >> odd, esz);
3261                 h = compress_bits(h >> odd, esz);
3262                 d[oprsz_16 + i] = l | (h << 32);
3263             }
3264         }
3265     }
3266 }
3267 
3268 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3269 {
3270     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3271     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3272     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3273     uint64_t *d = vd, *n = vn, *m = vm;
3274     uint64_t mask;
3275     int shr, shl;
3276     intptr_t i;
3277 
3278     shl = 1 << esz;
3279     shr = 0;
3280     mask = even_bit_esz_masks[esz];
3281     if (odd) {
3282         mask <<= shl;
3283         shr = shl;
3284         shl = 0;
3285     }
3286 
3287     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3288         uint64_t nn = (n[i] & mask) >> shr;
3289         uint64_t mm = (m[i] & mask) << shl;
3290         d[i] = nn + mm;
3291     }
3292 }
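
/* That is: for odd == 0 (TRN1) result element 2K is N element 2K and
 * result element 2K+1 is M element 2K; for odd == 1 (TRN2) both come
 * from the inputs' elements 2K+1.  The mask and shifts apply this to
 * the whole predicate word at once.
 */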
3293 
3294 /* Reverse units of 2**N bits.  */
3295 static uint64_t reverse_bits_64(uint64_t x, int n)
3296 {
3297     int i, sh;
3298 
3299     x = bswap64(x);
3300     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3301         uint64_t mask = even_bit_esz_masks[i];
3302         x = ((x & mask) << sh) | ((x >> sh) & mask);
3303     }
3304     return x;
3305 }
3306 
3307 static uint8_t reverse_bits_8(uint8_t x, int n)
3308 {
3309     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3310     int i, sh;
3311 
3312     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3313         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3314     }
3315     return x;
3316 }
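
/* E.g. reverse_bits_8(0b00000110, 0) == 0b01100000; with N == 0 this
 * is a full bit reversal of the byte.
 */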
3317 
3318 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3319 {
3320     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3321     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3322     intptr_t i, oprsz_2 = oprsz / 2;
3323 
3324     if (oprsz <= 8) {
3325         uint64_t l = *(uint64_t *)vn;
3326         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3327         *(uint64_t *)vd = l;
3328     } else if ((oprsz & 15) == 0) {
3329         for (i = 0; i < oprsz_2; i += 8) {
3330             intptr_t ih = oprsz - 8 - i;
3331             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3332             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3333             *(uint64_t *)(vd + i) = h;
3334             *(uint64_t *)(vd + ih) = l;
3335         }
3336     } else {
3337         for (i = 0; i < oprsz_2; i += 1) {
3338             intptr_t il = H1(i);
3339             intptr_t ih = H1(oprsz - 1 - i);
3340             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3341             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3342             *(uint8_t *)(vd + il) = h;
3343             *(uint8_t *)(vd + ih) = l;
3344         }
3345     }
3346 }
3347 
3348 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3349 {
3350     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3351     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3352     uint64_t *d = vd;
3353     intptr_t i;
3354 
3355     if (oprsz <= 8) {
3356         uint64_t nn = *(uint64_t *)vn;
3357         int half = 4 * oprsz;  /* predicate bits in each half */
3358 
3359         nn = extract64(nn, high * half, half);
3360         nn = expand_bits(nn, 0);
3361         d[0] = nn;
3362     } else {
3363         ARMPredicateReg tmp_n;
3364 
3365         /* We produce output faster than we consume input.
3366            Therefore we must be mindful of possible overlap.  */
3367         if ((vn - vd) < (uintptr_t)oprsz) {
3368             vn = memcpy(&tmp_n, vn, oprsz);
3369         }
3370         if (high) {
3371             high = oprsz >> 1;
3372         }
3373 
3374         if ((oprsz & 7) == 0) {
3375             uint32_t *n = vn;
3376             high >>= 2;
3377 
3378             for (i = 0; i < oprsz / 8; i++) {
3379                 uint64_t nn = n[H4(high + i)];
3380                 d[i] = expand_bits(nn, 0);
3381             }
3382         } else {
3383             uint16_t *d16 = vd;
3384             uint8_t *n = vn;
3385 
3386             for (i = 0; i < oprsz / 2; i++) {
3387                 uint16_t nn = n[H1(high + i)];
3388                 d16[H2(i)] = expand_bits(nn, 0);
3389             }
3390         }
3391     }
3392 }
3393 
3394 #define DO_ZIP(NAME, TYPE, H) \
3395 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3396 {                                                                    \
3397     intptr_t oprsz = simd_oprsz(desc);                               \
3398     intptr_t odd_ofs = simd_data(desc);                              \
3399     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3400     ARMVectorReg tmp_n, tmp_m;                                       \
3401     /* We produce output faster than we consume input.               \
3402        Therefore we must be mindful of possible overlap.  */         \
3403     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3404         vn = memcpy(&tmp_n, vn, oprsz);                              \
3405     }                                                                \
3406     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3407         vm = memcpy(&tmp_m, vm, oprsz);                              \
3408     }                                                                \
3409     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3410         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3411         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3412             *(TYPE *)(vm + odd_ofs + H(i));                          \
3413     }                                                                \
3414     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3415         memset(vd + oprsz - 16, 0, 16);                              \
3416     }                                                                \
3417 }
3418 
3419 DO_ZIP(sve_zip_b, uint8_t, H1)
3420 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3421 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3422 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3423 DO_ZIP(sve2_zip_q, Int128, )
3424 
3425 #define DO_UZP(NAME, TYPE, H) \
3426 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3427 {                                                                      \
3428     intptr_t oprsz = simd_oprsz(desc);                                 \
3429     intptr_t odd_ofs = simd_data(desc);                                \
3430     intptr_t i, p;                                                     \
3431     ARMVectorReg tmp_m;                                                \
3432     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3433         vm = memcpy(&tmp_m, vm, oprsz);                                \
3434     }                                                                  \
3435     i = 0, p = odd_ofs;                                                \
3436     do {                                                               \
3437         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3438         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3439     } while (p < oprsz);                                               \
3440     p -= oprsz;                                                        \
3441     do {                                                               \
3442         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3443         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3444     } while (p < oprsz);                                               \
3445     tcg_debug_assert(i == oprsz);                                      \
3446 }
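
/* Note that P carries over from the first loop to the second, so the
 * concatenation of VN and VM is treated as one double-length vector
 * from which every second element is taken; for the quadword variant
 * with an odd number of elements per vector the selection therefore
 * continues correctly into VM.
 */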
3447 
3448 DO_UZP(sve_uzp_b, uint8_t, H1)
3449 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3450 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3451 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3452 DO_UZP(sve2_uzp_q, Int128, )
3453 
3454 #define DO_TRN(NAME, TYPE, H) \
3455 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3456 {                                                                      \
3457     intptr_t oprsz = simd_oprsz(desc);                                 \
3458     intptr_t odd_ofs = simd_data(desc);                                \
3459     intptr_t i;                                                        \
3460     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3461         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3462         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3463         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3464         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3465     }                                                                  \
3466     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3467         memset(vd + oprsz - 16, 0, 16);                                \
3468     }                                                                  \
3469 }
3470 
3471 DO_TRN(sve_trn_b, uint8_t, H1)
3472 DO_TRN(sve_trn_h, uint16_t, H1_2)
3473 DO_TRN(sve_trn_s, uint32_t, H1_4)
3474 DO_TRN(sve_trn_d, uint64_t, H1_8)
3475 DO_TRN(sve2_trn_q, Int128, )
3476 
3477 #undef DO_ZIP
3478 #undef DO_UZP
3479 #undef DO_TRN
3480 
3481 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3482 {
3483     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3484     uint32_t *d = vd, *n = vn;
3485     uint8_t *pg = vg;
3486 
3487     for (i = j = 0; i < opr_sz; i++) {
3488         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3489             d[H4(j)] = n[H4(i)];
3490             j++;
3491         }
3492     }
3493     for (; j < opr_sz; j++) {
3494         d[H4(j)] = 0;
3495     }
3496 }
3497 
3498 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3499 {
3500     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3501     uint64_t *d = vd, *n = vn;
3502     uint8_t *pg = vg;
3503 
3504     for (i = j = 0; i < opr_sz; i++) {
3505         if (pg[H1(i)] & 1) {
3506             d[j] = n[i];
3507             j++;
3508         }
3509     }
3510     for (; j < opr_sz; j++) {
3511         d[j] = 0;
3512     }
3513 }
3514 
3515 /* Similar to the ARM LastActiveElement pseudocode function, except the
3516  * result is multiplied by the element size.  This includes the not found
3517  * indication; e.g. not found for esz=3 is -8.
3518  */
3519 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3520 {
3521     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3522     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3523 
3524     return last_active_element(vg, words, esz);
3525 }
3526 
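/* SPLICE: copy the contiguous span of N running from the first active
 * element of G through the last active element (inclusive) to the low
 * end of D, then fill the remainder of D with elements taken in order
 * from the start of M.
 */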
3527 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3528 {
3529     intptr_t opr_sz = simd_oprsz(desc) / 8;
3530     int esz = simd_data(desc);
3531     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3532     intptr_t i, first_i, last_i;
3533     ARMVectorReg tmp;
3534 
3535     first_i = last_i = 0;
3536     first_g = last_g = 0;
3537 
3538     /* Find the extent of the active elements within VG.  */
3539     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3540         pg = *(uint64_t *)(vg + i) & mask;
3541         if (pg) {
3542             if (last_g == 0) {
3543                 last_g = pg;
3544                 last_i = i;
3545             }
3546             first_g = pg;
3547             first_i = i;
3548         }
3549     }
3550 
3551     len = 0;
3552     if (first_g != 0) {
3553         first_i = first_i * 8 + ctz64(first_g);
3554         last_i = last_i * 8 + 63 - clz64(last_g);
3555         len = last_i - first_i + (1 << esz);
3556         if (vd == vm) {
3557             vm = memcpy(&tmp, vm, opr_sz * 8);
3558         }
3559         swap_memmove(vd, vn + first_i, len);
3560     }
3561     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3562 }
3563 
3564 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3565                             void *vg, uint32_t desc)
3566 {
3567     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3568     uint64_t *d = vd, *n = vn, *m = vm;
3569     uint8_t *pg = vg;
3570 
3571     for (i = 0; i < opr_sz; i += 1) {
3572         uint64_t nn = n[i], mm = m[i];
3573         uint64_t pp = expand_pred_b(pg[H1(i)]);
3574         d[i] = (nn & pp) | (mm & ~pp);
3575     }
3576 }
3577 
3578 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3579                             void *vg, uint32_t desc)
3580 {
3581     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3582     uint64_t *d = vd, *n = vn, *m = vm;
3583     uint8_t *pg = vg;
3584 
3585     for (i = 0; i < opr_sz; i += 1) {
3586         uint64_t nn = n[i], mm = m[i];
3587         uint64_t pp = expand_pred_h(pg[H1(i)]);
3588         d[i] = (nn & pp) | (mm & ~pp);
3589     }
3590 }
3591 
3592 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3593                             void *vg, uint32_t desc)
3594 {
3595     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3596     uint64_t *d = vd, *n = vn, *m = vm;
3597     uint8_t *pg = vg;
3598 
3599     for (i = 0; i < opr_sz; i += 1) {
3600         uint64_t nn = n[i], mm = m[i];
3601         uint64_t pp = expand_pred_s(pg[H1(i)]);
3602         d[i] = (nn & pp) | (mm & ~pp);
3603     }
3604 }
3605 
3606 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3607                             void *vg, uint32_t desc)
3608 {
3609     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3610     uint64_t *d = vd, *n = vn, *m = vm;
3611     uint8_t *pg = vg;
3612 
3613     for (i = 0; i < opr_sz; i += 1) {
3614         uint64_t nn = n[i], mm = m[i];
3615         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3616     }
3617 }
3618 
3619 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3620                             void *vg, uint32_t desc)
3621 {
3622     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3623     Int128 *d = vd, *n = vn, *m = vm;
3624     uint16_t *pg = vg;
3625 
3626     for (i = 0; i < opr_sz; i += 1) {
3627         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3628     }
3629 }
3630 
3631 /* Two operand comparison controlled by a predicate.
3632  * ??? It is very tempting to want to be able to expand this inline
3633  * ??? It is very tempting to expand this inline
3634  *
3635  *    vcmpeqw    zm, zn, %ymm0
3636  *    vpmovmskb  %ymm0, %eax
3637  *    and        $0x5555, %eax
3638  *    and        pg, %eax
3639  *
3640  * or even aarch64, e.g.
3641  *
3642  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3643  *    cmeq       v0.8h, zn, zm
3644  *    and        v0.8h, v0.8h, mask
3645  *    addv       h0, v0.8h
3646  *    and        v0.8b, pg
3647  *
3648  * However, coming up with an abstraction that allows vector inputs and
3649  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3650  * scalar outputs, is tricky.
3651  */
3652 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3653 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3654 {                                                                            \
3655     intptr_t opr_sz = simd_oprsz(desc);                                      \
3656     uint32_t flags = PREDTEST_INIT;                                          \
3657     intptr_t i = opr_sz;                                                     \
3658     do {                                                                     \
3659         uint64_t out = 0, pg;                                                \
3660         do {                                                                 \
3661             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3662             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3663             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3664             out |= nn OP mm;                                                 \
3665         } while (i & 63);                                                    \
3666         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3667         out &= pg;                                                           \
3668         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3669         flags = iter_predtest_bwd(out, pg, flags);                           \
3670     } while (i > 0);                                                         \
3671     return flags;                                                            \
3672 }
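
/* A note on the packing above: the inner loop covers 64 bytes of vector
 * per iteration, shifting OUT left by sizeof(TYPE) per element so that
 * each result lands on its element's canonical predicate bit (one
 * predicate bit per vector byte).  MASK restricts the guard to those
 * canonical bits, so OUT &= PG clears both inactive elements and the
 * non-canonical bit positions.
 */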
3673 
3674 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3675     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3676 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3677     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3678 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3679     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3680 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3681     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3682 
3683 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3684 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3685 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3686 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3687 
3688 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3689 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3690 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3691 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3692 
3693 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3694 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3695 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3696 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3697 
3698 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3699 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3700 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3701 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3702 
3703 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3704 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3705 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3706 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3707 
3708 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3709 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3710 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3711 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3712 
3713 #undef DO_CMP_PPZZ_B
3714 #undef DO_CMP_PPZZ_H
3715 #undef DO_CMP_PPZZ_S
3716 #undef DO_CMP_PPZZ_D
3717 #undef DO_CMP_PPZZ
3718 
3719 /* Similar, but the second source is "wide".  */
3720 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3721 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3722 {                                                                            \
3723     intptr_t opr_sz = simd_oprsz(desc);                                      \
3724     uint32_t flags = PREDTEST_INIT;                                          \
3725     intptr_t i = opr_sz;                                                     \
3726     do {                                                                     \
3727         uint64_t out = 0, pg;                                                \
3728         do {                                                                 \
3729             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3730             do {                                                             \
3731                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3732                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3733                 out |= nn OP mm;                                             \
3734             } while (i & 7);                                                 \
3735         } while (i & 63);                                                    \
3736         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3737         out &= pg;                                                           \
3738         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3739         flags = iter_predtest_bwd(out, pg, flags);                           \
3740     } while (i > 0);                                                         \
3741     return flags;                                                            \
3742 }
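
/* Here each 64-bit element of VM is loaded once and compared against
 * every narrow VN element sharing the same 8 bytes, which is what the
 * extra inner loop over (i & 7) does.
 */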
3743 
3744 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3745     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3746 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3747     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3748 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3749     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3750 
3751 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3752 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3753 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3754 
3755 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3756 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3757 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3758 
3759 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3760 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3761 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3762 
3763 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3764 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3765 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3766 
3767 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3768 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3769 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3770 
3771 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3772 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3773 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3774 
3775 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3776 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3777 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3778 
3779 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3780 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3781 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3782 
3783 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3784 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3785 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3786 
3787 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3788 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3789 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3790 
3791 #undef DO_CMP_PPZW_B
3792 #undef DO_CMP_PPZW_H
3793 #undef DO_CMP_PPZW_S
3794 #undef DO_CMP_PPZW
3795 
3796 /* Similar, but the second source is immediate.  */
3797 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3798 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3799 {                                                                    \
3800     intptr_t opr_sz = simd_oprsz(desc);                              \
3801     uint32_t flags = PREDTEST_INIT;                                  \
3802     TYPE mm = simd_data(desc);                                       \
3803     intptr_t i = opr_sz;                                             \
3804     do {                                                             \
3805         uint64_t out = 0, pg;                                        \
3806         do {                                                         \
3807             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3808             TYPE nn = *(TYPE *)(vn + H(i));                          \
3809             out |= nn OP mm;                                         \
3810         } while (i & 63);                                            \
3811         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3812         out &= pg;                                                   \
3813         *(uint64_t *)(vd + (i >> 3)) = out;                          \
3814         flags = iter_predtest_bwd(out, pg, flags);                   \
3815     } while (i > 0);                                                 \
3816     return flags;                                                    \
3817 }
3818 
3819 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3820     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3821 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3822     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3823 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3824     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3825 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3826     DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3827 
3828 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3829 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3830 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3831 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3832 
3833 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3834 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3835 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3836 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3837 
3838 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3839 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3840 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3841 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3842 
3843 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3844 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3845 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3846 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3847 
3848 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3849 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3850 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3851 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3852 
3853 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3854 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3855 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3856 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3857 
3858 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3859 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3860 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3861 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3862 
3863 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
3864 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3865 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3866 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3867 
3868 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
3869 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3870 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3871 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3872 
3873 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
3874 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3875 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3876 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3877 
3878 #undef DO_CMP_PPZI_B
3879 #undef DO_CMP_PPZI_H
3880 #undef DO_CMP_PPZI_S
3881 #undef DO_CMP_PPZI_D
3882 #undef DO_CMP_PPZI
3883 
3884 /* Similar to the ARM LastActive pseudocode function.  */
3885 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3886 {
3887     intptr_t i;
3888 
3889     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3890         uint64_t pg = *(uint64_t *)(vg + i);
3891         if (pg) {
3892             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3893         }
3894     }
3895     return 0;
3896 }
3897 
3898 /* Compute a mask into RETB that is true for all G, up to and including
3899  * (if after) or excluding (if !after) the first G & N.
3900  * Return true if BRK found.
3901  */
3902 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3903                         bool brk, bool after)
3904 {
3905     uint64_t b;
3906 
3907     if (brk) {
3908         b = 0;
3909     } else if ((g & n) == 0) {
3910         /* For all G, no N are set; break not found.  */
3911         b = g;
3912     } else {
3913         /* Break somewhere in N.  Locate it.  */
3914         b = g & n;            /* guard true, pred true */
3915         b = b & -b;           /* first such */
3916         if (after) {
3917             b = b | (b - 1);  /* break after same */
3918         } else {
3919             b = b - 1;        /* break before same */
3920         }
3921         brk = true;
3922     }
3923 
3924     *retb = b;
3925     return brk;
3926 }
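
/* Example: with g = 0xff, n = 0x10 and brk = false, the result is
 * b = 0x1f when AFTER is set (the break element itself stays active)
 * and b = 0x0f when it is not.
 */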
3927 
3928 /* Compute a zeroing BRK.  */
3929 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3930                           intptr_t oprsz, bool after)
3931 {
3932     bool brk = false;
3933     intptr_t i;
3934 
3935     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3936         uint64_t this_b, this_g = g[i];
3937 
3938         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3939         d[i] = this_b & this_g;
3940     }
3941 }
3942 
3943 /* Likewise, but also compute flags.  */
3944 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3945                                intptr_t oprsz, bool after)
3946 {
3947     uint32_t flags = PREDTEST_INIT;
3948     bool brk = false;
3949     intptr_t i;
3950 
3951     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3952         uint64_t this_b, this_d, this_g = g[i];
3953 
3954         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3955         d[i] = this_d = this_b & this_g;
3956         flags = iter_predtest_fwd(this_d, this_g, flags);
3957     }
3958     return flags;
3959 }
3960 
3961 /* Compute a merging BRK.  */
3962 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3963                           intptr_t oprsz, bool after)
3964 {
3965     bool brk = false;
3966     intptr_t i;
3967 
3968     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3969         uint64_t this_b, this_g = g[i];
3970 
3971         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3972         d[i] = (this_b & this_g) | (d[i] & ~this_g);
3973     }
3974 }
3975 
3976 /* Likewise, but also compute flags.  */
3977 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3978                                intptr_t oprsz, bool after)
3979 {
3980     uint32_t flags = PREDTEST_INIT;
3981     bool brk = false;
3982     intptr_t i;
3983 
3984     for (i = 0; i < oprsz / 8; ++i) {
3985         uint64_t this_b, this_d = d[i], this_g = g[i];
3986 
3987         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3988         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3989         flags = iter_predtest_fwd(this_d, this_g, flags);
3990     }
3991     return flags;
3992 }
3993 
3994 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3995 {
3996     /* It is quicker to zero the whole predicate than loop on OPRSZ.
3997      * The compiler should turn this into 4 64-bit integer stores.
3998      */
3999     memset(d, 0, sizeof(ARMPredicateReg));
4000     return PREDTEST_INIT;
4001 }
4002 
4003 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4004                        uint32_t pred_desc)
4005 {
4006     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4007     if (last_active_pred(vn, vg, oprsz)) {
4008         compute_brk_z(vd, vm, vg, oprsz, true);
4009     } else {
4010         do_zero(vd, oprsz);
4011     }
4012 }
4013 
4014 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4015                             uint32_t pred_desc)
4016 {
4017     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4018     if (last_active_pred(vn, vg, oprsz)) {
4019         return compute_brks_z(vd, vm, vg, oprsz, true);
4020     } else {
4021         return do_zero(vd, oprsz);
4022     }
4023 }
4024 
4025 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4026                        uint32_t pred_desc)
4027 {
4028     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4029     if (last_active_pred(vn, vg, oprsz)) {
4030         compute_brk_z(vd, vm, vg, oprsz, false);
4031     } else {
4032         do_zero(vd, oprsz);
4033     }
4034 }
4035 
4036 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4037                             uint32_t pred_desc)
4038 {
4039     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4040     if (last_active_pred(vn, vg, oprsz)) {
4041         return compute_brks_z(vd, vm, vg, oprsz, false);
4042     } else {
4043         return do_zero(vd, oprsz);
4044     }
4045 }
4046 
4047 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4048 {
4049     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4050     compute_brk_z(vd, vn, vg, oprsz, true);
4051 }
4052 
4053 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4054 {
4055     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4056     return compute_brks_z(vd, vn, vg, oprsz, true);
4057 }
4058 
4059 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4060 {
4061     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4062     compute_brk_z(vd, vn, vg, oprsz, false);
4063 }
4064 
4065 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4066 {
4067     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4068     return compute_brks_z(vd, vn, vg, oprsz, false);
4069 }
4070 
4071 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4072 {
4073     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4074     compute_brk_m(vd, vn, vg, oprsz, true);
4075 }
4076 
4077 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4078 {
4079     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4080     return compute_brks_m(vd, vn, vg, oprsz, true);
4081 }
4082 
4083 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4084 {
4085     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4086     compute_brk_m(vd, vn, vg, oprsz, false);
4087 }
4088 
4089 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4090 {
4091     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4092     return compute_brks_m(vd, vn, vg, oprsz, false);
4093 }
4094 
4095 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4096 {
4097     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4098     if (!last_active_pred(vn, vg, oprsz)) {
4099         do_zero(vd, oprsz);
4100     }
4101 }
4102 
4103 /* As if PredTest(Ones(PL), D, esz).  */
4104 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4105                               uint64_t esz_mask)
4106 {
4107     uint32_t flags = PREDTEST_INIT;
4108     intptr_t i;
4109 
4110     for (i = 0; i < oprsz / 8; i++) {
4111         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4112     }
4113     if (oprsz & 7) {
4114         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4115         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4116     }
4117     return flags;
4118 }
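
/* When the predicate is not a whole number of 64-bit words, only the low
 * 8 * (oprsz % 8) bits of the final word participate.  E.g. for
 * oprsz == 6 (a 384-bit vector) the tail mask above is ~(-1ULL << 48),
 * i.e. the low 48 predicate bits.
 */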
4119 
4120 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4121 {
4122     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4123     if (last_active_pred(vn, vg, oprsz)) {
4124         return predtest_ones(vd, oprsz, -1);
4125     } else {
4126         return do_zero(vd, oprsz);
4127     }
4128 }
4129 
4130 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4131 {
4132     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4133     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4134     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4135     intptr_t i;
4136 
4137     for (i = 0; i < words; ++i) {
4138         uint64_t t = n[i] & g[i] & mask;
4139         sum += ctpop64(t);
4140     }
4141     return sum;
4142 }
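
/* CNTP simply popcounts the active, element-aligned predicate bits.
 * pred_esz_masks[esz] keeps one bit per element (e.g. every other bit
 * for 16-bit elements, every fourth bit for 32-bit elements), so each
 * active element contributes exactly one to the sum.
 */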
4143 
4144 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4145 {
4146     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4147     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4148     uint64_t esz_mask = pred_esz_masks[esz];
4149     ARMPredicateReg *d = vd;
4150     uint32_t flags;
4151     intptr_t i;
4152 
4153     /* Begin with a zero predicate register.  */
4154     flags = do_zero(d, oprsz);
4155     if (count == 0) {
4156         return flags;
4157     }
4158 
4159     /* Set all of the requested bits.  */
4160     for (i = 0; i < count / 64; ++i) {
4161         d->p[i] = esz_mask;
4162     }
4163     if (count & 63) {
4164         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4165     }
4166 
4167     return predtest_ones(d, oprsz, esz_mask);
4168 }
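
/* Example: for esz == 2 (32-bit elements, esz_mask 0x1111...1111) and
 * count == 20 predicate bits (5 elements), the loop above leaves
 * d->p[0] == MAKE_64BIT_MASK(0, 20) & esz_mask == 0x11111, i.e. the
 * first five elements active and everything above them clear.
 */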
4169 
4170 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4171 {
4172     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4173     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4174     uint64_t esz_mask = pred_esz_masks[esz];
4175     ARMPredicateReg *d = vd;
4176     intptr_t i, invcount, oprbits;
4177     uint64_t bits;
4178 
4179     if (count == 0) {
4180         return do_zero(d, oprsz);
4181     }
4182 
4183     oprbits = oprsz * 8;
4184     tcg_debug_assert(count <= oprbits);
4185 
4186     bits = esz_mask;
4187     if (oprbits & 63) {
4188         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4189     }
4190 
4191     invcount = oprbits - count;
4192     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4193         d->p[i] = bits;
4194         bits = esz_mask;
4195     }
4196 
4197     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4198 
4199     while (--i >= 0) {
4200         d->p[i] = 0;
4201     }
4202 
4203     return predtest_ones(d, oprsz, esz_mask);
4204 }
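
/* The "greater than" variant fills the predicate from the top down: the
 * last COUNT predicate bits (masked by esz) are set and everything below
 * them is cleared.  E.g. for oprsz == 8 (64 predicate bits), esz == 2
 * and count == 12, invcount == 52 and d->p[0] == 0x1110000000000000,
 * i.e. the top three 32-bit elements active.
 */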
4205 
4206 /* Recursive reduction of a vector by a binary operation;
4207  * cf. the ARM ARM function ReducePredicated.
4208  *
4209  * While it would be possible to write this without the DATA temporary,
4210  * it is much simpler to process the predicate register this way.
4211  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4212  * little to gain from a more complex non-recursive form.
4213  */
4214 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
4215 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4216 {                                                                     \
4217     if (n == 1) {                                                     \
4218         return *data;                                                 \
4219     } else {                                                          \
4220         uintptr_t half = n / 2;                                       \
4221         TYPE lo = NAME##_reduce(data, status, half);                  \
4222         TYPE hi = NAME##_reduce(data + half, status, half);           \
4223         return FUNC(lo, hi, status);                                  \
4224     }                                                                 \
4225 }                                                                     \
4226 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
4227 {                                                                     \
4228     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4229     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4230     for (i = 0; i < oprsz; ) {                                        \
4231         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4232         do {                                                          \
4233             TYPE nn = *(TYPE *)(vn + H(i));                           \
4234             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
4235             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4236         } while (i & 15);                                             \
4237     }                                                                 \
4238     for (; i < maxsz; i += sizeof(TYPE)) {                            \
4239         *(TYPE *)((void *)data + i) = IDENT;                          \
4240     }                                                                 \
4241     return NAME##_reduce(data, s, maxsz / sizeof(TYPE));              \
4242 }
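
/* For example, with eight fp16 elements d0..d7 (inactive ones already
 * replaced by IDENT) the recursion above evaluates
 *
 *   FUNC(FUNC(FUNC(d0,d1), FUNC(d2,d3)), FUNC(FUNC(d4,d5), FUNC(d6,d7)))
 *
 * i.e. a balanced pairwise tree rather than a left-to-right fold,
 * matching the pairwise reduction of ReducePredicated.
 */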
4243 
4244 DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero)
4245 DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero)
4246 DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero)
4247 
4248 /* Identity is floatN_default_nan, without the function call.  */
4249 DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00)
4250 DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000)
4251 DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
4252 
4253 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00)
4254 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000)
4255 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
4256 
4257 DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity)
4258 DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity)
4259 DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity)
4260 
4261 DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity))
4262 DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity))
4263 DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity))
4264 
4265 DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
4266 DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
4267 DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
4268 
4269 DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh,
4270           float16_chs(float16_infinity))
4271 DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs,
4272           float32_chs(float32_infinity))
4273 DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd,
4274           float64_chs(float64_infinity))
4275 
4276 #undef DO_REDUCE
4277 
4278 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4279                              float_status *status, uint32_t desc)
4280 {
4281     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4282     float16 result = nn;
4283 
4284     do {
4285         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4286         do {
4287             if (pg & 1) {
4288                 float16 mm = *(float16 *)(vm + H1_2(i));
4289                 result = float16_add(result, mm, status);
4290             }
4291             i += sizeof(float16), pg >>= sizeof(float16);
4292         } while (i & 15);
4293     } while (i < opr_sz);
4294 
4295     return result;
4296 }
4297 
4298 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4299                              float_status *status, uint32_t desc)
4300 {
4301     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4302     float32 result = nn;
4303 
4304     do {
4305         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4306         do {
4307             if (pg & 1) {
4308                 float32 mm = *(float32 *)(vm + H1_4(i));
4309                 result = float32_add(result, mm, status);
4310             }
4311             i += sizeof(float32), pg >>= sizeof(float32);
4312         } while (i & 15);
4313     } while (i < opr_sz);
4314 
4315     return result;
4316 }
4317 
4318 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4319                              float_status *status, uint32_t desc)
4320 {
4321     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4322     uint64_t *m = vm;
4323     uint8_t *pg = vg;
4324 
4325     for (i = 0; i < opr_sz; i++) {
4326         if (pg[H1(i)] & 1) {
4327             nn = float64_add(nn, m[i], status);
4328         }
4329     }
4330 
4331     return nn;
4332 }
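
/* Unlike the FADDV tree reduction above, FADDA is a strictly ordered
 * accumulation: the helpers walk the vector from element 0 upwards, so
 * the result is (((nn + m0) + m1) + ...) with inactive elements simply
 * skipped, preserving the architectural left-to-right rounding order.
 */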
4333 
4334 /* Fully general three-operand expander, controlled by a predicate,
4335  * with the extra float_status parameter.
4336  */
4337 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4338 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4339                   float_status *status, uint32_t desc)          \
4340 {                                                               \
4341     intptr_t i = simd_oprsz(desc);                              \
4342     uint64_t *g = vg;                                           \
4343     do {                                                        \
4344         uint64_t pg = g[(i - 1) >> 6];                          \
4345         do {                                                    \
4346             i -= sizeof(TYPE);                                  \
4347             if (likely((pg >> (i & 63)) & 1)) {                 \
4348                 TYPE nn = *(TYPE *)(vn + H(i));                 \
4349                 TYPE mm = *(TYPE *)(vm + H(i));                 \
4350                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4351             }                                                   \
4352         } while (i & 63);                                       \
4353     } while (i != 0);                                           \
4354 }
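
/* The expander above walks the vector from the top down, 64 bytes at a
 * time.  The governing predicate has one bit per byte, so the element at
 * byte offset I is controlled by predicate bit I: g[(i - 1) >> 6]
 * selects the 64-bit predicate word and (pg >> (i & 63)) & 1 tests the
 * bit for the element's least significant byte.  The ZPZS and ZPZ
 * expanders below use the same walk.
 */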
4355 
4356 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4357 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4358 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4359 
4360 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4361 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4362 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4363 
4364 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4365 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4366 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4367 
4368 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4369 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4370 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4371 
4372 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4373 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4374 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4375 
4376 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4377 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4378 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4379 
4380 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh)
4381 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins)
4382 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind)
4383 
4384 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh)
4385 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs)
4386 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd)
4387 
4388 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4389 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4390 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4391 
4392 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4393 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4394 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4395 
4396 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4397 {
4398     return float16_abs(float16_sub(a, b, s));
4399 }
4400 
4401 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4402 {
4403     return float32_abs(float32_sub(a, b, s));
4404 }
4405 
4406 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4407 {
4408     return float64_abs(float64_sub(a, b, s));
4409 }
4410 
4411 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
4412 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat)
4413 {
4414     float16 r = float16_sub(op1, op2, stat);
4415     return float16_is_any_nan(r) ? r : float16_abs(r);
4416 }
4417 
4418 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat)
4419 {
4420     float32 r = float32_sub(op1, op2, stat);
4421     return float32_is_any_nan(r) ? r : float32_abs(r);
4422 }
4423 
4424 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat)
4425 {
4426     float64 r = float64_sub(op1, op2, stat);
4427     return float64_is_any_nan(r) ? r : float64_abs(r);
4428 }
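
/* Example of the FPCR.AH difference: if the subtraction propagates a
 * negative quiet NaN (say op1 == float16 0xfe00), the plain abd_*
 * helpers clear its sign bit via the abs, while the ah_abd_* helpers
 * return the NaN unchanged, as required when FPCR.AH == 1.
 */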
4429 
4430 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4431 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4432 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4433 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h)
4434 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s)
4435 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d)
4436 
4437 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4438 {
4439     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4440     return float64_scalbn(a, b_int, s);
4441 }
4442 
4443 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4444 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4445 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4446 
4447 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4448 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4449 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4450 
4451 #undef DO_ZPZZ_FP
4452 
4453 /* Three-operand expander, with one scalar operand, controlled by
4454  * a predicate, with the extra float_status parameter.
4455  */
4456 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4457 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4458                   float_status *status, uint32_t desc)            \
4459 {                                                                 \
4460     intptr_t i = simd_oprsz(desc);                                \
4461     uint64_t *g = vg;                                             \
4462     TYPE mm = scalar;                                             \
4463     do {                                                          \
4464         uint64_t pg = g[(i - 1) >> 6];                            \
4465         do {                                                      \
4466             i -= sizeof(TYPE);                                    \
4467             if (likely((pg >> (i & 63)) & 1)) {                   \
4468                 TYPE nn = *(TYPE *)(vn + H(i));                   \
4469                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4470             }                                                     \
4471         } while (i & 63);                                         \
4472     } while (i != 0);                                             \
4473 }
4474 
4475 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4476 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4477 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4478 
4479 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4480 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4481 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4482 
4483 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4484 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4485 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4486 
4487 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4488 {
4489     return float16_sub(b, a, s);
4490 }
4491 
4492 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4493 {
4494     return float32_sub(b, a, s);
4495 }
4496 
4497 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4498 {
4499     return float64_sub(b, a, s);
4500 }
4501 
4502 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4503 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4504 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4505 
4506 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4507 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4508 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4509 
4510 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4511 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4512 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4513 
4514 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4515 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4516 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4517 
4518 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4519 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4520 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4521 
4522 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh)
4523 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs)
4524 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd)
4525 
4526 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh)
4527 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins)
4528 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind)
4529 
4530 /* Fully general two-operand expander, controlled by a predicate,
4531  * with the extra float_status parameter.
4532  */
4533 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4534 void HELPER(NAME)(void *vd, void *vn, void *vg,                       \
4535                   float_status *status, uint32_t desc)                \
4536 {                                                                     \
4537     intptr_t i = simd_oprsz(desc);                                    \
4538     uint64_t *g = vg;                                                 \
4539     do {                                                              \
4540         uint64_t pg = g[(i - 1) >> 6];                                \
4541         do {                                                          \
4542             i -= sizeof(TYPE);                                        \
4543             if (likely((pg >> (i & 63)) & 1)) {                       \
4544                 TYPE nn = *(TYPE *)(vn + H(i));                       \
4545                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4546             }                                                         \
4547         } while (i & 63);                                             \
4548     } while (i != 0);                                                 \
4549 }
4550 
4551 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4552  * FZ16.  When converting from fp16, this affects flushing input denormals;
4553  * when converting to fp16, this affects flushing output denormals.
4554  */
4555 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4556 {
4557     bool save = get_flush_inputs_to_zero(fpst);
4558     float32 ret;
4559 
4560     set_flush_inputs_to_zero(false, fpst);
4561     ret = float16_to_float32(f, true, fpst);
4562     set_flush_inputs_to_zero(save, fpst);
4563     return ret;
4564 }
4565 
4566 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4567 {
4568     bool save = get_flush_inputs_to_zero(fpst);
4569     float64 ret;
4570 
4571     set_flush_inputs_to_zero(false, fpst);
4572     ret = float16_to_float64(f, true, fpst);
4573     set_flush_inputs_to_zero(save, fpst);
4574     return ret;
4575 }
4576 
4577 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4578 {
4579     bool save = get_flush_to_zero(fpst);
4580     float16 ret;
4581 
4582     set_flush_to_zero(false, fpst);
4583     ret = float32_to_float16(f, true, fpst);
4584     set_flush_to_zero(save, fpst);
4585     return ret;
4586 }
4587 
4588 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4589 {
4590     bool save = get_flush_to_zero(fpst);
4591     float16 ret;
4592 
4593     set_flush_to_zero(false, fpst);
4594     ret = float64_to_float16(f, true, fpst);
4595     set_flush_to_zero(save, fpst);
4596     return ret;
4597 }
4598 
4599 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4600 {
4601     if (float16_is_any_nan(f)) {
4602         float_raise(float_flag_invalid, s);
4603         return 0;
4604     }
4605     return float16_to_int16_round_to_zero(f, s);
4606 }
4607 
4608 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4609 {
4610     if (float16_is_any_nan(f)) {
4611         float_raise(float_flag_invalid, s);
4612         return 0;
4613     }
4614     return float16_to_int64_round_to_zero(f, s);
4615 }
4616 
4617 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4618 {
4619     if (float32_is_any_nan(f)) {
4620         float_raise(float_flag_invalid, s);
4621         return 0;
4622     }
4623     return float32_to_int64_round_to_zero(f, s);
4624 }
4625 
4626 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4627 {
4628     if (float64_is_any_nan(f)) {
4629         float_raise(float_flag_invalid, s);
4630         return 0;
4631     }
4632     return float64_to_int64_round_to_zero(f, s);
4633 }
4634 
4635 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4636 {
4637     if (float16_is_any_nan(f)) {
4638         float_raise(float_flag_invalid, s);
4639         return 0;
4640     }
4641     return float16_to_uint16_round_to_zero(f, s);
4642 }
4643 
4644 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4645 {
4646     if (float16_is_any_nan(f)) {
4647         float_raise(float_flag_invalid, s);
4648         return 0;
4649     }
4650     return float16_to_uint64_round_to_zero(f, s);
4651 }
4652 
4653 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4654 {
4655     if (float32_is_any_nan(f)) {
4656         float_raise(float_flag_invalid, s);
4657         return 0;
4658     }
4659     return float32_to_uint64_round_to_zero(f, s);
4660 }
4661 
4662 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4663 {
4664     if (float64_is_any_nan(f)) {
4665         float_raise(float_flag_invalid, s);
4666         return 0;
4667     }
4668     return float64_to_uint64_round_to_zero(f, s);
4669 }
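
/* These wrappers exist because the Arm float-to-integer conversions
 * return 0 for a NaN input (while still raising Invalid Operation),
 * whereas the underlying softfloat round-to-zero conversions would
 * otherwise produce a saturated result.
 */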
4670 
4671 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4672 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4673 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4674 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4675 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4676 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4677 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4678 
4679 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4680 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4681 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4682 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4683 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4684 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4685 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4686 
4687 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4688 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4689 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4690 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4691 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4692 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4693 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4694 
4695 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4696 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4697 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4698 
4699 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4700 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4701 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4702 
4703 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4704 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4705 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4706 
4707 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4708 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4709 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4710 
4711 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4712 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4713 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4714 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4715 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4716 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4717 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4718 
4719 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4720 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4721 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4722 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4723 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4724 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4725 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4726 
4727 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4728 {
4729     /* Extract frac to the top of the uint32_t. */
4730     uint32_t frac = (uint32_t)a << (16 + 6);
4731     int16_t exp = extract32(a, 10, 5);
4732 
4733     if (unlikely(exp == 0)) {
4734         if (frac != 0) {
4735             if (!get_flush_inputs_to_zero(s)) {
4736                 /* denormal: bias - fractional_zeros */
4737                 return -15 - clz32(frac);
4738             }
4739             /* flush to zero */
4740             float_raise(float_flag_input_denormal_flushed, s);
4741         }
4742     } else if (unlikely(exp == 0x1f)) {
4743         if (frac == 0) {
4744             return INT16_MAX; /* infinity */
4745         }
4746     } else {
4747         /* normal: exp - bias */
4748         return exp - 15;
4749     }
4750     /* nan or zero */
4751     float_raise(float_flag_invalid, s);
4752     return INT16_MIN;
4753 }
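
/* A couple of worked cases for the fp16 version above:
 *   a == 0x4800 (8.0):  exp == 18, so the result is 18 - 15 == 3.
 *   a == 0x0001 (minimum denormal):  frac << 22 has 9 leading zeros,
 *       so the result is -15 - 9 == -24, i.e. floor(log2(2^-24)).
 */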
4754 
4755 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4756 {
4757     /* Extract frac to the top of the uint32_t. */
4758     uint32_t frac = a << 9;
4759     int32_t exp = extract32(a, 23, 8);
4760 
4761     if (unlikely(exp == 0)) {
4762         if (frac != 0) {
4763             if (!get_flush_inputs_to_zero(s)) {
4764                 /* denormal: bias - fractional_zeros */
4765                 return -127 - clz32(frac);
4766             }
4767             /* flush to zero */
4768             float_raise(float_flag_input_denormal_flushed, s);
4769         }
4770     } else if (unlikely(exp == 0xff)) {
4771         if (frac == 0) {
4772             return INT32_MAX; /* infinity */
4773         }
4774     } else {
4775         /* normal: exp - bias */
4776         return exp - 127;
4777     }
4778     /* nan or zero */
4779     float_raise(float_flag_invalid, s);
4780     return INT32_MIN;
4781 }
4782 
4783 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4784 {
4785     /* Extract frac to the top of the uint64_t. */
4786     uint64_t frac = a << 12;
4787     int64_t exp = extract64(a, 52, 11);
4788 
4789     if (unlikely(exp == 0)) {
4790         if (frac != 0) {
4791             if (!get_flush_inputs_to_zero(s)) {
4792                 /* denormal: bias - fractional_zeros */
4793                 return -1023 - clz64(frac);
4794             }
4795             /* flush to zero */
4796             float_raise(float_flag_input_denormal_flushed, s);
4797         }
4798     } else if (unlikely(exp == 0x7ff)) {
4799         if (frac == 0) {
4800             return INT64_MAX; /* infinity */
4801         }
4802     } else {
4803         /* normal: exp - bias */
4804         return exp - 1023;
4805     }
4806     /* nan or zero */
4807     float_raise(float_flag_invalid, s);
4808     return INT64_MIN;
4809 }
4810 
4811 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4812 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4813 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4814 
4815 #undef DO_ZPZ_FP
4816 
4817 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4818                             float_status *status, uint32_t desc,
4819                             uint16_t neg1, uint16_t neg3, int flags)
4820 {
4821     intptr_t i = simd_oprsz(desc);
4822     uint64_t *g = vg;
4823 
4824     do {
4825         uint64_t pg = g[(i - 1) >> 6];
4826         do {
4827             i -= 2;
4828             if (likely((pg >> (i & 63)) & 1)) {
4829                 float16 e1, e2, e3, r;
4830 
4831                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4832                 e2 = *(uint16_t *)(vm + H1_2(i));
4833                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4834                 r = float16_muladd(e1, e2, e3, flags, status);
4835                 *(uint16_t *)(vd + H1_2(i)) = r;
4836             }
4837         } while (i & 63);
4838     } while (i != 0);
4839 }
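
/* The wrappers below obtain the four FMLA variants by flipping sign bits
 * (FPCR.AH == 0) or by passing muladd negation flags (FPCR.AH == 1):
 *
 *   FMLA:   d =  a +  n * m     neg1 = 0,      neg3 = 0
 *   FMLS:   d =  a + (-n) * m   neg1 = 0x8000, neg3 = 0
 *   FNMLA:  d = -a + (-n) * m   neg1 = 0x8000, neg3 = 0x8000
 *   FNMLS:  d = -a +  n * m     neg1 = 0,      neg3 = 0x8000
 *
 * With FPCR.AH set, the negations are instead requested via
 * float_muladd_negate_product / float_muladd_negate_c so that the sign
 * of a NaN operand is not flipped on the way in.
 */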
4840 
4841 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4842                               void *vg, float_status *status, uint32_t desc)
4843 {
4844     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4845 }
4846 
4847 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4848                               void *vg, float_status *status, uint32_t desc)
4849 {
4850     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
4851 }
4852 
4853 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4854                                void *vg, float_status *status, uint32_t desc)
4855 {
4856     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
4857 }
4858 
4859 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4860                                void *vg, float_status *status, uint32_t desc)
4861 {
4862     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
4863 }
4864 
4865 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4866                               void *vg, float_status *status, uint32_t desc)
4867 {
4868     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4869                     float_muladd_negate_product);
4870 }
4871 
4872 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4873                                void *vg, float_status *status, uint32_t desc)
4874 {
4875     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4876                     float_muladd_negate_product | float_muladd_negate_c);
4877 }
4878 
4879 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4880                                void *vg, float_status *status, uint32_t desc)
4881 {
4882     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4883                     float_muladd_negate_c);
4884 }
4885 
4886 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4887                             float_status *status, uint32_t desc,
4888                             uint32_t neg1, uint32_t neg3, int flags)
4889 {
4890     intptr_t i = simd_oprsz(desc);
4891     uint64_t *g = vg;
4892 
4893     do {
4894         uint64_t pg = g[(i - 1) >> 6];
4895         do {
4896             i -= 4;
4897             if (likely((pg >> (i & 63)) & 1)) {
4898                 float32 e1, e2, e3, r;
4899 
4900                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4901                 e2 = *(uint32_t *)(vm + H1_4(i));
4902                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4903                 r = float32_muladd(e1, e2, e3, flags, status);
4904                 *(uint32_t *)(vd + H1_4(i)) = r;
4905             }
4906         } while (i & 63);
4907     } while (i != 0);
4908 }
4909 
4910 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4911                               void *vg, float_status *status, uint32_t desc)
4912 {
4913     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4914 }
4915 
4916 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4917                               void *vg, float_status *status, uint32_t desc)
4918 {
4919     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0);
4920 }
4921 
4922 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4923                                void *vg, float_status *status, uint32_t desc)
4924 {
4925     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0);
4926 }
4927 
4928 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4929                                void *vg, float_status *status, uint32_t desc)
4930 {
4931     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0);
4932 }
4933 
4934 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4935                               void *vg, float_status *status, uint32_t desc)
4936 {
4937     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4938                     float_muladd_negate_product);
4939 }
4940 
4941 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4942                                void *vg, float_status *status, uint32_t desc)
4943 {
4944     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4945                     float_muladd_negate_product | float_muladd_negate_c);
4946 }
4947 
4948 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4949                                void *vg, float_status *status, uint32_t desc)
4950 {
4951     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4952                     float_muladd_negate_c);
4953 }
4954 
4955 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4956                             float_status *status, uint32_t desc,
4957                             uint64_t neg1, uint64_t neg3, int flags)
4958 {
4959     intptr_t i = simd_oprsz(desc);
4960     uint64_t *g = vg;
4961 
4962     do {
4963         uint64_t pg = g[(i - 1) >> 6];
4964         do {
4965             i -= 8;
4966             if (likely((pg >> (i & 63)) & 1)) {
4967                 float64 e1, e2, e3, r;
4968 
4969                 e1 = *(uint64_t *)(vn + i) ^ neg1;
4970                 e2 = *(uint64_t *)(vm + i);
4971                 e3 = *(uint64_t *)(va + i) ^ neg3;
4972                 r = float64_muladd(e1, e2, e3, flags, status);
4973                 *(uint64_t *)(vd + i) = r;
4974             }
4975         } while (i & 63);
4976     } while (i != 0);
4977 }
4978 
4979 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4980                               void *vg, float_status *status, uint32_t desc)
4981 {
4982     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4983 }
4984 
4985 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4986                               void *vg, float_status *status, uint32_t desc)
4987 {
4988     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0);
4989 }
4990 
4991 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4992                                void *vg, float_status *status, uint32_t desc)
4993 {
4994     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0);
4995 }
4996 
4997 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4998                                void *vg, float_status *status, uint32_t desc)
4999 {
5000     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0);
5001 }
5002 
5003 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5004                               void *vg, float_status *status, uint32_t desc)
5005 {
5006     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5007                     float_muladd_negate_product);
5008 }
5009 
5010 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5011                                void *vg, float_status *status, uint32_t desc)
5012 {
5013     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5014                     float_muladd_negate_product | float_muladd_negate_c);
5015 }
5016 
5017 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5018                                void *vg, float_status *status, uint32_t desc)
5019 {
5020     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5021                     float_muladd_negate_c);
5022 }
5023 
5024 /* Two-operand floating-point comparison controlled by a predicate.
5025  * Unlike the integer version, we are not allowed to optimistically
5026  * compare operands, since the comparison may have side effects with
5027  * respect to the FPSR.
5028  */
5029 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
5030 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
5031                   float_status *status, uint32_t desc)                  \
5032 {                                                                       \
5033     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
5034     uint64_t *d = vd, *g = vg;                                          \
5035     do {                                                                \
5036         uint64_t out = 0, pg = g[j];                                    \
5037         do {                                                            \
5038             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
5039             if (likely((pg >> (i & 63)) & 1)) {                         \
5040                 TYPE nn = *(TYPE *)(vn + H(i));                         \
5041                 TYPE mm = *(TYPE *)(vm + H(i));                         \
5042                 out |= OP(TYPE, nn, mm, status);                        \
5043             }                                                           \
5044         } while (i & 63);                                               \
5045         d[j--] = out;                                                   \
5046     } while (i > 0);                                                    \
5047 }
5048 
5049 #define DO_FPCMP_PPZZ_H(NAME, OP) \
5050     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
5051 #define DO_FPCMP_PPZZ_S(NAME, OP) \
5052     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
5053 #define DO_FPCMP_PPZZ_D(NAME, OP) \
5054     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
5055 
5056 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
5057     DO_FPCMP_PPZZ_H(NAME, OP)   \
5058     DO_FPCMP_PPZZ_S(NAME, OP)   \
5059     DO_FPCMP_PPZZ_D(NAME, OP)
5060 
5061 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
5062 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
5063 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
5064 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
5065 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
5066 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
5067 #define DO_FCMUO(TYPE, X, Y, ST)  \
5068     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
5069 #define DO_FACGE(TYPE, X, Y, ST)  \
5070     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
5071 #define DO_FACGT(TYPE, X, Y, ST)  \
5072     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
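
/* The ordered comparisons (FCMGE, FCMGT, FACGE, FACGT and the LE/LT
 * forms) use the signaling compare, while FCMEQ, FCMNE and FCMUO use the
 * quiet one, matching which comparisons may raise Invalid Operation for
 * a quiet NaN operand.  FACGE/FACGT also use the reversed-operand trick:
 * |X| >= |Y| is implemented as compare(|Y|, |X|) <= 0.
 */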
5073 
5074 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
5075 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
5076 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
5077 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
5078 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
5079 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
5080 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
5081 
5082 #undef DO_FPCMP_PPZZ_ALL
5083 #undef DO_FPCMP_PPZZ_D
5084 #undef DO_FPCMP_PPZZ_S
5085 #undef DO_FPCMP_PPZZ_H
5086 #undef DO_FPCMP_PPZZ
5087 
5088 /* One operand floating-point comparison against zero, controlled
5089  * by a predicate.
5090  */
5091 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
5092 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
5093                   float_status *status, uint32_t desc)     \
5094 {                                                          \
5095     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
5096     uint64_t *d = vd, *g = vg;                             \
5097     do {                                                   \
5098         uint64_t out = 0, pg = g[j];                       \
5099         do {                                               \
5100             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
5101             if ((pg >> (i & 63)) & 1) {                    \
5102                 TYPE nn = *(TYPE *)(vn + H(i));            \
5103                 out |= OP(TYPE, nn, 0, status);            \
5104             }                                              \
5105         } while (i & 63);                                  \
5106         d[j--] = out;                                      \
5107     } while (i > 0);                                       \
5108 }
5109 
5110 #define DO_FPCMP_PPZ0_H(NAME, OP) \
5111     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
5112 #define DO_FPCMP_PPZ0_S(NAME, OP) \
5113     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
5114 #define DO_FPCMP_PPZ0_D(NAME, OP) \
5115     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
5116 
5117 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
5118     DO_FPCMP_PPZ0_H(NAME, OP)   \
5119     DO_FPCMP_PPZ0_S(NAME, OP)   \
5120     DO_FPCMP_PPZ0_D(NAME, OP)
5121 
5122 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5123 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5124 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5125 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5126 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5127 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5128 
5129 /* FP Trig Multiply-Add. */
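
/* FTMAD computes d[i] = n[i] * |m[i]| + coeff[imm + (m[i] < 0 ? 8 : 0)].
 * The first eight table entries correspond to the Taylor series
 * coefficients for sine (1, -1/3!, 1/5!, ...), the second eight to those
 * for cosine (1, -1/2!, 1/4!, ...), which is how the FTSSEL/FTSMUL/FTMAD
 * sequence builds up sin(x) and cos(x).  With FPCR.AH set, |m| is
 * obtained by negating the product instead of the operand, so that NaN
 * signs are not modified.
 */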
5130 
5131 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm,
5132                          float_status *s, uint32_t desc)
5133 {
5134     static const float16 coeff[16] = {
5135         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5136         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5137     };
5138     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5139     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5140     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5141     float16 *d = vd, *n = vn, *m = vm;
5142 
5143     for (i = 0; i < opr_sz; i++) {
5144         float16 mm = m[i];
5145         intptr_t xx = x;
5146         int flags = 0;
5147 
5148         if (float16_is_neg(mm)) {
5149             if (fpcr_ah) {
5150                 flags = float_muladd_negate_product;
5151             } else {
5152                 mm = float16_abs(mm);
5153             }
5154             xx += 8;
5155         }
5156         d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s);
5157     }
5158 }
5159 
5160 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm,
5161                          float_status *s, uint32_t desc)
5162 {
5163     static const float32 coeff[16] = {
5164         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5165         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5166         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5167         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5168     };
5169     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5170     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5171     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5172     float32 *d = vd, *n = vn, *m = vm;
5173 
5174     for (i = 0; i < opr_sz; i++) {
5175         float32 mm = m[i];
5176         intptr_t xx = x;
5177         int flags = 0;
5178 
5179         if (float32_is_neg(mm)) {
5180             if (fpcr_ah) {
5181                 flags = float_muladd_negate_product;
5182             } else {
5183                 mm = float32_abs(mm);
5184             }
5185             xx += 8;
5186         }
5187         d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s);
5188     }
5189 }
5190 
5191 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm,
5192                          float_status *s, uint32_t desc)
5193 {
5194     static const float64 coeff[16] = {
5195         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5196         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5197         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5198         0x3de5d8408868552full, 0x0000000000000000ull,
5199         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5200         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5201         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5202         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5203     };
5204     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5205     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5206     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5207     float64 *d = vd, *n = vn, *m = vm;
5208 
5209     for (i = 0; i < opr_sz; i++) {
5210         float64 mm = m[i];
5211         intptr_t xx = x;
5212         int flags = 0;
5213 
5214         if (float64_is_neg(mm)) {
5215             if (fpcr_ah) {
5216                 flags = float_muladd_negate_product;
5217             } else {
5218                 mm = float64_abs(mm);
5219             }
5220             xx += 8;
5221         }
5222         d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s);
5223     }
5224 }
5225 
5226 /*
5227  * FP Complex Add
5228  */
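
/* Each loop iteration below handles one (real, imaginary) pair.  With
 * the rot flag clear, the imaginary operand feeding the real lane is
 * negated, giving (dr, di) = (nr - mi, ni + mr), i.e. n + i*m; with rot
 * set, the real operand feeding the imaginary lane is negated instead,
 * giving (nr + mi, ni - mr), i.e. n - i*m.  With FPCR.AH set the
 * negation must not change the sign of a NaN, hence
 * float*_maybe_ah_chs().
 */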
5229 
5230 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5231                          float_status *s, uint32_t desc)
5232 {
5233     intptr_t j, i = simd_oprsz(desc);
5234     uint64_t *g = vg;
5235     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5236     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5237 
5238     do {
5239         uint64_t pg = g[(i - 1) >> 6];
5240         do {
5241             float16 e0, e1, e2, e3;
5242 
5243             /* I holds the real index; J holds the imag index.  */
5244             j = i - sizeof(float16);
5245             i -= 2 * sizeof(float16);
5246 
5247             e0 = *(float16 *)(vn + H1_2(i));
5248             e1 = *(float16 *)(vm + H1_2(j));
5249             e2 = *(float16 *)(vn + H1_2(j));
5250             e3 = *(float16 *)(vm + H1_2(i));
5251 
5252             if (rot) {
5253                 e3 = float16_maybe_ah_chs(e3, fpcr_ah);
5254             } else {
5255                 e1 = float16_maybe_ah_chs(e1, fpcr_ah);
5256             }
5257 
5258             if (likely((pg >> (i & 63)) & 1)) {
5259                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s);
5260             }
5261             if (likely((pg >> (j & 63)) & 1)) {
5262                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s);
5263             }
5264         } while (i & 63);
5265     } while (i != 0);
5266 }
5267 
5268 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5269                          float_status *s, uint32_t desc)
5270 {
5271     intptr_t j, i = simd_oprsz(desc);
5272     uint64_t *g = vg;
5273     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5274     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5275 
5276     do {
5277         uint64_t pg = g[(i - 1) >> 6];
5278         do {
5279             float32 e0, e1, e2, e3;
5280 
5281             /* I holds the real index; J holds the imag index.  */
5282             j = i - sizeof(float32);
5283             i -= 2 * sizeof(float32);
5284 
5285             e0 = *(float32 *)(vn + H1_4(i));
5286             e1 = *(float32 *)(vm + H1_4(j));
5287             e2 = *(float32 *)(vn + H1_4(j));
5288             e3 = *(float32 *)(vm + H1_4(i));
5289 
5290             if (rot) {
5291                 e3 = float32_maybe_ah_chs(e3, fpcr_ah);
5292             } else {
5293                 e1 = float32_maybe_ah_chs(e1, fpcr_ah);
5294             }
5295 
5296             if (likely((pg >> (i & 63)) & 1)) {
5297                 *(float32 *)(vd + H1_4(i)) = float32_add(e0, e1, s);
5298             }
5299             if (likely((pg >> (j & 63)) & 1)) {
5300                 *(float32 *)(vd + H1_4(j)) = float32_add(e2, e3, s);
5301             }
5302         } while (i & 63);
5303     } while (i != 0);
5304 }
5305 
5306 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5307                          float_status *s, uint32_t desc)
5308 {
5309     intptr_t j, i = simd_oprsz(desc);
5310     uint64_t *g = vg;
5311     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5312     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5313 
5314     do {
5315         uint64_t pg = g[(i - 1) >> 6];
5316         do {
5317             float64 e0, e1, e2, e3;
5318 
5319             /* I holds the real index; J holds the imag index.  */
5320             j = i - sizeof(float64);
5321             i -= 2 * sizeof(float64);
5322 
5323             e0 = *(float64 *)(vn + H1_8(i));
5324             e1 = *(float64 *)(vm + H1_8(j));
5325             e2 = *(float64 *)(vn + H1_8(j));
5326             e3 = *(float64 *)(vm + H1_8(i));
5327 
5328             if (rot) {
5329                 e3 = float64_maybe_ah_chs(e3, fpcr_ah);
5330             } else {
5331                 e1 = float64_maybe_ah_chs(e1, fpcr_ah);
5332             }
5333 
5334             if (likely((pg >> (i & 63)) & 1)) {
5335                 *(float64 *)(vd + H1_8(i)) = float64_add(e0, e1, s);
5336             }
5337             if (likely((pg >> (j & 63)) & 1)) {
5338                 *(float64 *)(vd + H1_8(j)) = float64_add(e2, e3, s);
5339             }
5340         } while (i & 63);
5341     } while (i != 0);
5342 }
5343 
5344 /*
5345  * FP Complex Multiply
5346  */
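
/* Each FCMLA invocation accumulates one of the two partial products of
 * the complex multiply-add: with flip clear it is the real part of N
 * that multiplies M (d_r += nr * mr, d_i += nr * mi); with flip set it
 * is the imaginary part (d_r += ni * mi, d_i += ni * mr).  The
 * negf_real/negf_imag flags supply the sign of each product.  With
 * FPCR.AH == 0 the sign is applied by flipping the operand's sign bit
 * (negx_*); with AH == 1 it is passed to float*_muladd as
 * float_muladd_negate_product so NaN signs are left alone.
 */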
5347 
5348 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5349                                void *vg, float_status *status, uint32_t desc)
5350 {
5351     intptr_t j, i = simd_oprsz(desc);
5352     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5353     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5354     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5355     uint32_t negf_real = flip ^ negf_imag;
5356     float16 negx_imag, negx_real;
5357     uint64_t *g = vg;
5358 
5359     /* With AH=0, use negx; with AH=1 use negf. */
5360     negx_real = (negf_real & ~fpcr_ah) << 15;
5361     negx_imag = (negf_imag & ~fpcr_ah) << 15;
5362     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5363     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5364 
5365     do {
5366         uint64_t pg = g[(i - 1) >> 6];
5367         do {
5368             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5369 
5370             /* I holds the real index; J holds the imag index.  */
5371             j = i - sizeof(float16);
5372             i -= 2 * sizeof(float16);
5373 
5374             nr = *(float16 *)(vn + H1_2(i));
5375             ni = *(float16 *)(vn + H1_2(j));
5376             mr = *(float16 *)(vm + H1_2(i));
5377             mi = *(float16 *)(vm + H1_2(j));
5378 
5379             e2 = (flip ? ni : nr);
5380             e1 = (flip ? mi : mr) ^ negx_real;
5381             e4 = e2;
5382             e3 = (flip ? mr : mi) ^ negx_imag;
5383 
5384             if (likely((pg >> (i & 63)) & 1)) {
5385                 d = *(float16 *)(va + H1_2(i));
5386                 d = float16_muladd(e2, e1, d, negf_real, status);
5387                 *(float16 *)(vd + H1_2(i)) = d;
5388             }
5389             if (likely((pg >> (j & 63)) & 1)) {
5390                 d = *(float16 *)(va + H1_2(j));
5391                 d = float16_muladd(e4, e3, d, negf_imag, status);
5392                 *(float16 *)(vd + H1_2(j)) = d;
5393             }
5394         } while (i & 63);
5395     } while (i != 0);
5396 }
5397 
5398 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5399                                void *vg, float_status *status, uint32_t desc)
5400 {
5401     intptr_t j, i = simd_oprsz(desc);
5402     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5403     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5404     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5405     uint32_t negf_real = flip ^ negf_imag;
5406     float32 negx_imag, negx_real;
5407     uint64_t *g = vg;
5408 
5409     /* With AH=0, use negx; with AH=1 use negf. */
5410     negx_real = (negf_real & ~fpcr_ah) << 31;
5411     negx_imag = (negf_imag & ~fpcr_ah) << 31;
5412     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5413     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5414 
5415     do {
5416         uint64_t pg = g[(i - 1) >> 6];
5417         do {
5418             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5419 
5420             /* I holds the real index; J holds the imag index.  */
5421             j = i - sizeof(float32);
5422             i -= 2 * sizeof(float32);
5423 
5424             nr = *(float32 *)(vn + H1_2(i));
5425             ni = *(float32 *)(vn + H1_2(j));
5426             mr = *(float32 *)(vm + H1_2(i));
5427             mi = *(float32 *)(vm + H1_2(j));
5428 
5429             e2 = (flip ? ni : nr);
5430             e1 = (flip ? mi : mr) ^ negx_real;
5431             e4 = e2;
5432             e3 = (flip ? mr : mi) ^ negx_imag;
5433 
5434             if (likely((pg >> (i & 63)) & 1)) {
5435                 d = *(float32 *)(va + H1_2(i));
5436                 d = float32_muladd(e2, e1, d, negf_real, status);
5437                 *(float32 *)(vd + H1_2(i)) = d;
5438             }
5439             if (likely((pg >> (j & 63)) & 1)) {
5440                 d = *(float32 *)(va + H1_2(j));
5441                 d = float32_muladd(e4, e3, d, negf_imag, status);
5442                 *(float32 *)(vd + H1_2(j)) = d;
5443             }
5444         } while (i & 63);
5445     } while (i != 0);
5446 }
5447 
5448 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5449                                void *vg, float_status *status, uint32_t desc)
5450 {
5451     intptr_t j, i = simd_oprsz(desc);
5452     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5453     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5454     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5455     uint32_t negf_real = flip ^ negf_imag;
5456     float64 negx_imag, negx_real;
5457     uint64_t *g = vg;
5458 
5459     /* With AH=0, use negx; with AH=1 use negf. */
5460     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
5461     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
5462     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5463     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5464 
5465     do {
5466         uint64_t pg = g[(i - 1) >> 6];
5467         do {
5468             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5469 
5470             /* I holds the real index; J holds the imag index.  */
5471             j = i - sizeof(float64);
5472             i -= 2 * sizeof(float64);
5473 
5474             nr = *(float64 *)(vn + H1_2(i));
5475             ni = *(float64 *)(vn + H1_2(j));
5476             mr = *(float64 *)(vm + H1_2(i));
5477             mi = *(float64 *)(vm + H1_2(j));
5478 
5479             e2 = (flip ? ni : nr);
5480             e1 = (flip ? mi : mr) ^ negx_real;
5481             e4 = e2;
5482             e3 = (flip ? mr : mi) ^ negx_imag;
5483 
5484             if (likely((pg >> (i & 63)) & 1)) {
5485                 d = *(float64 *)(va + H1_2(i));
5486                 d = float64_muladd(e2, e1, d, negf_real, status);
5487                 *(float64 *)(vd + H1_2(i)) = d;
5488             }
5489             if (likely((pg >> (j & 63)) & 1)) {
5490                 d = *(float64 *)(va + H1_2(j));
5491                 d = float64_muladd(e4, e3, d, negf_imag, status);
5492                 *(float64 *)(vd + H1_2(j)) = d;
5493             }
5494         } while (i & 63);
5495     } while (i != 0);
5496 }
5497 
5498 /*
5499  * Load contiguous data, protected by a governing predicate.
5500  */
5501 
5502 /*
5503  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5504  * beginning at @reg_off, bounded by @reg_max.  Return the offset of the first
5505  * active element >= @reg_off, or @reg_max if there are no active elements at all.
5506  */
5507 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5508                                  intptr_t reg_max, int esz)
5509 {
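    /*
     * pred_esz_masks[esz] keeps only the canonical (lowest) predicate bit
     * of each element of size 1 << esz, so every set bit in PG below
     * corresponds to one active element and ctz64() always lands on an
     * element boundary.
     */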
5510     uint64_t pg_mask = pred_esz_masks[esz];
5511     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5512 
5513     /* In normal usage, the first element is active.  */
5514     if (likely(pg & 1)) {
5515         return reg_off;
5516     }
5517 
5518     if (pg == 0) {
5519         reg_off &= -64;
5520         do {
5521             reg_off += 64;
5522             if (unlikely(reg_off >= reg_max)) {
5523                 /* The entire predicate was false.  */
5524                 return reg_max;
5525             }
5526             pg = vg[reg_off >> 6] & pg_mask;
5527         } while (pg == 0);
5528     }
5529     reg_off += ctz64(pg);
5530 
5531     /* We should never see an out of range predicate bit set.  */
5532     tcg_debug_assert(reg_off < reg_max);
5533     return reg_off;
5534 }
5535 
5536 /*
5537  * Resolve the guest virtual address to info->host and info->flags.
5538  * If @nofault, return false if the page is invalid, otherwise
5539  * exit via page fault exception.
5540  */
5541 
5542 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5543                     target_ulong addr, int mem_off, MMUAccessType access_type,
5544                     int mmu_idx, uintptr_t retaddr)
5545 {
5546     int flags;
5547 
5548     addr += mem_off;
5549 
5550     /*
5551      * User-only currently always runs with TBI enabled.  See the comment
5552      * above useronly_clean_ptr.  Usually we clean this top byte away
5553      * during translation, but we can't do that for e.g. vector + imm
5554      * addressing modes.
5555      *
5556      * We currently always enable TBI for user-only, and do not provide
5557      * a way to turn it off.  So clean the pointer unconditionally here,
5558      * rather than look it up here, or pass it down from above.
5559      */
5560     addr = useronly_clean_ptr(addr);
5561 
5562 #ifdef CONFIG_USER_ONLY
5563     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5564                                &info->host, retaddr);
5565 #else
5566     CPUTLBEntryFull *full;
5567     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5568                               &info->host, &full, retaddr);
5569 #endif
5570     info->flags = flags;
5571 
5572     if (flags & TLB_INVALID_MASK) {
5573         g_assert(nofault);
5574         return false;
5575     }
5576 
5577 #ifdef CONFIG_USER_ONLY
5578     memset(&info->attrs, 0, sizeof(info->attrs));
5579     /* Require both ANON and MTE; see allocation_tag_mem(). */
5580     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5581 #else
5582     info->attrs = full->attrs;
5583     info->tagged = full->extra.arm.pte_attrs == 0xf0;
5584 #endif
5585 
5586     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5587     info->host -= mem_off;
5588     return true;
5589 }
5590 
5591 /*
5592  * Find first active element on each page, and a loose bound for the
5593  * final element on each page.  Identify any single element that spans
5594  * the page boundary.  Return true if there are any active elements.
5595  */
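/*
 * A purely illustrative example: with 4-byte elements (esz == MO_32,
 * msize == 4), reg_max == 32, all elements active, and a page boundary
 * 18 bytes past @addr, elements 0-3 are wholly on the first page, so
 * reg_off_last[0] == 12; element 4 (bytes 16-19) straddles the boundary,
 * giving reg_off_split == 16; and elements 5-7 lie on the second page,
 * giving reg_off_first[1] == 20 and reg_off_last[1] == 28.
 */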
5596 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5597                             intptr_t reg_max, int esz, int msize)
5598 {
5599     const int esize = 1 << esz;
5600     const uint64_t pg_mask = pred_esz_masks[esz];
5601     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5602     intptr_t mem_off_last, mem_off_split;
5603     intptr_t page_split, elt_split;
5604     intptr_t i;
5605 
5606     /* Set all of the element indices to -1, and the TLB data to 0. */
5607     memset(info, -1, offsetof(SVEContLdSt, page));
5608     memset(info->page, 0, sizeof(info->page));
5609 
5610     /* Gross scan over the entire predicate to find bounds. */
5611     i = 0;
5612     do {
5613         uint64_t pg = vg[i] & pg_mask;
5614         if (pg) {
5615             reg_off_last = i * 64 + 63 - clz64(pg);
5616             if (reg_off_first < 0) {
5617                 reg_off_first = i * 64 + ctz64(pg);
5618             }
5619         }
5620     } while (++i * 64 < reg_max);
5621 
5622     if (unlikely(reg_off_first < 0)) {
5623         /* No active elements, no pages touched. */
5624         return false;
5625     }
5626     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5627 
5628     info->reg_off_first[0] = reg_off_first;
5629     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5630     mem_off_last = (reg_off_last >> esz) * msize;
5631 
5632     page_split = -(addr | TARGET_PAGE_MASK);
5633     if (likely(mem_off_last + msize <= page_split)) {
5634         /* The entire operation fits within a single page. */
5635         info->reg_off_last[0] = reg_off_last;
5636         return true;
5637     }
5638 
5639     info->page_split = page_split;
5640     elt_split = page_split / msize;
5641     reg_off_split = elt_split << esz;
5642     mem_off_split = elt_split * msize;
5643 
5644     /*
5645      * This is the last full element on the first page, but it is not
5646      * necessarily active.  If there is no full element, i.e. the first
5647      * active element is the one that's split, this value remains -1.
5648      * It is useful as an iteration bound.
5649      */
5650     if (elt_split != 0) {
5651         info->reg_off_last[0] = reg_off_split - esize;
5652     }
5653 
5654     /* Determine if an unaligned element spans the pages.  */
5655     if (page_split % msize != 0) {
5656         /* It is helpful to know if the split element is active. */
5657         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5658             info->reg_off_split = reg_off_split;
5659             info->mem_off_split = mem_off_split;
5660 
5661             if (reg_off_split == reg_off_last) {
5662                 /* The page crossing element is last. */
5663                 return true;
5664             }
5665         }
5666         reg_off_split += esize;
5667         mem_off_split += msize;
5668     }
5669 
5670     /*
5671      * We do want the first active element on the second page, because
5672      * this may affect the address reported in an exception.
5673      */
5674     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5675     tcg_debug_assert(reg_off_split <= reg_off_last);
5676     info->reg_off_first[1] = reg_off_split;
5677     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5678     info->reg_off_last[1] = reg_off_last;
5679     return true;
5680 }
5681 
5682 /*
5683  * Resolve the guest virtual addresses to info->page[].
5684  * Control the generation of page faults with @fault.  Return false if
5685  * there is no work to do, which can only happen with @fault == FAULT_NO.
5686  */
5687 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5688                          CPUARMState *env, target_ulong addr,
5689                          MMUAccessType access_type, uintptr_t retaddr)
5690 {
5691     int mmu_idx = arm_env_mmu_index(env);
5692     int mem_off = info->mem_off_first[0];
5693     bool nofault = fault == FAULT_NO;
5694     bool have_work = true;
5695 
5696     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5697                         access_type, mmu_idx, retaddr)) {
5698         /* No work to be done. */
5699         return false;
5700     }
5701 
5702     if (likely(info->page_split < 0)) {
5703         /* The entire operation was on the one page. */
5704         return true;
5705     }
5706 
5707     /*
5708      * If the second page is invalid, then we want the fault address to be
5709      * the first byte on that page which is accessed.
5710      */
5711     if (info->mem_off_split >= 0) {
5712         /*
5713          * There is an element split across the pages.  The fault address
5714          * should be the first byte of the second page.
5715          */
5716         mem_off = info->page_split;
5717         /*
5718          * If the split element is also the first active element
5719          * of the vector, then:  For first-fault we should continue
5720          * to generate faults for the second page.  For no-fault,
5721          * we have work only if the second page is valid.
5722          */
5723         if (info->mem_off_first[0] < info->mem_off_split) {
5724             nofault = FAULT_FIRST;
5725             have_work = false;
5726         }
5727     } else {
5728         /*
5729          * There is no element split across the pages.  The fault address
5730          * should be the first active element on the second page.
5731          */
5732         mem_off = info->mem_off_first[1];
5733         /*
5734          * There must have been one active element on the first page,
5735          * so we're out of first-fault territory.
5736          */
5737         nofault = fault != FAULT_ALL;
5738     }
5739 
5740     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5741                                 access_type, mmu_idx, retaddr);
5742     return have_work;
5743 }
5744 
5745 #ifndef CONFIG_USER_ONLY
5746 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5747                                uint64_t *vg, target_ulong addr,
5748                                int esize, int msize, int wp_access,
5749                                uintptr_t retaddr)
5750 {
5751     intptr_t mem_off, reg_off, reg_last;
5752     int flags0 = info->page[0].flags;
5753     int flags1 = info->page[1].flags;
5754 
5755     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5756         return;
5757     }
5758 
5759     /* Indicate that watchpoints are handled. */
5760     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5761     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5762 
5763     if (flags0 & TLB_WATCHPOINT) {
5764         mem_off = info->mem_off_first[0];
5765         reg_off = info->reg_off_first[0];
5766         reg_last = info->reg_off_last[0];
5767 
5768         while (reg_off <= reg_last) {
5769             uint64_t pg = vg[reg_off >> 6];
5770             do {
5771                 if ((pg >> (reg_off & 63)) & 1) {
5772                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5773                                          msize, info->page[0].attrs,
5774                                          wp_access, retaddr);
5775                 }
5776                 reg_off += esize;
5777                 mem_off += msize;
5778             } while (reg_off <= reg_last && (reg_off & 63));
5779         }
5780     }
5781 
5782     mem_off = info->mem_off_split;
5783     if (mem_off >= 0) {
5784         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5785                              info->page[0].attrs, wp_access, retaddr);
5786     }
5787 
5788     mem_off = info->mem_off_first[1];
5789     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5790         reg_off = info->reg_off_first[1];
5791         reg_last = info->reg_off_last[1];
5792 
5793         do {
5794             uint64_t pg = vg[reg_off >> 6];
5795             do {
5796                 if ((pg >> (reg_off & 63)) & 1) {
5797                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5798                                          msize, info->page[1].attrs,
5799                                          wp_access, retaddr);
5800                 }
5801                 reg_off += esize;
5802                 mem_off += msize;
5803             } while (reg_off & 63);
5804         } while (reg_off <= reg_last);
5805     }
5806 }
5807 #endif
5808 
5809 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5810                              uint64_t *vg, target_ulong addr, int esize,
5811                              int msize, uint32_t mtedesc, uintptr_t ra)
5812 {
5813     intptr_t mem_off, reg_off, reg_last;
5814 
5815     /* Process the page only if MemAttr == Tagged. */
5816     if (info->page[0].tagged) {
5817         mem_off = info->mem_off_first[0];
5818         reg_off = info->reg_off_first[0];
5819         reg_last = info->reg_off_split;
5820         if (reg_last < 0) {
5821             reg_last = info->reg_off_last[0];
5822         }
5823 
5824         do {
5825             uint64_t pg = vg[reg_off >> 6];
5826             do {
5827                 if ((pg >> (reg_off & 63)) & 1) {
5828                     mte_check(env, mtedesc, addr + mem_off, ra);
5829                 }
5830                 reg_off += esize;
5831                 mem_off += msize;
5832             } while (reg_off <= reg_last && (reg_off & 63));
5833         } while (reg_off <= reg_last);
5834     }
5835 
5836     mem_off = info->mem_off_first[1];
5837     if (mem_off >= 0 && info->page[1].tagged) {
5838         reg_off = info->reg_off_first[1];
5839         reg_last = info->reg_off_last[1];
5840 
5841         do {
5842             uint64_t pg = vg[reg_off >> 6];
5843             do {
5844                 if ((pg >> (reg_off & 63)) & 1) {
5845                     mte_check(env, mtedesc, addr + mem_off, ra);
5846                 }
5847                 reg_off += esize;
5848                 mem_off += msize;
5849             } while (reg_off & 63);
5850         } while (reg_off <= reg_last);
5851     }
5852 }
5853 
5854 /*
5855  * Common helper for all contiguous 1,2,3,4-register predicated loads.
5856  */
5857 static inline QEMU_ALWAYS_INLINE
5858 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5859                uint32_t desc, const uintptr_t retaddr,
5860                const int esz, const int msz, const int N, uint32_t mtedesc,
5861                sve_ldst1_host_fn *host_fn,
5862                sve_ldst1_tlb_fn *tlb_fn)
5863 {
5864     const unsigned rd = simd_data(desc);
5865     const intptr_t reg_max = simd_oprsz(desc);
5866     intptr_t reg_off, reg_last, mem_off;
5867     SVEContLdSt info;
5868     void *host;
5869     int flags, i;
5870 
5871     /* Find the active elements.  */
5872     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5873         /* The entire predicate was false; no load occurs.  */
5874         for (i = 0; i < N; ++i) {
5875             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5876         }
5877         return;
5878     }
5879 
5880     /* Probe the page(s).  Exit with exception for any invalid page. */
5881     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5882 
5883     /* Handle watchpoints for all active elements. */
5884     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5885                               BP_MEM_READ, retaddr);
5886 
5887     /*
5888      * Handle mte checks for all active elements.
5889      * Since TBI must be set for MTE, !mtedesc => !mte_active.
5890      */
5891     if (mtedesc) {
5892         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5893                                 mtedesc, retaddr);
5894     }
5895 
5896     flags = info.page[0].flags | info.page[1].flags;
5897     if (unlikely(flags != 0)) {
5898         /*
5899          * At least one page includes MMIO.
5900          * Any bus operation can fail with cpu_transaction_failed,
5901          * which for ARM will raise SyncExternal.  Perform the load
5902          * into scratch memory to preserve register state until the end.
5903          */
5904         ARMVectorReg scratch[4] = { };
5905 
5906         mem_off = info.mem_off_first[0];
5907         reg_off = info.reg_off_first[0];
5908         reg_last = info.reg_off_last[1];
5909         if (reg_last < 0) {
5910             reg_last = info.reg_off_split;
5911             if (reg_last < 0) {
5912                 reg_last = info.reg_off_last[0];
5913             }
5914         }
5915 
5916         do {
5917             uint64_t pg = vg[reg_off >> 6];
5918             do {
5919                 if ((pg >> (reg_off & 63)) & 1) {
5920                     for (i = 0; i < N; ++i) {
5921                         tlb_fn(env, &scratch[i], reg_off,
5922                                addr + mem_off + (i << msz), retaddr);
5923                     }
5924                 }
5925                 reg_off += 1 << esz;
5926                 mem_off += N << msz;
5927             } while (reg_off & 63);
5928         } while (reg_off <= reg_last);
5929 
5930         for (i = 0; i < N; ++i) {
5931             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5932         }
5933         return;
5934     }
5935 
5936     /* The entire operation is in RAM, on valid pages. */
5937 
5938     for (i = 0; i < N; ++i) {
5939         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5940     }
5941 
5942     mem_off = info.mem_off_first[0];
5943     reg_off = info.reg_off_first[0];
5944     reg_last = info.reg_off_last[0];
5945     host = info.page[0].host;
5946 
5947     set_helper_retaddr(retaddr);
5948 
5949     while (reg_off <= reg_last) {
5950         uint64_t pg = vg[reg_off >> 6];
5951         do {
5952             if ((pg >> (reg_off & 63)) & 1) {
5953                 for (i = 0; i < N; ++i) {
5954                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5955                             host + mem_off + (i << msz));
5956                 }
5957             }
5958             reg_off += 1 << esz;
5959             mem_off += N << msz;
5960         } while (reg_off <= reg_last && (reg_off & 63));
5961     }
5962 
5963     clear_helper_retaddr();
5964 
5965     /*
5966      * Use the slow path to manage the cross-page misalignment.
5967      * But we know this is RAM and cannot trap.
5968      */
5969     mem_off = info.mem_off_split;
5970     if (unlikely(mem_off >= 0)) {
5971         reg_off = info.reg_off_split;
5972         for (i = 0; i < N; ++i) {
5973             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5974                    addr + mem_off + (i << msz), retaddr);
5975         }
5976     }
5977 
5978     mem_off = info.mem_off_first[1];
5979     if (unlikely(mem_off >= 0)) {
5980         reg_off = info.reg_off_first[1];
5981         reg_last = info.reg_off_last[1];
5982         host = info.page[1].host;
5983 
5984         set_helper_retaddr(retaddr);
5985 
5986         do {
5987             uint64_t pg = vg[reg_off >> 6];
5988             do {
5989                 if ((pg >> (reg_off & 63)) & 1) {
5990                     for (i = 0; i < N; ++i) {
5991                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5992                                 host + mem_off + (i << msz));
5993                     }
5994                 }
5995                 reg_off += 1 << esz;
5996                 mem_off += N << msz;
5997             } while (reg_off & 63);
5998         } while (reg_off <= reg_last);
5999 
6000         clear_helper_retaddr();
6001     }
6002 }
6003 
6004 static inline QEMU_ALWAYS_INLINE
6005 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6006                    uint32_t desc, const uintptr_t ra,
6007                    const int esz, const int msz, const int N,
6008                    sve_ldst1_host_fn *host_fn,
6009                    sve_ldst1_tlb_fn *tlb_fn)
6010 {
6011     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6012     int bit55 = extract64(addr, 55, 1);
6013 
6014     /* Remove mtedesc from the normal sve descriptor. */
6015     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6016 
6017     /* Perform gross MTE suppression early. */
6018     if (!tbi_check(mtedesc, bit55) ||
6019         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6020         mtedesc = 0;
6021     }
6022 
6023     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6024 }
6025 
6026 #define DO_LD1_1(NAME, ESZ)                                             \
6027 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
6028                             target_ulong addr, uint32_t desc)           \
6029 {                                                                       \
6030     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
6031               sve_##NAME##_host, sve_##NAME##_tlb);                     \
6032 }                                                                       \
6033 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
6034                                 target_ulong addr, uint32_t desc)       \
6035 {                                                                       \
6036     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
6037                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
6038 }
6039 
6040 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
6041 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
6042                                target_ulong addr, uint32_t desc)        \
6043 {                                                                       \
6044     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6045               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
6046 }                                                                       \
6047 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
6048                                target_ulong addr, uint32_t desc)        \
6049 {                                                                       \
6050     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6051               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
6052 }                                                                       \
6053 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
6054                                    target_ulong addr, uint32_t desc)    \
6055 {                                                                       \
6056     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6057                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
6058 }                                                                       \
6059 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
6060                                    target_ulong addr, uint32_t desc)    \
6061 {                                                                       \
6062     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6063                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
6064 }
6065 
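/*
 * Instantiate the single-register load helpers.  The suffix encodes the
 * memory and element sizes: e.g. ld1bhu is "load bytes, zero-extend into
 * halfword elements" and ld1sds is "load words, sign-extend into
 * doubleword elements".
 */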
6066 DO_LD1_1(ld1bb,  MO_8)
6067 DO_LD1_1(ld1bhu, MO_16)
6068 DO_LD1_1(ld1bhs, MO_16)
6069 DO_LD1_1(ld1bsu, MO_32)
6070 DO_LD1_1(ld1bss, MO_32)
6071 DO_LD1_1(ld1bdu, MO_64)
6072 DO_LD1_1(ld1bds, MO_64)
6073 
6074 DO_LD1_2(ld1hh,  MO_16, MO_16)
6075 DO_LD1_2(ld1hsu, MO_32, MO_16)
6076 DO_LD1_2(ld1hss, MO_32, MO_16)
6077 DO_LD1_2(ld1hdu, MO_64, MO_16)
6078 DO_LD1_2(ld1hds, MO_64, MO_16)
6079 
6080 DO_LD1_2(ld1ss,  MO_32, MO_32)
6081 DO_LD1_2(ld1sdu, MO_64, MO_32)
6082 DO_LD1_2(ld1sds, MO_64, MO_32)
6083 
6084 DO_LD1_2(ld1dd,  MO_64, MO_64)
6085 
6086 #undef DO_LD1_1
6087 #undef DO_LD1_2
6088 
6089 #define DO_LDN_1(N)                                                     \
6090 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
6091                              target_ulong addr, uint32_t desc)          \
6092 {                                                                       \
6093     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
6094               sve_ld1bb_host, sve_ld1bb_tlb);                           \
6095 }                                                                       \
6096 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
6097                                  target_ulong addr, uint32_t desc)      \
6098 {                                                                       \
6099     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
6100                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
6101 }
6102 
6103 #define DO_LDN_2(N, SUFF, ESZ)                                          \
6104 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
6105                                     target_ulong addr, uint32_t desc)   \
6106 {                                                                       \
6107     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6108               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
6109 }                                                                       \
6110 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
6111                                     target_ulong addr, uint32_t desc)   \
6112 {                                                                       \
6113     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6114               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
6115 }                                                                       \
6116 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
6117                                         target_ulong addr, uint32_t desc) \
6118 {                                                                       \
6119     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6120                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
6121 }                                                                       \
6122 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
6123                                         target_ulong addr, uint32_t desc) \
6124 {                                                                       \
6125     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6126                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
6127 }
6128 
6129 DO_LDN_1(2)
6130 DO_LDN_1(3)
6131 DO_LDN_1(4)
6132 
6133 DO_LDN_2(2, hh, MO_16)
6134 DO_LDN_2(3, hh, MO_16)
6135 DO_LDN_2(4, hh, MO_16)
6136 
6137 DO_LDN_2(2, ss, MO_32)
6138 DO_LDN_2(3, ss, MO_32)
6139 DO_LDN_2(4, ss, MO_32)
6140 
6141 DO_LDN_2(2, dd, MO_64)
6142 DO_LDN_2(3, dd, MO_64)
6143 DO_LDN_2(4, dd, MO_64)
6144 
6145 #undef DO_LDN_1
6146 #undef DO_LDN_2
6147 
6148 /*
6149  * Load contiguous data, first-fault and no-fault.
6150  *
6151  * For user-only, we control the race between page_check_range and
6152  * another thread's munmap by using set/clear_helper_retaddr.  Any
6153  * SEGV that occurs between those markers is assumed to be because
6154  * the guest page vanished.  Keep that block as small as possible
6155  * so that unrelated QEMU bugs are not blamed on the guest.
6156  */
6157 
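/*
 * Concretely, the fast paths below bracket their direct host accesses as
 * (sketch only):
 *
 *     set_helper_retaddr(retaddr);
 *     ... host_fn() accesses to info.page[n].host ...
 *     clear_helper_retaddr();
 *
 * so that a SIGSEGV raised inside the bracket is attributed to the guest
 * mapping having vanished rather than to a QEMU bug.
 */
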
6158 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
6159  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6160  * option, which leaves subsequent data unchanged.
6161  */
6162 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6163 {
6164     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6165 
6166     if (i & 63) {
6167         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6168         i = ROUND_UP(i, 64);
6169     }
6170     for (; i < oprsz; i += 64) {
6171         ffr[i / 64] = 0;
6172     }
6173 }
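
/*
 * For example (illustrative only): if the third active element of an
 * LDFF1 faults, record_fault() leaves the FFR bits below that element
 * unchanged and clears the rest, so the guest can read FFR, consume the
 * elements that did load, and retry from the faulting element.
 */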
6174 
6175 /*
6176  * Common helper for all contiguous no-fault and first-fault loads.
6177  */
6178 static inline QEMU_ALWAYS_INLINE
6179 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6180                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6181                    const int esz, const int msz, const SVEContFault fault,
6182                    sve_ldst1_host_fn *host_fn,
6183                    sve_ldst1_tlb_fn *tlb_fn)
6184 {
6185     const unsigned rd = simd_data(desc);
6186     void *vd = &env->vfp.zregs[rd];
6187     const intptr_t reg_max = simd_oprsz(desc);
6188     intptr_t reg_off, mem_off, reg_last;
6189     SVEContLdSt info;
6190     int flags;
6191     void *host;
6192 
6193     /* Find the active elements.  */
6194     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6195         /* The entire predicate was false; no load occurs.  */
6196         memset(vd, 0, reg_max);
6197         return;
6198     }
6199     reg_off = info.reg_off_first[0];
6200 
6201     /* Probe the page(s). */
6202     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6203         /* Fault on first element. */
6204         tcg_debug_assert(fault == FAULT_NO);
6205         memset(vd, 0, reg_max);
6206         goto do_fault;
6207     }
6208 
6209     mem_off = info.mem_off_first[0];
6210     flags = info.page[0].flags;
6211 
6212     /*
6213      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6214      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6215      */
6216     if (!info.page[0].tagged) {
6217         mtedesc = 0;
6218     }
6219 
6220     if (fault == FAULT_FIRST) {
6221         /* Trapping mte check for the first-fault element.  */
6222         if (mtedesc) {
6223             mte_check(env, mtedesc, addr + mem_off, retaddr);
6224         }
6225 
6226         /*
6227          * Special handling of the first active element,
6228          * if it crosses a page boundary or is MMIO.
6229          */
6230         bool is_split = mem_off == info.mem_off_split;
6231         if (unlikely(flags != 0) || unlikely(is_split)) {
6232             /*
6233              * Use the slow path for cross-page handling.
6234              * Might trap for MMIO or watchpoints.
6235              */
6236             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6237 
6238             /* After any fault, zero the other elements. */
6239             swap_memzero(vd, reg_off);
6240             reg_off += 1 << esz;
6241             mem_off += 1 << msz;
6242             swap_memzero(vd + reg_off, reg_max - reg_off);
6243 
6244             if (is_split) {
6245                 goto second_page;
6246             }
6247         } else {
6248             memset(vd, 0, reg_max);
6249         }
6250     } else {
6251         memset(vd, 0, reg_max);
6252         if (unlikely(mem_off == info.mem_off_split)) {
6253             /* The first active element crosses a page boundary. */
6254             flags |= info.page[1].flags;
6255             if (unlikely(flags & TLB_MMIO)) {
6256                 /* Some page is MMIO, see below. */
6257                 goto do_fault;
6258             }
6259             if (unlikely(flags & TLB_WATCHPOINT) &&
6260                 (cpu_watchpoint_address_matches
6261                  (env_cpu(env), addr + mem_off, 1 << msz)
6262                  & BP_MEM_READ)) {
6263                 /* Watchpoint hit, see below. */
6264                 goto do_fault;
6265             }
6266             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6267                 goto do_fault;
6268             }
6269             /*
6270              * Use the slow path for cross-page handling.
6271              * This is RAM, without a watchpoint, and will not trap.
6272              */
6273             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6274             goto second_page;
6275         }
6276     }
6277 
6278     /*
6279      * From this point on, all memory operations are MemSingleNF.
6280      *
6281      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6282      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6283      *
6284      * Unfortunately we do not have access to the memory attributes from the
6285      * PTE to tell Device memory from Normal memory.  So we make a mostly
6286      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6287      * This gives the right answer for the common cases of "Normal memory,
6288      * backed by host RAM" and "Device memory, backed by MMIO".
6289      * The architecture allows us to suppress an NF load and return
6290      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6291      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6292      * get wrong is "Device memory, backed by host RAM", for which we
6293      * should return (UNKNOWN, FAULT) for but do not.
6294      *
6295      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6296      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6297      * architectural breakpoints the same.
6298      */
6299     if (unlikely(flags & TLB_MMIO)) {
6300         goto do_fault;
6301     }
6302 
6303     reg_last = info.reg_off_last[0];
6304     host = info.page[0].host;
6305 
6306     set_helper_retaddr(retaddr);
6307 
6308     do {
6309         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 6) * 8);
6310         do {
6311             if ((pg >> (reg_off & 63)) & 1) {
6312                 if (unlikely(flags & TLB_WATCHPOINT) &&
6313                     (cpu_watchpoint_address_matches
6314                      (env_cpu(env), addr + mem_off, 1 << msz)
6315                      & BP_MEM_READ)) {
6316                     clear_helper_retaddr();
6317                     goto do_fault;
6318                 }
6319                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6320                     clear_helper_retaddr();
6321                     goto do_fault;
6322                 }
6323                 host_fn(vd, reg_off, host + mem_off);
6324             }
6325             reg_off += 1 << esz;
6326             mem_off += 1 << msz;
6327         } while (reg_off <= reg_last && (reg_off & 63));
6328     } while (reg_off <= reg_last);
6329 
6330     clear_helper_retaddr();
6331 
6332     /*
6333      * MemSingleNF is allowed to fail for any reason.  We have special
6334      * code above to handle the first element crossing a page boundary.
6335      * As an implementation choice, decline to handle a cross-page element
6336      * in any other position.
6337      */
6338     reg_off = info.reg_off_split;
6339     if (reg_off >= 0) {
6340         goto do_fault;
6341     }
6342 
6343  second_page:
6344     reg_off = info.reg_off_first[1];
6345     if (likely(reg_off < 0)) {
6346         /* No active elements on the second page.  All done. */
6347         return;
6348     }
6349 
6350     /*
6351      * MemSingleNF is allowed to fail for any reason.  As an implementation
6352      * choice, decline to handle elements on the second page.  This should
6353      * be low frequency as the guest walks through memory -- the next
6354      * iteration of the guest's loop should be aligned on the page boundary,
6355      * and then all following iterations will stay aligned.
6356      */
6357 
6358  do_fault:
6359     record_fault(env, reg_off, reg_max);
6360 }
6361 
6362 static inline QEMU_ALWAYS_INLINE
6363 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6364                        uint32_t desc, const uintptr_t retaddr,
6365                        const int esz, const int msz, const SVEContFault fault,
6366                        sve_ldst1_host_fn *host_fn,
6367                        sve_ldst1_tlb_fn *tlb_fn)
6368 {
6369     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6370     int bit55 = extract64(addr, 55, 1);
6371 
6372     /* Remove mtedesc from the normal sve descriptor. */
6373     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6374 
6375     /* Perform gross MTE suppression early. */
6376     if (!tbi_check(mtedesc, bit55) ||
6377         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6378         mtedesc = 0;
6379     }
6380 
6381     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6382                   esz, msz, fault, host_fn, tlb_fn);
6383 }
6384 
6385 #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6386 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6387                                  target_ulong addr, uint32_t desc)      \
6388 {                                                                       \
6389     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6390                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6391 }                                                                       \
6392 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6393                                  target_ulong addr, uint32_t desc)      \
6394 {                                                                       \
6395     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6396                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6397 }                                                                       \
6398 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6399                                      target_ulong addr, uint32_t desc)  \
6400 {                                                                       \
6401     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6402                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6403 }                                                                       \
6404 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6405                                      target_ulong addr, uint32_t desc)  \
6406 {                                                                       \
6407     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6408                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6409 }
6410 
6411 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6412 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6413                                     target_ulong addr, uint32_t desc)   \
6414 {                                                                       \
6415     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6416                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6417 }                                                                       \
6418 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6419                                     target_ulong addr, uint32_t desc)   \
6420 {                                                                       \
6421     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6422                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6423 }                                                                       \
6424 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6425                                     target_ulong addr, uint32_t desc)   \
6426 {                                                                       \
6427     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6428                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6429 }                                                                       \
6430 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6431                                     target_ulong addr, uint32_t desc)   \
6432 {                                                                       \
6433     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6434                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6435 }                                                                       \
6436 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6437                                         target_ulong addr, uint32_t desc) \
6438 {                                                                       \
6439     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6440                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6441 }                                                                       \
6442 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6443                                         target_ulong addr, uint32_t desc) \
6444 {                                                                       \
6445     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6446                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6447 }                                                                       \
6448 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6449                                         target_ulong addr, uint32_t desc) \
6450 {                                                                       \
6451     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6452                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6453 }                                                                       \
6454 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6455                                         target_ulong addr, uint32_t desc) \
6456 {                                                                       \
6457     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6458                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6459 }
6460 
6461 DO_LDFF1_LDNF1_1(bb,  MO_8)
6462 DO_LDFF1_LDNF1_1(bhu, MO_16)
6463 DO_LDFF1_LDNF1_1(bhs, MO_16)
6464 DO_LDFF1_LDNF1_1(bsu, MO_32)
6465 DO_LDFF1_LDNF1_1(bss, MO_32)
6466 DO_LDFF1_LDNF1_1(bdu, MO_64)
6467 DO_LDFF1_LDNF1_1(bds, MO_64)
6468 
6469 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6470 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6471 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6472 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6473 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6474 
6475 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6476 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6477 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6478 
6479 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6480 
6481 #undef DO_LDFF1_LDNF1_1
6482 #undef DO_LDFF1_LDNF1_2
6483 
6484 /*
6485  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6486  */
6487 
6488 static inline QEMU_ALWAYS_INLINE
6489 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6490                uint32_t desc, const uintptr_t retaddr,
6491                const int esz, const int msz, const int N, uint32_t mtedesc,
6492                sve_ldst1_host_fn *host_fn,
6493                sve_ldst1_tlb_fn *tlb_fn)
6494 {
6495     const unsigned rd = simd_data(desc);
6496     const intptr_t reg_max = simd_oprsz(desc);
6497     intptr_t reg_off, reg_last, mem_off;
6498     SVEContLdSt info;
6499     void *host;
6500     int i, flags;
6501 
6502     /* Find the active elements.  */
6503     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6504         /* The entire predicate was false; no store occurs.  */
6505         return;
6506     }
6507 
6508     /* Probe the page(s).  Exit with exception for any invalid page. */
6509     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6510 
6511     /* Handle watchpoints for all active elements. */
6512     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6513                               BP_MEM_WRITE, retaddr);
6514 
6515     /*
6516      * Handle mte checks for all active elements.
6517      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6518      */
6519     if (mtedesc) {
6520         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6521                                 mtedesc, retaddr);
6522     }
6523 
6524     flags = info.page[0].flags | info.page[1].flags;
6525     if (unlikely(flags != 0)) {
6526         /*
6527          * At least one page includes MMIO.
6528          * Any bus operation can fail with cpu_transaction_failed,
6529          * which for ARM will raise SyncExternal.  We cannot avoid
6530          * this fault and will leave with the store incomplete.
6531          */
6532         mem_off = info.mem_off_first[0];
6533         reg_off = info.reg_off_first[0];
6534         reg_last = info.reg_off_last[1];
6535         if (reg_last < 0) {
6536             reg_last = info.reg_off_split;
6537             if (reg_last < 0) {
6538                 reg_last = info.reg_off_last[0];
6539             }
6540         }
6541 
6542         do {
6543             uint64_t pg = vg[reg_off >> 6];
6544             do {
6545                 if ((pg >> (reg_off & 63)) & 1) {
6546                     for (i = 0; i < N; ++i) {
6547                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6548                                addr + mem_off + (i << msz), retaddr);
6549                     }
6550                 }
6551                 reg_off += 1 << esz;
6552                 mem_off += N << msz;
6553             } while (reg_off & 63);
6554         } while (reg_off <= reg_last);
6555         return;
6556     }
6557 
6558     mem_off = info.mem_off_first[0];
6559     reg_off = info.reg_off_first[0];
6560     reg_last = info.reg_off_last[0];
6561     host = info.page[0].host;
6562 
6563     set_helper_retaddr(retaddr);
6564 
6565     while (reg_off <= reg_last) {
6566         uint64_t pg = vg[reg_off >> 6];
6567         do {
6568             if ((pg >> (reg_off & 63)) & 1) {
6569                 for (i = 0; i < N; ++i) {
6570                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6571                             host + mem_off + (i << msz));
6572                 }
6573             }
6574             reg_off += 1 << esz;
6575             mem_off += N << msz;
6576         } while (reg_off <= reg_last && (reg_off & 63));
6577     }
6578 
6579     clear_helper_retaddr();
6580 
6581     /*
6582      * Use the slow path to manage the cross-page misalignment.
6583      * But we know this is RAM and cannot trap.
6584      */
6585     mem_off = info.mem_off_split;
6586     if (unlikely(mem_off >= 0)) {
6587         reg_off = info.reg_off_split;
6588         for (i = 0; i < N; ++i) {
6589             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6590                    addr + mem_off + (i << msz), retaddr);
6591         }
6592     }
6593 
6594     mem_off = info.mem_off_first[1];
6595     if (unlikely(mem_off >= 0)) {
6596         reg_off = info.reg_off_first[1];
6597         reg_last = info.reg_off_last[1];
6598         host = info.page[1].host;
6599 
6600         set_helper_retaddr(retaddr);
6601 
6602         do {
6603             uint64_t pg = vg[reg_off >> 6];
6604             do {
6605                 if ((pg >> (reg_off & 63)) & 1) {
6606                     for (i = 0; i < N; ++i) {
6607                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6608                                 host + mem_off + (i << msz));
6609                     }
6610                 }
6611                 reg_off += 1 << esz;
6612                 mem_off += N << msz;
6613             } while (reg_off & 63);
6614         } while (reg_off <= reg_last);
6615 
6616         clear_helper_retaddr();
6617     }
6618 }
6619 
6620 static inline QEMU_ALWAYS_INLINE
6621 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6622                    uint32_t desc, const uintptr_t ra,
6623                    const int esz, const int msz, const int N,
6624                    sve_ldst1_host_fn *host_fn,
6625                    sve_ldst1_tlb_fn *tlb_fn)
6626 {
6627     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6628     int bit55 = extract64(addr, 55, 1);
6629 
6630     /* Remove mtedesc from the normal sve descriptor. */
6631     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6632 
6633     /* Perform gross MTE suppression early. */
6634     if (!tbi_check(mtedesc, bit55) ||
6635         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6636         mtedesc = 0;
6637     }
6638 
6639     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6640 }
6641 
6642 #define DO_STN_1(N, NAME, ESZ)                                          \
6643 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
6644                                  target_ulong addr, uint32_t desc)      \
6645 {                                                                       \
6646     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
6647               sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
6648 }                                                                       \
6649 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
6650                                      target_ulong addr, uint32_t desc)  \
6651 {                                                                       \
6652     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
6653                   sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
6654 }
6655 
6656 #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
6657 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
6658                                     target_ulong addr, uint32_t desc)   \
6659 {                                                                       \
6660     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6661               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
6662 }                                                                       \
6663 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
6664                                     target_ulong addr, uint32_t desc)   \
6665 {                                                                       \
6666     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6667               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
6668 }                                                                       \
6669 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
6670                                         target_ulong addr, uint32_t desc) \
6671 {                                                                       \
6672     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6673                   sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
6674 }                                                                       \
6675 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
6676                                         target_ulong addr, uint32_t desc) \
6677 {                                                                       \
6678     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6679                   sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
6680 }
6681 
6682 DO_STN_1(1, bb, MO_8)
6683 DO_STN_1(1, bh, MO_16)
6684 DO_STN_1(1, bs, MO_32)
6685 DO_STN_1(1, bd, MO_64)
6686 DO_STN_1(2, bb, MO_8)
6687 DO_STN_1(3, bb, MO_8)
6688 DO_STN_1(4, bb, MO_8)
6689 
6690 DO_STN_2(1, hh, MO_16, MO_16)
6691 DO_STN_2(1, hs, MO_32, MO_16)
6692 DO_STN_2(1, hd, MO_64, MO_16)
6693 DO_STN_2(2, hh, MO_16, MO_16)
6694 DO_STN_2(3, hh, MO_16, MO_16)
6695 DO_STN_2(4, hh, MO_16, MO_16)
6696 
6697 DO_STN_2(1, ss, MO_32, MO_32)
6698 DO_STN_2(1, sd, MO_64, MO_32)
6699 DO_STN_2(2, ss, MO_32, MO_32)
6700 DO_STN_2(3, ss, MO_32, MO_32)
6701 DO_STN_2(4, ss, MO_32, MO_32)
6702 
6703 DO_STN_2(1, dd, MO_64, MO_64)
6704 DO_STN_2(2, dd, MO_64, MO_64)
6705 DO_STN_2(3, dd, MO_64, MO_64)
6706 DO_STN_2(4, dd, MO_64, MO_64)
6707 
6708 #undef DO_STN_1
6709 #undef DO_STN_2
6710 
6711 /*
6712  * Loads with a vector index.
6713  */
6714 
6715 /*
6716  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6717  */
6718 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6719 
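/*
 * Naming: "zsu"/"zss" extract a 32-bit unsigned/signed offset and "zd" a
 * full 64-bit offset; the trailing _s/_d gives the element size of the
 * index vector (32-bit or 64-bit elements respectively).
 */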
6720 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6721 {
6722     return *(uint32_t *)(reg + H1_4(reg_ofs));
6723 }
6724 
6725 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6726 {
6727     return *(int32_t *)(reg + H1_4(reg_ofs));
6728 }
6729 
6730 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6731 {
6732     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6733 }
6734 
6735 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6736 {
6737     return (int32_t)*(uint64_t *)(reg + reg_ofs);
6738 }
6739 
6740 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6741 {
6742     return *(uint64_t *)(reg + reg_ofs);
6743 }
6744 
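     /*
      * Common helper for all gather loads.  Each active element is loaded
      * from base + (offset << scale) into a scratch register; the result
      * is copied to vd only after every element has been probed, so that
      * a fault leaves the destination unmodified.
      */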
6745 static inline QEMU_ALWAYS_INLINE
6746 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6747                target_ulong base, uint32_t desc, uintptr_t retaddr,
6748                uint32_t mtedesc, int esize, int msize,
6749                zreg_off_fn *off_fn,
6750                sve_ldst1_host_fn *host_fn,
6751                sve_ldst1_tlb_fn *tlb_fn)
6752 {
6753     const int mmu_idx = arm_env_mmu_index(env);
6754     const intptr_t reg_max = simd_oprsz(desc);
6755     const int scale = simd_data(desc);
6756     ARMVectorReg scratch;
6757     intptr_t reg_off;
6758     SVEHostPage info, info2;
6759 
6760     memset(&scratch, 0, reg_max);
6761     reg_off = 0;
6762     do {
6763         uint64_t pg = vg[reg_off >> 6];
6764         do {
6765             if (likely(pg & 1)) {
6766                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6767                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6768 
6769                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6770                                mmu_idx, retaddr);
6771 
6772                 if (likely(in_page >= msize)) {
6773                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
6774                         cpu_check_watchpoint(env_cpu(env), addr, msize,
6775                                              info.attrs, BP_MEM_READ, retaddr);
6776                     }
6777                     if (mtedesc && info.tagged) {
6778                         mte_check(env, mtedesc, addr, retaddr);
6779                     }
6780                     if (unlikely(info.flags & TLB_MMIO)) {
6781                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
6782                     } else {
6783                         set_helper_retaddr(retaddr);
6784                         host_fn(&scratch, reg_off, info.host);
6785                         clear_helper_retaddr();
6786                     }
6787                 } else {
6788                     /* Element crosses the page boundary. */
6789                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6790                                    MMU_DATA_LOAD, mmu_idx, retaddr);
6791                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6792                         cpu_check_watchpoint(env_cpu(env), addr,
6793                                              msize, info.attrs,
6794                                              BP_MEM_READ, retaddr);
6795                     }
6796                     if (mtedesc && info.tagged) {
6797                         mte_check(env, mtedesc, addr, retaddr);
6798                     }
6799                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
6800                 }
6801             }
6802             reg_off += esize;
6803             pg >>= esize;
6804         } while (reg_off & 63);
6805     } while (reg_off < reg_max);
6806 
6807     /* Wait until all exceptions have been raised to write back.  */
6808     memcpy(vd, &scratch, reg_max);
6809 }
6810 
6811 static inline QEMU_ALWAYS_INLINE
6812 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6813                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6814                    int esize, int msize, zreg_off_fn *off_fn,
6815                    sve_ldst1_host_fn *host_fn,
6816                    sve_ldst1_tlb_fn *tlb_fn)
6817 {
6818     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6819     /* Remove mtedesc from the normal sve descriptor. */
6820     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6821 
6822     /*
6823      * ??? TODO: With 32-bit offset extractions, base + ofs cannot move
6824      * base across the address space hole, and so cannot change the
6825      * pointer tag or the bit55 selector.  We could therefore examine
6826      * TBI + TCMA here, as we do for sve_ldN_r_mte().
6827      */
6828     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6829               esize, msize, off_fn, host_fn, tlb_fn);
6830 }
6831 
6832 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6833 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6834                                  void *vm, target_ulong base, uint32_t desc) \
6835 {                                                                            \
6836     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
6837               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6838 }                                                                            \
6839 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6840      void *vm, target_ulong base, uint32_t desc)                             \
6841 {                                                                            \
6842     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
6843                   off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6844 }
6845 
6846 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6847 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6848                                  void *vm, target_ulong base, uint32_t desc) \
6849 {                                                                            \
6850     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
6851               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6852 }                                                                            \
6853 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6854     void *vm, target_ulong base, uint32_t desc)                              \
6855 {                                                                            \
6856     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
6857                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6858 }
6859 
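     /*
      * In the MEM field below, the first letter is the memory element
      * size and the second the register element size, followed by 'u' or
      * 's' for zero or sign extension (omitted when the sizes match) and
      * _le/_be for memory endianness.  OFS selects the offset form:
      * zsu/zss are 32-bit unsigned/signed offsets, zd are 64-bit offsets.
      */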
6860 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6861 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6862 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6863 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6864 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6865 
6866 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6867 DO_LD1_ZPZ_S(bss, zss, MO_8)
6868 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6869 DO_LD1_ZPZ_D(bds, zss, MO_8)
6870 DO_LD1_ZPZ_D(bds, zd, MO_8)
6871 
6872 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6873 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6874 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6875 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6876 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6877 
6878 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6879 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6880 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6881 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6882 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6883 
6884 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6885 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6886 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6887 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6888 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6889 
6890 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6891 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6892 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6893 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6894 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6895 
6896 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6897 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6898 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6899 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6900 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6901 
6902 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6903 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6904 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6905 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6906 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6907 
6908 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6909 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6910 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6911 
6912 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6913 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6914 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6915 
6916 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6917 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6918 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6919 
6920 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6921 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6922 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6923 
6924 #undef DO_LD1_ZPZ_S
6925 #undef DO_LD1_ZPZ_D
6926 
6927 /* First fault loads with a vector index.  */
6928 
6929 /*
6930  * Common helpers for all gather first-faulting loads.
6931  */
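     /*
      * Only the first active element is allowed to take an ordinary
      * fault; if a later element fails its probe, record_fault() marks
      * that element and all following ones as not loaded in the FFR and
      * the helper returns with those destination elements zeroed.
      */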
6932 
6933 static inline QEMU_ALWAYS_INLINE
6934 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6935                  target_ulong base, uint32_t desc, uintptr_t retaddr,
6936                  uint32_t mtedesc, const int esz, const int msz,
6937                  zreg_off_fn *off_fn,
6938                  sve_ldst1_host_fn *host_fn,
6939                  sve_ldst1_tlb_fn *tlb_fn)
6940 {
6941     const int mmu_idx = arm_env_mmu_index(env);
6942     const intptr_t reg_max = simd_oprsz(desc);
6943     const int scale = simd_data(desc);
6944     const int esize = 1 << esz;
6945     const int msize = 1 << msz;
6946     intptr_t reg_off;
6947     SVEHostPage info;
6948     target_ulong addr, in_page;
6949     ARMVectorReg scratch;
6950 
6951     /* Skip to the first true predicate.  */
6952     reg_off = find_next_active(vg, 0, reg_max, esz);
6953     if (unlikely(reg_off >= reg_max)) {
6954         /* The entire predicate was false; no load occurs.  */
6955         memset(vd, 0, reg_max);
6956         return;
6957     }
6958 
6959     /* Protect against overlap between vd and vm. */
6960     if (unlikely(vd == vm)) {
6961         vm = memcpy(&scratch, vm, reg_max);
6962     }
6963 
6964     /*
6965      * Probe the first element, allowing faults.
6966      */
6967     addr = base + (off_fn(vm, reg_off) << scale);
6968     if (mtedesc) {
6969         mte_check(env, mtedesc, addr, retaddr);
6970     }
6971     tlb_fn(env, vd, reg_off, addr, retaddr);
6972 
6973     /* After any fault, zero the other elements. */
6974     swap_memzero(vd, reg_off);
6975     reg_off += esize;
6976     swap_memzero(vd + reg_off, reg_max - reg_off);
6977 
6978     /*
6979      * Probe the remaining elements, not allowing faults.
6980      */
6981     while (reg_off < reg_max) {
6982         uint64_t pg = vg[reg_off >> 6];
6983         do {
6984             if (likely((pg >> (reg_off & 63)) & 1)) {
6985                 addr = base + (off_fn(vm, reg_off) << scale);
6986                 in_page = -(addr | TARGET_PAGE_MASK);
6987 
6988                 if (unlikely(in_page < msize)) {
6989                     /* Stop if the element crosses a page boundary. */
6990                     goto fault;
6991                 }
6992 
6993                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6994                                mmu_idx, retaddr);
6995                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6996                     goto fault;
6997                 }
6998                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6999                     (cpu_watchpoint_address_matches
7000                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
7001                     goto fault;
7002                 }
7003                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
7004                     goto fault;
7005                 }
7006 
7007                 set_helper_retaddr(retaddr);
7008                 host_fn(vd, reg_off, info.host);
7009                 clear_helper_retaddr();
7010             }
7011             reg_off += esize;
7012         } while (reg_off & 63);
7013     }
7014     return;
7015 
7016  fault:
7017     record_fault(env, reg_off, reg_max);
7018 }
7019 
7020 static inline QEMU_ALWAYS_INLINE
7021 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7022                      target_ulong base, uint32_t desc, uintptr_t retaddr,
7023                      const int esz, const int msz,
7024                      zreg_off_fn *off_fn,
7025                      sve_ldst1_host_fn *host_fn,
7026                      sve_ldst1_tlb_fn *tlb_fn)
7027 {
7028     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7029     /* Remove mtedesc from the normal sve descriptor. */
7030     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7031 
7032     /*
7033      * ??? TODO: With 32-bit offset extractions, base + ofs cannot move
7034      * base across the address space hole, and so cannot change the
7035      * pointer tag or the bit55 selector.  We could therefore examine
7036      * TBI + TCMA here, as we do for sve_ldN_r_mte().
7037      */
7038     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7039                 esz, msz, off_fn, host_fn, tlb_fn);
7040 }
7041 
7042 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
7043 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7044     (CPUARMState *env, void *vd, void *vg,                              \
7045      void *vm, target_ulong base, uint32_t desc)                        \
7046 {                                                                       \
7047     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
7048                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7049 }                                                                       \
7050 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7051     (CPUARMState *env, void *vd, void *vg,                              \
7052      void *vm, target_ulong base, uint32_t desc)                        \
7053 {                                                                       \
7054     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
7055                     off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7056 }
7057 
7058 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
7059 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7060     (CPUARMState *env, void *vd, void *vg,                              \
7061      void *vm, target_ulong base, uint32_t desc)                        \
7062 {                                                                       \
7063     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
7064                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7065 }                                                                       \
7066 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7067     (CPUARMState *env, void *vd, void *vg,                              \
7068      void *vm, target_ulong base, uint32_t desc)                        \
7069 {                                                                       \
7070     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
7071                     off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7072 }
7073 
7074 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7075 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7076 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7077 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7078 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7079 
7080 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7081 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7082 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7083 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7084 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7085 
7086 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7087 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7088 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7089 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7090 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7091 
7092 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7093 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7094 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7095 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7096 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7097 
7098 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7099 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7100 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7101 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7102 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7103 
7104 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7105 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7106 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7107 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7108 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7109 
7110 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
7111 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
7112 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7113 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7114 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7115 
7116 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
7117 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
7118 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7119 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7120 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7121 
7122 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7123 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7124 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7125 
7126 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7127 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7128 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7129 
7130 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7131 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7132 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7133 
7134 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7135 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7136 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7137 
7138 /* Stores with a vector index.  */
7139 
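     /*
      * Common helper for all scatter stores.  The first pass probes every
      * active element, raising watchpoint, MTE and translation faults
      * before any memory is modified, and records the host address of
      * elements that can use the fast path; the second pass performs the
      * stores, falling back to the TLB path for MMIO or page-crossing
      * elements.
      */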
7140 static inline QEMU_ALWAYS_INLINE
7141 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7142                target_ulong base, uint32_t desc, uintptr_t retaddr,
7143                uint32_t mtedesc, int esize, int msize,
7144                zreg_off_fn *off_fn,
7145                sve_ldst1_host_fn *host_fn,
7146                sve_ldst1_tlb_fn *tlb_fn)
7147 {
7148     const int mmu_idx = arm_env_mmu_index(env);
7149     const intptr_t reg_max = simd_oprsz(desc);
7150     const int scale = simd_data(desc);
7151     void *host[ARM_MAX_VQ * 4];
7152     intptr_t reg_off, i;
7153     SVEHostPage info, info2;
7154 
7155     /*
7156      * Probe all of the elements for host addresses and flags.
7157      */
7158     i = reg_off = 0;
7159     do {
7160         uint64_t pg = vg[reg_off >> 6];
7161         do {
7162             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7163             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7164 
7165             host[i] = NULL;
7166             if (likely((pg >> (reg_off & 63)) & 1)) {
7167                 if (likely(in_page >= msize)) {
7168                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7169                                    mmu_idx, retaddr);
7170                     if (!(info.flags & TLB_MMIO)) {
7171                         host[i] = info.host;
7172                     }
7173                 } else {
7174                     /*
7175                      * Element crosses the page boundary.
7176                      * Probe both pages, but do not record the host address,
7177                      * so that we use the slow path.
7178                      */
7179                     sve_probe_page(&info, false, env, addr, 0,
7180                                    MMU_DATA_STORE, mmu_idx, retaddr);
7181                     sve_probe_page(&info2, false, env, addr + in_page, 0,
7182                                    MMU_DATA_STORE, mmu_idx, retaddr);
7183                     info.flags |= info2.flags;
7184                 }
7185 
7186                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7187                     cpu_check_watchpoint(env_cpu(env), addr, msize,
7188                                          info.attrs, BP_MEM_WRITE, retaddr);
7189                 }
7190 
7191                 if (mtedesc && info.tagged) {
7192                     mte_check(env, mtedesc, addr, retaddr);
7193                 }
7194             }
7195             i += 1;
7196             reg_off += esize;
7197         } while (reg_off & 63);
7198     } while (reg_off < reg_max);
7199 
7200     /*
7201      * Now that we have recognized all exceptions except SyncExternal
7202      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7203      *
7204      * Note for the common case of an element in RAM, not crossing a page
7205      * boundary, we have stored the host address in host[].  This doubles
7206      * as a first-level check against the predicate, since only enabled
7207      * elements have non-null host addresses.
7208      */
7209     i = reg_off = 0;
7210     do {
7211         void *h = host[i];
7212         if (likely(h != NULL)) {
7213             set_helper_retaddr(retaddr);
7214             host_fn(vd, reg_off, h);
7215             clear_helper_retaddr();
7216         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7217             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7218             tlb_fn(env, vd, reg_off, addr, retaddr);
7219         }
7220         i += 1;
7221         reg_off += esize;
7222     } while (reg_off < reg_max);
7223 }
7224 
7225 static inline QEMU_ALWAYS_INLINE
7226 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7227                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7228                    int esize, int msize, zreg_off_fn *off_fn,
7229                    sve_ldst1_host_fn *host_fn,
7230                    sve_ldst1_tlb_fn *tlb_fn)
7231 {
7232     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7233     /* Remove mtedesc from the normal sve descriptor. */
7234     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7235 
7236     /*
7237      * ??? TODO: With 32-bit offset extractions, base + ofs cannot move
7238      * base across the address space hole, and so cannot change the
7239      * pointer tag or the bit55 selector.  We could therefore examine
7240      * TBI + TCMA here, as we do for sve_ldN_r_mte().
7241      */
7242     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7243               esize, msize, off_fn, host_fn, tlb_fn);
7244 }
7245 
7246 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7247 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7248                                  void *vm, target_ulong base, uint32_t desc) \
7249 {                                                                       \
7250     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7251               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7252 }                                                                       \
7253 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7254     void *vm, target_ulong base, uint32_t desc)                         \
7255 {                                                                       \
7256     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7257                   off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7258 }
7259 
7260 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7261 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7262                                  void *vm, target_ulong base, uint32_t desc) \
7263 {                                                                       \
7264     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7265               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7266 }                                                                       \
7267 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7268     void *vm, target_ulong base, uint32_t desc)                         \
7269 {                                                                       \
7270     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7271                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7272 }
7273 
7274 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7275 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7276 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7277 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7278 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7279 
7280 DO_ST1_ZPZ_S(bs, zss, MO_8)
7281 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7282 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7283 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7284 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7285 
7286 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7287 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7288 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7289 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7290 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7291 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7292 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7293 
7294 DO_ST1_ZPZ_D(bd, zss, MO_8)
7295 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7296 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7297 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7298 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7299 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7300 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7301 
7302 DO_ST1_ZPZ_D(bd, zd, MO_8)
7303 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7304 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7305 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7306 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7307 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7308 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7309 
7310 #undef DO_ST1_ZPZ_S
7311 #undef DO_ST1_ZPZ_D
7312 
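     /*
      * SVE2 bitwise ternary operations, computed 64 bits at a time:
      * EOR3 is a three-way exclusive-or, BCAX clears the bits of m that
      * are set in k before the exclusive-or, and the BSL variants select
      * bits from n or m under the control mask k, inverting one operand
      * (BSL1N, BSL2N) or the result (NBSL).
      */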
7313 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7314 {
7315     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7316     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7317 
7318     for (i = 0; i < opr_sz; ++i) {
7319         d[i] = n[i] ^ m[i] ^ k[i];
7320     }
7321 }
7322 
7323 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7324 {
7325     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7326     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7327 
7328     for (i = 0; i < opr_sz; ++i) {
7329         d[i] = n[i] ^ (m[i] & ~k[i]);
7330     }
7331 }
7332 
7333 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7334 {
7335     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7336     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7337 
7338     for (i = 0; i < opr_sz; ++i) {
7339         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7340     }
7341 }
7342 
7343 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7344 {
7345     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7346     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7347 
7348     for (i = 0; i < opr_sz; ++i) {
7349         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7350     }
7351 }
7352 
7353 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7354 {
7355     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7356     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7357 
7358     for (i = 0; i < opr_sz; ++i) {
7359         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7360     }
7361 }
7362 
7363 /*
7364  * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7365  * See hasless(v,1) from
7366  *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7367  */
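     /*
      * For example, with esz == MO_8 and n == 0x42, any byte of m0 or m1
      * equal to 0x42 xors to 0x00, and the (x - ones) & ~x step sets the
      * sign bit of exactly the zero bytes, so (cmp0 | cmp1) & signs is
      * nonzero precisely when a match exists.
      */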
7368 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7369 {
7370     int bits = 8 << esz;
7371     uint64_t ones = dup_const(esz, 1);
7372     uint64_t signs = ones << (bits - 1);
7373     uint64_t cmp0, cmp1;
7374 
7375     cmp1 = dup_const(esz, n);
7376     cmp0 = cmp1 ^ m0;
7377     cmp1 = cmp1 ^ m1;
7378     cmp0 = (cmp0 - ones) & ~cmp0;
7379     cmp1 = (cmp1 - ones) & ~cmp1;
7380     return (cmp0 | cmp1) & signs;
7381 }
7382 
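     /*
      * MATCH and NMATCH: for each active element of vn, set the
      * destination predicate bit when that element does (MATCH) or does
      * not (NMATCH) occur anywhere in the corresponding 16-byte segment
      * of vm.
      */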
7383 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7384                                 uint32_t desc, int esz, bool nmatch)
7385 {
7386     uint16_t esz_mask = pred_esz_masks[esz];
7387     intptr_t opr_sz = simd_oprsz(desc);
7388     uint32_t flags = PREDTEST_INIT;
7389     intptr_t i, j, k;
7390 
7391     for (i = 0; i < opr_sz; i += 16) {
7392         uint64_t m0 = *(uint64_t *)(vm + i);
7393         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7394         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7395         uint16_t out = 0;
7396 
7397         for (j = 0; j < 16; j += 8) {
7398             uint64_t n = *(uint64_t *)(vn + i + j);
7399 
7400             for (k = 0; k < 8; k += 1 << esz) {
7401                 if (pg & (1 << (j + k))) {
7402                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
7403                     out |= (o ^ nmatch) << (j + k);
7404                 }
7405             }
7406         }
7407         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7408         flags = iter_predtest_fwd(out, pg, flags);
7409     }
7410     return flags;
7411 }
7412 
7413 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7414 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7415 {                                                                             \
7416     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7417 }
7418 
7419 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7420 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7421 
7422 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7423 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7424 
7425 #undef DO_PPZZ_MATCH
7426 
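     /*
      * HISTCNT: for each active element i, count the active elements m[j]
      * with j <= i that compare equal to n[i]; inactive destination
      * elements are zeroed.
      */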
7427 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7428                             uint32_t desc)
7429 {
7430     ARMVectorReg scratch;
7431     intptr_t i, j;
7432     intptr_t opr_sz = simd_oprsz(desc);
7433     uint32_t *d = vd, *n = vn, *m = vm;
7434     uint8_t *pg = vg;
7435 
7436     if (d == n) {
7437         n = memcpy(&scratch, n, opr_sz);
7438         if (d == m) {
7439             m = n;
7440         }
7441     } else if (d == m) {
7442         m = memcpy(&scratch, m, opr_sz);
7443     }
7444 
7445     for (i = 0; i < opr_sz; i += 4) {
7446         uint64_t count = 0;
7447         uint8_t pred;
7448 
7449         pred = pg[H1(i >> 3)] >> (i & 7);
7450         if (pred & 1) {
7451             uint32_t nn = n[H4(i >> 2)];
7452 
7453             for (j = 0; j <= i; j += 4) {
7454                 pred = pg[H1(j >> 3)] >> (j & 7);
7455                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7456                     ++count;
7457                 }
7458             }
7459         }
7460         d[H4(i >> 2)] = count;
7461     }
7462 }
7463 
7464 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7465                             uint32_t desc)
7466 {
7467     ARMVectorReg scratch;
7468     intptr_t i, j;
7469     intptr_t opr_sz = simd_oprsz(desc);
7470     uint64_t *d = vd, *n = vn, *m = vm;
7471     uint8_t *pg = vg;
7472 
7473     if (d == n) {
7474         n = memcpy(&scratch, n, opr_sz);
7475         if (d == m) {
7476             m = n;
7477         }
7478     } else if (d == m) {
7479         m = memcpy(&scratch, m, opr_sz);
7480     }
7481 
7482     for (i = 0; i < opr_sz / 8; ++i) {
7483         uint64_t count = 0;
7484         if (pg[H1(i)] & 1) {
7485             uint64_t nn = n[i];
7486             for (j = 0; j <= i; ++j) {
7487                 if ((pg[H1(j)] & 1) && nn == m[j]) {
7488                     ++count;
7489                 }
7490             }
7491         }
7492         d[i] = count;
7493     }
7494 }
7495 
7496 /*
7497  * Returns the number of bytes in m0 and m1 that match n.
7498  * Unlike do_match2, we need an exact count rather than just true/false.
7499  * This requires two extra logical operations.
7500  */
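     /*
      * For example, with n == 0x41 and exactly one byte of m0 equal to
      * 0x41, that byte of cmp0 xors to 0x00 and steps 1-5 below turn it
      * into the marker 0x80 while every other byte becomes 0x00, so the
      * final ctpop64() returns 1.
      */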
7501 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7502 {
7503     const uint64_t mask = dup_const(MO_8, 0x7f);
7504     uint64_t cmp0, cmp1;
7505 
7506     cmp1 = dup_const(MO_8, n);
7507     cmp0 = cmp1 ^ m0;
7508     cmp1 = cmp1 ^ m1;
7509 
7510     /*
7511      * 1: clear msb of each byte to avoid carry to next byte (& mask)
7512      * 2: carry in to msb if byte != 0 (+ mask)
7513      * 3: set msb if cmp has msb set (| cmp)
7514      * 4: set ~msb to ignore them (| mask)
7515      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7516      * 5: invert, resulting in 0x80 if and only if byte == 0.
7517      */
7518     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7519     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7520 
7521     /*
7522      * Combine the two compares in a way that the bits do
7523      * not overlap, and so preserves the count of set bits.
7524      * If the host has an efficient instruction for ctpop,
7525      * then ctpop(x) + ctpop(y) has the same number of
7526      * operations as ctpop(x | (y >> 1)).  If the host does
7527      * not have an efficient ctpop, then we only want to
7528      * use it once.
7529      */
7530     return ctpop64(cmp0 | (cmp1 >> 1));
7531 }
7532 
7533 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7534 {
7535     intptr_t i, j;
7536     intptr_t opr_sz = simd_oprsz(desc);
7537 
7538     for (i = 0; i < opr_sz; i += 16) {
7539         uint64_t n0 = *(uint64_t *)(vn + i);
7540         uint64_t m0 = *(uint64_t *)(vm + i);
7541         uint64_t n1 = *(uint64_t *)(vn + i + 8);
7542         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7543         uint64_t out0 = 0;
7544         uint64_t out1 = 0;
7545 
7546         for (j = 0; j < 64; j += 8) {
7547             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7548             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7549             out0 |= cnt0 << j;
7550             out1 |= cnt1 << j;
7551         }
7552 
7553         *(uint64_t *)(vd + i) = out0;
7554         *(uint64_t *)(vd + i + 8) = out1;
7555     }
7556 }
7557 
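     /*
      * XAR: exclusive-or and rotate right by immediate.  For byte and
      * halfword elements the per-element rotate is synthesized from two
      * shifts and a mask; e.g. for bytes,
      *   ror8(t, shr) == ((t >> shr) & mask) | ((t << (8 - shr)) & ~mask)
      * with mask == dup_const(MO_8, 0xff >> shr), applied to all eight
      * bytes of a 64-bit word at once.
      */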
7558 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7559 {
7560     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7561     int shr = simd_data(desc);
7562     int shl = 8 - shr;
7563     uint64_t mask = dup_const(MO_8, 0xff >> shr);
7564     uint64_t *d = vd, *n = vn, *m = vm;
7565 
7566     for (i = 0; i < opr_sz; ++i) {
7567         uint64_t t = n[i] ^ m[i];
7568         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7569     }
7570 }
7571 
7572 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7573 {
7574     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7575     int shr = simd_data(desc);
7576     int shl = 16 - shr;
7577     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7578     uint64_t *d = vd, *n = vn, *m = vm;
7579 
7580     for (i = 0; i < opr_sz; ++i) {
7581         uint64_t t = n[i] ^ m[i];
7582         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7583     }
7584 }
7585 
7586 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7587 {
7588     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7589     int shr = simd_data(desc);
7590     uint32_t *d = vd, *n = vn, *m = vm;
7591 
7592     for (i = 0; i < opr_sz; ++i) {
7593         d[i] = ror32(n[i] ^ m[i], shr);
7594     }
7595 }
7596 
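     /*
      * FMMLA: each 128-bit (single) or 256-bit (double) segment holds a
      * 2x2 matrix {x00, x01, x10, x11} in row-major order; compute
      * D = A + N * M^T, i.e.
      *   d[i][j] = a[i][j] + n[i][0] * m[j][0] + n[i][1] * m[j][1].
      */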
7597 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7598                      float_status *status, uint32_t desc)
7599 {
7600     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7601 
7602     for (s = 0; s < opr_sz; ++s) {
7603         float32 *n = vn + s * sizeof(float32) * 4;
7604         float32 *m = vm + s * sizeof(float32) * 4;
7605         float32 *a = va + s * sizeof(float32) * 4;
7606         float32 *d = vd + s * sizeof(float32) * 4;
7607         float32 n00 = n[H4(0)], n01 = n[H4(1)];
7608         float32 n10 = n[H4(2)], n11 = n[H4(3)];
7609         float32 m00 = m[H4(0)], m01 = m[H4(1)];
7610         float32 m10 = m[H4(2)], m11 = m[H4(3)];
7611         float32 p0, p1;
7612 
7613         /* i = 0, j = 0 */
7614         p0 = float32_mul(n00, m00, status);
7615         p1 = float32_mul(n01, m01, status);
7616         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7617 
7618         /* i = 0, j = 1 */
7619         p0 = float32_mul(n00, m10, status);
7620         p1 = float32_mul(n01, m11, status);
7621         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7622 
7623         /* i = 1, j = 0 */
7624         p0 = float32_mul(n10, m00, status);
7625         p1 = float32_mul(n11, m01, status);
7626         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7627 
7628         /* i = 1, j = 1 */
7629         p0 = float32_mul(n10, m10, status);
7630         p1 = float32_mul(n11, m11, status);
7631         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7632     }
7633 }
7634 
7635 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7636                      float_status *status, uint32_t desc)
7637 {
7638     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7639 
7640     for (s = 0; s < opr_sz; ++s) {
7641         float64 *n = vn + s * sizeof(float64) * 4;
7642         float64 *m = vm + s * sizeof(float64) * 4;
7643         float64 *a = va + s * sizeof(float64) * 4;
7644         float64 *d = vd + s * sizeof(float64) * 4;
7645         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7646         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7647         float64 p0, p1;
7648 
7649         /* i = 0, j = 0 */
7650         p0 = float64_mul(n00, m00, status);
7651         p1 = float64_mul(n01, m01, status);
7652         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7653 
7654         /* i = 0, j = 1 */
7655         p0 = float64_mul(n00, m10, status);
7656         p1 = float64_mul(n01, m11, status);
7657         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7658 
7659         /* i = 1, j = 0 */
7660         p0 = float64_mul(n10, m00, status);
7661         p1 = float64_mul(n11, m01, status);
7662         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7663 
7664         /* i = 1, j = 1 */
7665         p0 = float64_mul(n10, m10, status);
7666         p1 = float64_mul(n11, m11, status);
7667         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7668     }
7669 }
7670 
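     /*
      * SVE2 narrowing FP conversions (FCVTNT, BFCVTNT) write only the top
      * (odd-numbered) narrow element of each wide container, leaving the
      * bottom element of vd unchanged; the widening forms (FCVTLT) read
      * the top narrow element of each container.
      */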
7671 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7672 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7673                   float_status *status, uint32_t desc)                        \
7674 {                                                                             \
7675     intptr_t i = simd_oprsz(desc);                                            \
7676     uint64_t *g = vg;                                                         \
7677     do {                                                                      \
7678         uint64_t pg = g[(i - 1) >> 6];                                        \
7679         do {                                                                  \
7680             i -= sizeof(TYPEW);                                               \
7681             if (likely((pg >> (i & 63)) & 1)) {                               \
7682                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7683                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7684             }                                                                 \
7685         } while (i & 63);                                                     \
7686     } while (i != 0);                                                         \
7687 }
7688 
7689 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7690 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7691 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7692 
7693 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7694 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7695                   float_status *status, uint32_t desc)                        \
7696 {                                                                             \
7697     intptr_t i = simd_oprsz(desc);                                            \
7698     uint64_t *g = vg;                                                         \
7699     do {                                                                      \
7700         uint64_t pg = g[(i - 1) >> 6];                                        \
7701         do {                                                                  \
7702             i -= sizeof(TYPEW);                                               \
7703             if (likely((pg >> (i & 63)) & 1)) {                               \
7704                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7705                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7706             }                                                                 \
7707         } while (i & 63);                                                     \
7708     } while (i != 0);                                                         \
7709 }
7710 
7711 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7712 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7713 
7714 #undef DO_FCVTLT
7715 #undef DO_FCVTNT
7716