xref: /openbmc/qemu/target/arm/tcg/sve_helper.c (revision 29318db1)
1 /*
2  * ARM SVE Operations
3  *
4  * Copyright (c) 2018 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/page-protection.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
29 #include "vec_internal.h"
30 #include "sve_ldst_internal.h"
31 #include "hw/core/tcg-cpu-ops.h"
32 
33 
34 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
35  *
36  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
37  * and bit 0 set if C is set.  Compare the definitions of these variables
38  * within CPUARMState.
39  */
40 
/*
 * Starting value handed to the iter_predtest_* accumulators before any
 * predicate word has been examined.
 * For no G bits set, NZCV = C.
 */
#define PREDTEST_INIT  1
43 
/*
 * This is an iterative function, called for each Pd and Pg word
 * moving forward.
 *
 * @d:     one 64-bit word of the predicate under test
 * @g:     the matching word of the governing predicate
 * @flags: value accumulated so far (PREDTEST_INIT for the first word)
 *
 * Returns the updated accumulator: bit 31 holds N, bit 1 is set when Z
 * is clear, bit 0 holds C, and bit 2 is an internal "a G bit has been
 * seen" marker.  A word with no governing bits leaves FLAGS unchanged.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            /*
             * g & -g isolates the lowest set bit of G.  The comparison
             * result is widened to uint32_t before shifting so that the
             * value is never shifted into the sign bit of an int.
             */
            flags |= (uint32_t)((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= (uint32_t)((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
65 
/*
 * Backward counterpart of the predicate-test iteration: invoked once
 * for each Pd/Pg word pair, starting from the last word.
 *
 * The accumulator layout matches the forward variant: bit 31 is N,
 * bit 1 is !Z, bit 0 is C and bit 2 marks that a G bit has been seen.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (!likely(g)) {
        /* No governing bits in this word: nothing to accumulate.  */
        return flags;
    }

    if ((flags & 4) == 0) {
        /*
         * First word containing a G bit, i.e. the last active element
         * in architectural order, which decides C.  Adding 3 sets bit 2
         * and cancels the C bit that PREDTEST_INIT started with.
         */
        flags += 3;
        flags |= (d & pow2floor(g)) == 0;
    }

    /* Z accumulates over every active element.  */
    flags |= ((d & g) != 0) << 1;

    /* N belongs to the first active element; each call replaces it.  */
    return deposit32(flags, 31, 1, (d & (g & -g)) != 0);
}
87 
88 /* The same for a single word predicate.  */
89 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
90 {
91     return iter_predtest_fwd(d, g, PREDTEST_INIT);
92 }
93 
94 /* The same for a multi-word predicate.  */
95 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
96 {
97     uint32_t flags = PREDTEST_INIT;
98     uint64_t *d = vd, *g = vg;
99     uintptr_t i = 0;
100 
101     do {
102         flags = iter_predtest_fwd(d[i], g[i], flags);
103     } while (++i < words);
104 
105     return flags;
106 }
107 
/*
 * Expand the predicate bits governing two 32-bit elements into a
 * 64-bit mask: bit 0 of BYTE selects the low 32 bits and bit 4 selects
 * the high 32 bits.  All other bits of BYTE are ignored.
 */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    uint64_t mask = 0;

    if (byte & 0x01) {
        mask |= 0x00000000ffffffffull;
    }
    if (byte & 0x10) {
        mask |= 0xffffffff00000000ull;
    }
    return mask;
}
118 
119 #define LOGICAL_PPPP(NAME, FUNC) \
120 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
121 {                                                                         \
122     uintptr_t opr_sz = simd_oprsz(desc);                                  \
123     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
124     uintptr_t i;                                                          \
125     for (i = 0; i < opr_sz / 8; ++i) {                                    \
126         d[i] = FUNC(n[i], m[i], g[i]);                                    \
127     }                                                                     \
128 }
129 
130 #define DO_AND(N, M, G)  (((N) & (M)) & (G))
131 #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
132 #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
133 #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
134 #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
135 #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
136 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
137 #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
138 
139 LOGICAL_PPPP(sve_and_pppp, DO_AND)
140 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
141 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
142 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
143 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
144 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
145 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
146 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
147 
148 #undef DO_AND
149 #undef DO_BIC
150 #undef DO_EOR
151 #undef DO_ORR
152 #undef DO_ORN
153 #undef DO_NOR
154 #undef DO_NAND
155 #undef DO_SEL
156 #undef LOGICAL_PPPP
157 
/*
 * Fully general three-operand expander, controlled by a predicate:
 * D[e] = OP(N[e], M[e]) for every element whose predicate bit is set;
 * other elements of D are left untouched.
 *
 * The vector is processed in 16-byte chunks, each governed by 16
 * predicate bits (one bit per vector byte).  H adjusts the byte offset
 * for the host-endian storage of the register file.
 *
 * ??? The compiler is not expected to vectorize this by itself.  With
 * some tables, bit masks could be converted to byte masks, and with
 * extra care wrt byte/word ordering gcc generic vectors could be used
 * to handle 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    intptr_t off = 0;                                                   \
    while (off < total) {                                               \
        uint16_t bits = *(uint16_t *)(vg + H1_2(off >> 3));             \
        do {                                                            \
            if (bits & 1) {                                             \
                TYPE *dst = (TYPE *)(vd + H(off));                      \
                TYPE lhs = *(TYPE *)(vn + H(off));                      \
                TYPE rhs = *(TYPE *)(vm + H(off));                      \
                *dst = OP(lhs, rhs);                                    \
            }                                                           \
            bits >>= sizeof(TYPE);                                      \
            off += sizeof(TYPE);                                        \
        } while (off & 15);                                             \
    }                                                                   \
}
182 
/*
 * Predicated three-operand expander specialized for 64-bit elements.
 * Each element is governed by bit 0 of its own predicate byte, so the
 * predicate is indexed per element rather than shifted.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    const intptr_t count = simd_oprsz(desc) / 8;                \
    uint8_t *pred = vg;                                         \
    TYPE *dst = vd;                                             \
    TYPE *src_n = vn;                                           \
    TYPE *src_m = vm;                                           \
    intptr_t e;                                                 \
    for (e = 0; e < count; e++) {                               \
        TYPE lhs, rhs;                                          \
        if (!(pred[H1(e)] & 1)) {                               \
            continue;                                           \
        }                                                       \
        lhs = src_n[e];                                         \
        rhs = src_m[e];                                         \
        dst[e] = OP(lhs, rhs);                                  \
    }                                                           \
}
197 
/*
 * Two-operand element operations used as the OP argument of the
 * expander macros.  Every parameter is parenthesized so that the macros
 * expand correctly for compound argument expressions as well as for
 * plain variables.
 */
#define DO_AND(N, M)  ((N) & (M))
#define DO_EOR(N, M)  ((N) ^ (M))
#define DO_ORR(N, M)  ((N) | (M))
#define DO_BIC(N, M)  ((N) & ~(M))
#define DO_ADD(N, M)  ((N) + (M))
#define DO_SUB(N, M)  ((N) - (M))
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
/* Absolute difference, computed without forming a negative result.  */
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  ((N) * (M))
208 
209 
/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 *
 * The negation for the -1 case is carried out in uint64_t and then
 * converted back to the type of N, so that negating the minimum
 * integer wraps instead of overflowing a signed type.
 */
#define DO_SDIV(N, M) \
    (unlikely((M) == 0) ? 0 \
     : unlikely((M) == -1) ? (__typeof__(N))(0 - (uint64_t)(N)) \
     : (N) / (M))
#define DO_UDIV(N, M) (unlikely((M) == 0) ? 0 : (N) / (M))
219 
/*
 * Predicated element-wise helpers, one per element size.  The byte,
 * halfword and word forms pass the host-endian index macro matching
 * their element size; the doubleword forms use DO_ZPZZ_D.
 */

/* Bitwise AND.  */
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

/* Bitwise inclusive OR.  */
DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

/* Bitwise exclusive OR.  */
DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

/* Bitwise clear: N & ~M.  */
DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

/* Addition.  */
DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

/* Subtraction.  */
DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

/* Signed maximum: the signed element type makes the compare signed.  */
DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

/* Unsigned maximum.  */
DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

/* Signed minimum.  */
DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)

/* Unsigned minimum.  */
DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

/* Signed absolute difference.  */
DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)

/* Unsigned absolute difference.  */
DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
279 
/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */

/* High 8 bits of the 16-bit product of two 8-bit values.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    int32_t product = n * m;

    return (uint8_t)(product >> 8);
}
286 
/*
 * High 16 bits of the 32-bit product of two 16-bit values, which the
 * caller passes either sign- or zero-extended.
 *
 * The multiply is done in uint32_t: the largest unsigned product,
 * 0xffff * 0xffff, does not fit in int32_t, whereas unsigned arithmetic
 * wraps modulo 2^32 and yields the same bits 16..31 for both the signed
 * and the unsigned interpretation.
 */
static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    uint32_t product = (uint32_t)n * (uint32_t)m;

    return product >> 16;
}
291 
/*
 * High 32 bits of the 64-bit product of two 32-bit values, which the
 * caller passes either sign- or zero-extended.
 *
 * The multiply is done in uint64_t: the largest unsigned product,
 * 0xffffffff * 0xffffffff, does not fit in int64_t, whereas unsigned
 * arithmetic wraps modulo 2^64 and yields the same bits 32..63 for both
 * the signed and the unsigned interpretation.
 */
static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    uint64_t product = (uint64_t)n * (uint64_t)m;

    return product >> 32;
}
296 
/*
 * Signed 64-bit multiply returning only the "high" output of muls64();
 * the "low" output is computed and discarded.
 */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t high;
    uint64_t low;

    muls64(&low, &high, n, m);

    return high;
}
303 
/*
 * Unsigned 64-bit multiply returning only the "high" output of
 * mulu64(); the "low" output is computed and discarded.
 */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t high;
    uint64_t low;

    mulu64(&low, &high, n, m);

    return high;
}
310 
/* Multiply, keeping the low half of the product.  */
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

/*
 * Signed multiply, keeping the high half.  The signed element type
 * makes the operands sign-extend when passed to the do_mulh_* helpers.
 */
DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

/* Unsigned multiply, keeping the high half (operands zero-extend).  */
DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

/* Signed division; only word and doubleword forms are instantiated.  */
DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

/* Unsigned division; only word and doubleword forms are instantiated.  */
DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
331 
332 /* Note that all bits of the shift are significant
333    and not modulo the element size.  */
334 #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
335 #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
336 #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
337 
338 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
339 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
340 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
341 
342 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
343 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
344 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
345 
346 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
347 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
348 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
349 
350 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
351 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
352 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
353 
/*
 * Signed add-and-accumulate-long-pairwise for a 16-bit element: the two
 * signed bytes packed in N are added to the accumulator M, and the sum
 * is truncated to 16 bits.
 */
static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t lo = (int8_t)n;
    int8_t hi = (int8_t)(n >> 8);
    int sum = m;

    sum += lo;
    sum += hi;
    return (uint16_t)sum;
}
359 
/*
 * Signed add-and-accumulate-long-pairwise for a 32-bit element: the two
 * signed halfwords packed in N are added to the accumulator M.
 *
 * The accumulation is modulo 2^32, so it is performed in uint32_t; a
 * signed addition would overflow when M is close to INT32_MAX or
 * INT32_MIN.
 */
static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;

    return (uint32_t)m + n1 + n2;
}
365 
/*
 * Signed add-and-accumulate-long-pairwise for a 64-bit element: the two
 * signed words packed in N are added to the accumulator M.
 *
 * The accumulation is modulo 2^64, so it is performed in uint64_t; a
 * signed addition would overflow when M is close to INT64_MAX or
 * INT64_MIN.
 */
static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;

    return (uint64_t)m + n1 + n2;
}
371 
/* Predicated signed pairwise add-accumulate; there is no byte form.  */
DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
375 
/*
 * Unsigned add-and-accumulate-long-pairwise for a 16-bit element: the
 * two unsigned bytes packed in N are added to the accumulator M, and
 * the sum is truncated to 16 bits.
 */
static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    unsigned lo = n & 0xff;
    unsigned hi = n >> 8;

    return (uint16_t)(m + lo + hi);
}
381 
/*
 * Unsigned add-and-accumulate-long-pairwise for a 32-bit element: the
 * two unsigned halfwords packed in N are added to the accumulator M,
 * wrapping modulo 2^32.
 */
static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint32_t lo = n & 0xffff;
    uint32_t hi = n >> 16;

    return m + lo + hi;
}
387 
/*
 * Unsigned add-and-accumulate-long-pairwise for a 64-bit element: the
 * two unsigned words packed in N are added to the accumulator M,
 * wrapping modulo 2^64.
 */
static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint64_t lo = n & 0xffffffffull;
    uint64_t hi = n >> 32;

    return m + lo + hi;
}
393 
/* Predicated unsigned pairwise add-accumulate; there is no byte form.  */
DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
397 
/*
 * Signed shifts built on the do_sqrshl_* helpers (defined elsewhere).
 * These pass NULL as the final argument, where the saturating variants
 * in this file pass a pointer to a dummy saturation field.
 * NOTE(review): the boolean argument presumably selects rounding --
 * confirm against the helper definitions.
 */
#define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

/*
 * Unsigned counterparts.  The element type is unsigned, so the byte and
 * halfword shift counts are cast to the signed type of the same width
 * before being passed on, letting the helper see a signed count.
 */
#define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
417 
418 /*
419  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
420  * We pass in a pointer to a dummy saturation field to trigger
421  * the saturating arithmetic but discard the information about
422  * whether it has occurred.
423  */
424 #define do_sqshl_b(n, m) \
425    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
426 #define do_sqshl_h(n, m) \
427    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
428 #define do_sqshl_s(n, m) \
429    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
430 #define do_sqshl_d(n, m) \
431    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
432 
433 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
434 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
435 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
436 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
437 
438 #define do_uqshl_b(n, m) \
439    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
440 #define do_uqshl_h(n, m) \
441    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
442 #define do_uqshl_s(n, m) \
443    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
444 #define do_uqshl_d(n, m) \
445    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
446 
447 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
448 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
449 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
450 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
451 
452 #define do_sqrshl_b(n, m) \
453    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
454 #define do_sqrshl_h(n, m) \
455    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
456 #define do_sqrshl_s(n, m) \
457    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
458 #define do_sqrshl_d(n, m) \
459    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
460 
461 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
462 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
463 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
464 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
465 
466 #undef do_sqrshl_d
467 
468 #define do_uqrshl_b(n, m) \
469    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
470 #define do_uqrshl_h(n, m) \
471    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
472 #define do_uqrshl_s(n, m) \
473    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
474 #define do_uqrshl_d(n, m) \
475    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
476 
477 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
478 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
479 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
480 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
481 
482 #undef do_uqrshl_d
483 
/*
 * Halving add.  The byte/halfword/word form widens to int64_t so the
 * sum cannot overflow before the shift.  The doubleword form halves
 * each operand first and adds back the carry produced when both low
 * bits are set.
 */
#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

/*
 * Rounding halving add: as above with 1 added before the shift.  The
 * doubleword form adds 1 when either low bit is set.
 */
#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

/*
 * Halving subtract.  The doubleword form subtracts the halved operands
 * and then the borrow that arises when the low bit of n is clear and
 * the low bit of m is set.
 */
#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
522 
/*
 * Clamp VAL to the inclusive range [MIN, MAX].  The upper bound is
 * checked first.  The result is returned as int32_t, so an unsigned
 * 32-bit bound comes back as the int32_t with the same bit pattern.
 */
static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    if (val >= max) {
        return max;
    }
    if (val <= min) {
        return min;
    }
    return val;
}
527 
/*
 * Signed saturating add for byte/halfword/word elements: the sum is
 * formed in int64_t and clamped to the range of the element type.
 */
#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
531 
/*
 * Signed saturating 64-bit add.
 *
 * The wrapped sum is computed in uint64_t, so the addition itself never
 * overflows a signed type.  Overflow happened iff both inputs share a
 * sign and the wrapped sum has the opposite sign; the result then
 * saturates toward the sign of the inputs.
 */
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = (int64_t)((uint64_t)n + (uint64_t)m);

    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
541 
/* Predicated signed saturating add.  */
DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

/*
 * Unsigned saturating add for byte/halfword/word elements: the sum is
 * formed in int64_t and clamped to [0, maximum of the element type].
 */
#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
550 
/* Unsigned saturating 64-bit add: a wrapped sum clamps to UINT64_MAX.  */
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t sum = n + m;

    if (sum < n) {
        /* The addition wrapped around.  */
        return UINT64_MAX;
    }
    return sum;
}
556 
/* Predicated unsigned saturating add.  */
DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

/*
 * Signed saturating subtract for byte/halfword/word elements: the
 * difference is formed in int64_t and clamped to the element range.
 */
#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
565 
/*
 * Signed saturating 64-bit subtract.
 *
 * The wrapped difference is computed in uint64_t, so the subtraction
 * itself never overflows a signed type.  Overflow happened iff the
 * inputs differ in sign and the wrapped difference has a different
 * sign from N; the result then saturates toward the sign of N.
 */
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = (int64_t)((uint64_t)n - (uint64_t)m);

    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
575 
/* Predicated signed saturating subtract.  */
DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

/*
 * Unsigned saturating subtract for byte/halfword/word elements: the
 * difference is formed in int64_t and clamped to [0, element maximum].
 */
#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
584 
/* Unsigned saturating 64-bit subtract: the result never drops below 0.  */
static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    if (n <= m) {
        return 0;
    }
    return n - m;
}
589 
/* Predicated unsigned saturating subtract.  */
DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

/*
 * Signed saturating add of an unsigned value: n is reinterpreted as
 * signed by the cast, m keeps its unsigned element type, and the sum is
 * clamped to the signed range of the element.
 */
#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
601 
/*
 * Add the unsigned value M to the signed value N, saturating the
 * result to INT64_MAX.  The sum cannot go below INT64_MIN because M is
 * never negative, so no lower clamp is needed.
 */
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /*
             * Result is either very large positive or negative.
             * abs(n) is formed in uint64_t so that n == INT64_MIN is
             * never negated as a signed value.
             */
            if (m > -(uint64_t)n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}
624 
/*
 * Predicated signed-plus-unsigned saturating add.  The element type is
 * unsigned; the narrow macros and do_suqadd_d reinterpret n as signed.
 */
DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

/*
 * Unsigned saturating add of a signed value: m is reinterpreted as
 * signed by the cast and the sum is clamped to [0, element maximum].
 */
#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
636 
/*
 * Add the signed value M to the unsigned value N, saturating the
 * result to the range [0, UINT64_MAX].
 */
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        /*
         * abs(m) is formed in uint64_t so that m == INT64_MIN is never
         * negated as a signed value.  Subtracting more than N clamps
         * the result to zero.
         */
        return n < -(uint64_t)m ? 0 : r;
    }
    /* Non-negative M: a wrapped sum means the true result overflowed.  */
    return r < n ? UINT64_MAX : r;
}
646 
/* Predicated unsigned-plus-signed saturating add.  */
DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

/* Last users of the element-wise expanders; retire the macros.  */
#undef DO_ZPZZ
#undef DO_ZPZZ_D
654 
/*
 * Three operand expander, operating on element pairs.
 * For an even slot I, the result is formed from VN elements {I, I+1}.
 * For an odd slot I, the result is formed from VM elements {I-1, I}.
 * All four inputs of a pair are loaded before either output is stored,
 * so the destination may overlap the sources.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    const intptr_t total = simd_oprsz(desc);                    \
    intptr_t off = 0;                                           \
    while (off < total) {                                       \
        uint16_t bits = *(uint16_t *)(vg + H1_2(off >> 3));     \
        do {                                                    \
            const intptr_t even = off;                          \
            const intptr_t odd = off + sizeof(TYPE);            \
            TYPE n_even = *(TYPE *)(vn + H(even));              \
            TYPE n_odd = *(TYPE *)(vn + H(odd));                \
            TYPE m_even = *(TYPE *)(vm + H(even));              \
            TYPE m_odd = *(TYPE *)(vm + H(odd));                \
            if (bits & 1) {                                     \
                *(TYPE *)(vd + H(even)) = OP(n_even, n_odd);    \
            }                                                   \
            bits >>= sizeof(TYPE);                              \
            if (bits & 1) {                                     \
                *(TYPE *)(vd + H(odd)) = OP(m_even, m_odd);     \
            }                                                   \
            bits >>= sizeof(TYPE);                              \
            off = odd + sizeof(TYPE);                           \
        } while (off & 15);                                     \
    }                                                           \
}
683 
/*
 * Pairwise expander specialized for 64-bit elements.  Elements are
 * handled two at a time: the even result comes from the N pair, the
 * odd result from the M pair, and each is governed by bit 0 of its own
 * predicate byte.  Both pairs are loaded before anything is stored.
 */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    const intptr_t count = simd_oprsz(desc) / 8;                \
    uint8_t *pred = vg;                                         \
    TYPE *dst = vd;                                             \
    TYPE *src_n = vn;                                           \
    TYPE *src_m = vm;                                           \
    intptr_t e = 0;                                             \
    while (e < count) {                                         \
        TYPE n_even = src_n[e];                                 \
        TYPE n_odd = src_n[e + 1];                              \
        TYPE m_even = src_m[e];                                 \
        TYPE m_odd = src_m[e + 1];                              \
        if (pred[H1(e)] & 1) {                                  \
            dst[e] = OP(n_even, n_odd);                         \
        }                                                       \
        if (pred[H1(e + 1)] & 1) {                              \
            dst[e + 1] = OP(m_even, m_odd);                     \
        }                                                       \
        e += 2;                                                 \
    }                                                           \
}
702 
/* Pairwise add.  */
DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

/* Pairwise unsigned maximum.  */
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

/* Pairwise unsigned minimum.  */
DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

/* Pairwise signed maximum.  */
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

/* Pairwise signed minimum.  */
DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

/* Last users of the integer pairwise expanders; retire the macros.  */
#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
730 
/*
 * Floating-point pairwise expander.  It mirrors the integer pairwise
 * expander, except that OP additionally receives the STATUS pointer
 * handed to the helper.  Even results are formed from the N pair, odd
 * results from the M pair, and all four inputs are loaded before either
 * result is stored.
 */
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    intptr_t off = 0;                                                   \
    while (off < total) {                                               \
        uint16_t bits = *(uint16_t *)(vg + H1_2(off >> 3));             \
        do {                                                            \
            const intptr_t even = off;                                  \
            const intptr_t odd = off + sizeof(TYPE);                    \
            TYPE n_even = *(TYPE *)(vn + H(even));                      \
            TYPE n_odd = *(TYPE *)(vn + H(odd));                        \
            TYPE m_even = *(TYPE *)(vm + H(even));                      \
            TYPE m_odd = *(TYPE *)(vm + H(odd));                        \
            if (bits & 1) {                                             \
                *(TYPE *)(vd + H(even)) = OP(n_even, n_odd, status);    \
            }                                                           \
            bits >>= sizeof(TYPE);                                      \
            if (bits & 1) {                                             \
                *(TYPE *)(vd + H(odd)) = OP(m_even, m_odd, status);     \
            }                                                           \
            bits >>= sizeof(TYPE);                                      \
            off = odd + sizeof(TYPE);                                   \
        } while (off & 15);                                             \
    }                                                                   \
}
754 
/* Floating-point pairwise add.  */
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

/* Floating-point pairwise maximum, using the *_maxnum operations.  */
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

/* Floating-point pairwise minimum, using the *_minnum operations.  */
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

/* Floating-point pairwise maximum, using the *_max operations.  */
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

/* Floating-point pairwise minimum, using the *_min operations.  */
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

/* Last user of the floating-point pairwise expander.  */
#undef DO_ZPZZ_PAIR_FP
776 
777 /* Three-operand expander, controlled by a predicate, in which the
778  * third operand is "wide".  That is, for D = N op M, the same 64-bit
779  * value of M is used with all of the narrower values of N.
780  */
781 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
782 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
783 {                                                                       \
784     intptr_t i, opr_sz = simd_oprsz(desc);                              \
785     for (i = 0; i < opr_sz; ) {                                         \
786         uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
787         TYPEW mm = *(TYPEW *)(vm + i);                                  \
788         do {                                                            \
789             if (pg & 1) {                                               \
790                 TYPE nn = *(TYPE *)(vn + H(i));                         \
791                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
792             }                                                           \
793             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
794         } while (i & 7);                                                \
795     }                                                                   \
796 }
797 
798 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
799 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
800 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
801 
802 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
803 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
804 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
805 
806 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
807 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
808 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
809 
810 #undef DO_ZPZW
811 
/*
 * General predicated two-operand (unary) expander: D = OP(N) for every
 * element whose predicate bit is set; other elements of D are not
 * written.  Works for any element TYPE; H adjusts the byte offset.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    const intptr_t total = simd_oprsz(desc);                    \
    intptr_t off = 0;                                           \
    while (off < total) {                                       \
        /* Fetch the 16 predicate bits for this 16-byte chunk. */ \
        uint16_t pred = *(uint16_t *)(vg + H1_2(off >> 3));     \
        do {                                                    \
            if (pred & 1) {                                     \
                TYPE elem = *(TYPE *)(vn + H(off));             \
                *(TYPE *)(vd + H(off)) = OP(elem);              \
            }                                                   \
            off += sizeof(TYPE);                                \
            pred >>= sizeof(TYPE);                              \
        } while (off & 15);                                     \
    }                                                           \
}
829 
/*
 * Predicated two-operand expander specialised for 64-bit elements.
 * One predicate byte corresponds to each element; only bit 0 of that
 * byte is tested.
 */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    const intptr_t count = simd_oprsz(desc) / 8;                \
    TYPE *dst = vd, *src = vn;                                  \
    uint8_t *pred = vg;                                         \
    for (intptr_t e = 0; e < count; e++) {                      \
        if (pred[H1(e)] & 1) {                                  \
            TYPE elem = src[e];                                 \
            dst[e] = OP(elem);                                  \
        }                                                       \
    }                                                           \
}
844 
/*
 * Byte and halfword variants reuse the 32-bit clrsb32 routine and
 * subtract 24 or 16 from its result.
 * NOTE(review): presumably this removes the extra sign bits gained when
 * the narrow element is promoted to 32 bits -- confirm against clrsb32.
 */
845 #define DO_CLS_B(N)   (clrsb32(N) - 24)
846 #define DO_CLS_H(N)   (clrsb32(N) - 16)
847 
848 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
849 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
850 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
851 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
852 
/* Same scheme for clz32: subtract the 24 or 16 bits added by promotion. */
853 #define DO_CLZ_B(N)   (clz32(N) - 24)
854 #define DO_CLZ_H(N)   (clz32(N) - 16)
855 
856 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
857 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
858 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
859 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
860 
/* Per-element ctpopN, one helper per element size. */
861 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
862 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
863 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
864 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
865 
866 #define DO_CNOT(N)    (N == 0)
867 
868 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
869 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
870 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
871 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
872 
873 #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
874 
875 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
876 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
877 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
878 
879 #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
880 
881 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
882 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
883 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
884 
885 #define DO_NOT(N)    (~N)
886 
887 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
888 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
889 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
890 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
891 
/*
 * Sign- and zero-extension by truncating to the narrow type; the
 * result widens again when it is stored to the destination element.
 * The parameter is parenthesized so the cast applies to the whole
 * argument expression, not just its first operand.
 */
#define DO_SXTB(N)    ((int8_t)(N))
#define DO_SXTH(N)    ((int16_t)(N))
#define DO_SXTS(N)    ((int32_t)(N))
#define DO_UXTB(N)    ((uint8_t)(N))
#define DO_UXTH(N)    ((uint16_t)(N))
#define DO_UXTS(N)    ((uint32_t)(N))
898 
/*
 * Sign-extension helpers.  The trailing letter of the helper name is the
 * destination element size; the DO_SXT* macro selects the width of the
 * source field taken from the low bits of each element.
 */
899 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
900 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
901 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
902 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
903 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
904 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
905 
/* Zero-extension helpers, laid out the same way. */
906 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
907 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
908 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
909 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
910 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
911 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
912 
913 #define DO_ABS(N)    (N < 0 ? -N : N)
914 
915 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
916 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
917 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
918 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
919 
920 #define DO_NEG(N)    (-N)
921 
922 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
923 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
924 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
925 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
926 
927 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
928 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
929 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
930 
931 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
932 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
933 
934 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
935 
936 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
937 {
938     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
939     uint64_t *d = vd, *n = vn;
940     uint8_t *pg = vg;
941 
942     for (i = 0; i < opr_sz; i += 2) {
943         if (pg[H1(i)] & 1) {
944             uint64_t n0 = n[i + 0];
945             uint64_t n1 = n[i + 1];
946             d[i + 0] = n1;
947             d[i + 1] = n0;
948         }
949     }
950 }
951 
/* Bit reversal within each element, via the revbitN routines. */
952 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
953 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
954 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
955 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
956 
/*
 * Saturating absolute value.  min_ is the value with only the top bit
 * of X's type set.  Non-negative inputs pass through; an input equal to
 * min_ yields -min_ - 1 instead of being negated; all others are negated.
 * X is evaluated once, into x_.
 */
957 #define DO_SQABS(X) \
958     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
959        x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
960 
961 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
962 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
963 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
964 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
965 
/*
 * Saturating negate: as above but without the non-negative shortcut.
 * An input equal to min_ yields -min_ - 1; everything else is negated.
 * Note the instantiations below use unsigned element types.
 */
966 #define DO_SQNEG(X) \
967     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
968        x_ == min_ ? -min_ - 1 : -x_; })
969 
970 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
971 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
972 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
973 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
974 
/* 32-bit-only helpers that forward each element to helper_recpe_u32 / helper_rsqrte_u32. */
975 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
976 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
977 
978 /* Three-operand expander, unpredicated, in which the third operand is "wide".
979  */
980 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
981 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
982 {                                                              \
983     intptr_t i, opr_sz = simd_oprsz(desc);                     \
984     for (i = 0; i < opr_sz; ) {                                \
985         TYPEW mm = *(TYPEW *)(vm + i);                         \
986         do {                                                   \
987             TYPE nn = *(TYPE *)(vn + H(i));                    \
988             *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
989             i += sizeof(TYPE);                                 \
990         } while (i & 7);                                       \
991     }                                                          \
992 }
993 
994 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
995 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
996 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
997 
998 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
999 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1000 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1001 
1002 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1003 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1004 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1005 
1006 #undef DO_ZZW
1007 
/*
 * Retire the per-element operation macros and the two predicated
 * unary expanders defined above; none of them is used past this point.
 */
1008 #undef DO_CLS_B
1009 #undef DO_CLS_H
1010 #undef DO_CLZ_B
1011 #undef DO_CLZ_H
1012 #undef DO_CNOT
1013 #undef DO_FABS
1014 #undef DO_FNEG
1015 #undef DO_ABS
1016 #undef DO_NEG
1017 #undef DO_ZPZ
1018 #undef DO_ZPZ_D
1019 
1020 /*
1021  * Three-operand expander, unpredicated, in which the two inputs are
1022  * selected from the top or bottom half of the wide column.
1023  */
1024 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1025 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1026 {                                                                       \
1027     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1028     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1029     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1030     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1031         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1032         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1033         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
1034     }                                                                   \
1035 }
1036 
1037 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1038 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1039 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1040 
1041 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1042 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1043 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1044 
1045 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1046 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1047 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1048 
1049 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1050 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1051 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1052 
1053 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1054 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1055 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1056 
1057 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1058 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1059 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1060 
1061 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1062 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1063 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1064 
1065 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1066 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1067 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1068 
/*
 * Saturating doubling multiply producing a 16-bit result.
 * The multiply itself cannot overflow, but the doubling can, so the
 * product is added to itself with the saturating 16-bit add.
 */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t product = n * m;

    return DO_SQADD_H(product, product);
}
1075 
/*
 * Saturating doubling multiply producing a 32-bit result: the product
 * is doubled through the saturating 32-bit add.
 */
static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t product = n * m;

    return DO_SQADD_S(product, product);
}
1081 
/*
 * Saturating doubling multiply producing a 64-bit result: the product
 * is doubled through do_sqadd_d.
 */
static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t product = n * m;

    return do_sqadd_d(product, product);
}
1087 
/*
 * Saturating doubling widening multiply, built from the top/bottom
 * expander and the do_sqdmull_* routines for each result width.
 */
1088 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1089 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1090 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1091 
/* Last use of the top/bottom expander. */
1092 #undef DO_ZZZ_TB
1093 
1094 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1095 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1096 {                                                              \
1097     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1098     int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1099     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1100         TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
1101         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
1102         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
1103     }                                                          \
1104 }
1105 
1106 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1107 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1108 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1109 
1110 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1111 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1112 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1113 
1114 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1115 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1116 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1117 
1118 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1119 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1120 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1121 
1122 #undef DO_ZZZ_WTB
1123 
1124 #define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
1125 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1126 {                                                                       \
1127     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1128     intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1129     intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1130     for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
1131         TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
1132         TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
1133         *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
1134     }                                                                   \
1135 }
1136 
1137 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1138 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1139 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1140 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1141 
1142 #undef DO_ZZZ_NTB
1143 
1144 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1145 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1146 {                                                               \
1147     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1148     intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
1149     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
1150         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
1151         TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
1152         TYPEW aa = *(TYPEW *)(va + HW(i));                      \
1153         *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
1154     }                                                           \
1155 }
1156 
1157 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1158 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1159 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1160 
1161 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1162 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1163 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1164 
1165 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1166 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1167 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1168 
1169 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1170 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1171 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1172 
/*
 * Negated product.  Arguments and the whole expansion are parenthesized
 * so the macro is safe with compound arguments and inside any
 * surrounding expression.
 */
#define DO_NMUL(N, M)  (-((N) * (M)))
1174 
/*
 * Multiply-subtract forms: the accumulate expander adds OP(N, M) to A,
 * so passing the negated product DO_NMUL gives A - N * M.
 */

/* Signed multiply and subtract. */
1175 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1176 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1177 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1178 
/* Unsigned multiply and subtract. */
1179 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1180 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1181 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1182 
/* Last use of the accumulate expander. */
1183 #undef DO_ZZZW_ACC
1184 
/*
 * Narrowing expander, "bottom" form: each TYPE element of N is passed
 * through OP, the result is masked to the low half of the element
 * (sizeof(TYPE) * 4 bits) and the whole element is stored, so the high
 * half of every destination element becomes zero.
 */
#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    const intptr_t total = simd_oprsz(desc);                 \
    for (intptr_t off = 0; off < total; off += sizeof(TYPE)) { \
        TYPE wide = *(TYPE *)(vn + off);                     \
        TYPE low = OP(wide) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
        *(TYPE *)(vd + off) = low;                           \
    }                                                        \
}
1195 
/*
 * Narrowing expander, "top" form: each TYPE element of N is passed
 * through OP and stored as a TYPEN value at byte offset
 * H(sizeof(TYPEN)) inside the corresponding destination element.  The
 * other half of each destination element is not written.
 */
#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    const intptr_t total = simd_oprsz(desc);                            \
    const intptr_t top = H(sizeof(TYPEN));                              \
    for (intptr_t off = 0; off < total; off += sizeof(TYPE)) {          \
        TYPE wide = *(TYPE *)(vn + off);                                \
        *(TYPEN *)(vd + off + top) = OP(wide);                          \
    }                                                                   \
}
1205 
/*
 * Signed saturating narrow: clamp via do_sat_bhs to the range of the
 * signed type half as wide as the source element.
 */
1206 #define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
1207 #define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
1208 #define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)
1209 
/* Signed source, signed saturation, bottom form. */
1210 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1211 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1212 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1213 
/* Signed source, signed saturation, top form. */
1214 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1215 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1216 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1217 
/*
 * Unsigned saturating narrow: clamp via do_sat_bhs to [0, max] of the
 * unsigned type half as wide as the source element.
 */
1218 #define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
1219 #define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
1220 #define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)
1221 
/* Unsigned source, unsigned saturation, bottom form. */
1222 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1223 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1224 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1225 
/* Unsigned source, unsigned saturation, top form. */
1226 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1227 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1228 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1229 
/* Signed source with the unsigned clamp, bottom form. */
1230 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1231 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1232 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1233 
/* Signed source with the unsigned clamp, top form. */
1234 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1235 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1236 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1237 
/* Both narrowing expanders are finished with. */
1238 #undef DO_XTNB
1239 #undef DO_XTNT
1240 
1241 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1242 {
1243     intptr_t i, opr_sz = simd_oprsz(desc);
1244     int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1245     uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1246     uint32_t *a = va, *n = vn;
1247     uint64_t *d = vd, *m = vm;
1248 
1249     for (i = 0; i < opr_sz / 8; ++i) {
1250         uint32_t e1 = a[2 * i + H4(0)];
1251         uint32_t e2 = n[2 * i + sel] ^ inv;
1252         uint64_t c = extract64(m[i], 32, 1);
1253         /* Compute and store the entire 33-bit result at once. */
1254         d[i] = c + e1 + e2;
1255     }
1256 }
1257 
1258 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1259 {
1260     intptr_t i, opr_sz = simd_oprsz(desc);
1261     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1262     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1263     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1264 
1265     for (i = 0; i < opr_sz / 8; i += 2) {
1266         Int128 e1 = int128_make64(a[i]);
1267         Int128 e2 = int128_make64(n[i + sel] ^ inv);
1268         Int128 c = int128_make64(m[i + 1] & 1);
1269         Int128 r = int128_add(int128_add(e1, e2), c);
1270         d[i + 0] = int128_getlo(r);
1271         d[i + 1] = int128_gethi(r);
1272     }
1273 }
1274 
1275 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1276 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1277 {                                                                       \
1278     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1279     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1280     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1281     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1282         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1283         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1284         TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1285         *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1286     }                                                                   \
1287 }
1288 
1289 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1290            do_sqdmull_h, DO_SQADD_H)
1291 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1292            do_sqdmull_s, DO_SQADD_S)
1293 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1294            do_sqdmull_d, do_sqadd_d)
1295 
1296 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1297            do_sqdmull_h, DO_SQSUB_H)
1298 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1299            do_sqdmull_s, DO_SQSUB_S)
1300 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1301            do_sqdmull_d, do_sqsub_d)
1302 
1303 #undef DO_SQDMLAL
1304 
/*
 * Complex multiply-add expander over (real, imaginary) element pairs.
 *
 * The descriptor data is the rotation "rot":
 *   - rot & 1 selects which element of the N pair is used and which
 *     element of the M pair feeds the real result; the other M element
 *     feeds the imaginary result;
 *   - the real product is subtracted for rot 1 and 2;
 *   - the imaginary product is subtracted for rot 2 and 3.
 * OP(n, m, a, sub) combines one product with the accumulator element.
 */
#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    const intptr_t nelem = simd_oprsz(desc) / sizeof(TYPE);     \
    const int rot = simd_data(desc);                            \
    const int sel_a = rot & 1;                                  \
    const int sel_b = sel_a ^ 1;                                \
    const bool sub_r = (rot == 1 || rot == 2);                  \
    const bool sub_i = (rot >= 2);                              \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
    for (intptr_t re = 0; re < nelem; re += 2) {                \
        const intptr_t im = re + 1;                             \
        TYPE elt1_a = n[H(re + sel_a)];                         \
        TYPE elt2_a = m[H(re + sel_a)];                         \
        TYPE elt2_b = m[H(re + sel_b)];                         \
        d[H(re)] = OP(elt1_a, elt2_a, a[H(re)], sub_r);         \
        d[H(im)] = OP(elt1_a, elt2_b, a[H(im)], sub_i);         \
    }                                                           \
}
1322 
/*
 * Multiply-accumulate with selectable sign: A + N * M, or A - N * M when
 * S is true.  Every parameter is parenthesized so compound argument
 * expressions are evaluated as units.
 */
#define DO_CMLA(N, M, A, S) ((A) + ((N) * (M)) * ((S) ? -1 : 1))
1324 
/* Plain (wrapping) complex multiply-add at each element size. */
1325 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1326 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1327 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1328 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1329 
/*
 * Four-argument adapters from the OP(N, M, A, S) shape expected by the
 * complex expanders to the do_sqrdmlah_* routines.  All pass a constant
 * "true" after S.  The halfword and word routines also take a pointer
 * argument; a throwaway local is supplied so whatever they store there
 * is ignored.
 */
1330 #define DO_SQRDMLAH_B(N, M, A, S) \
1331     do_sqrdmlah_b(N, M, A, S, true)
1332 #define DO_SQRDMLAH_H(N, M, A, S) \
1333     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1334 #define DO_SQRDMLAH_S(N, M, A, S) \
1335     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1336 #define DO_SQRDMLAH_D(N, M, A, S) \
1337     do_sqrdmlah_d(N, M, A, S, true)
1338 
/* Complex multiply-add built on the do_sqrdmlah_* routines. */
1339 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1340 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1341 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1342 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1343 
/*
 * Indexed complex multiply-add expander.
 *
 * Descriptor data bits [1:0] hold the rotation (interpreted as in
 * DO_CMLA_FUNC) and bits [3:2] hold the index of the complex pair of M
 * to use.  The vector is processed in 16-byte segments; within each
 * segment the same indexed (real, imaginary) pair of M is combined with
 * every pair of N.
 *
 * The destination is indexed with the H parameter, exactly like n, m and
 * a.  Using a fixed H2() there would address a different element than
 * the one just computed in any instantiation whose H differs from H2
 * (the 32-bit ones pass H4).
 */
#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
{                                                                           \
    intptr_t i, j, oprsz = simd_oprsz(desc);                                \
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
    /* Convert the pair index into an element index. */                     \
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
    bool sub_r = rot == 1 || rot == 2;                                      \
    bool sub_i = rot >= 2;                                                  \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
        /* Latch the indexed M pair before this segment of D is written. */ \
        TYPE elt2_a = m[H(i + idx + sel_a)];                                \
        TYPE elt2_b = m[H(i + idx + sel_b)];                                \
        for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
            TYPE elt1_a = n[H(i + j + sel_a)];                              \
            d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);           \
            d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);   \
        }                                                                   \
    }                                                                       \
}
1364 
/* Indexed complex multiply-add, halfword and word elements only. */
1365 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1366 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1367 
/* Indexed form built on the do_sqrdmlah_* adapters. */
1368 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1369 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1370 
/*
 * Drop the complex expanders and the four-argument DO_SQRDMLAH_*
 * adapters so the names can be reused.
 */
1371 #undef DO_CMLA
1372 #undef DO_CMLA_FUNC
1373 #undef DO_CMLA_IDX_FUNC
1374 #undef DO_SQRDMLAH_B
1375 #undef DO_SQRDMLAH_H
1376 #undef DO_SQRDMLAH_S
1377 #undef DO_SQRDMLAH_D
1378 
/*
 * Complex dot product step for one 32-bit accumulator.
 *
 * N and M each bundle four signed 8-bit elements, i.e. two
 * (real, imaginary) pairs.  For each pair, the real element of N is
 * multiplied by the M element chosen by sel_a, the imaginary element of
 * N by the M element chosen by sel_b, and the latter product is scaled
 * by sub_i before both are added to the accumulator.
 */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    int32_t acc = a;

    /* Each complex pair occupies 16 bits of the bundle. */
    for (int shift = 0; shift < 32; shift += 16) {
        int32_t n_re = (int8_t)(n >> shift);
        int32_t n_im = (int8_t)(n >> (shift + 8));
        int32_t m_a = (int8_t)(m >> (shift + 8 * sel_a));
        int32_t m_b = (int8_t)(m >> (shift + 8 * sel_b));

        acc += n_re * m_a + n_im * m_b * sub_i;
    }
    return acc;
}
1393 
/*
 * Complex dot product step for one 64-bit accumulator.
 *
 * N and M each bundle four signed 16-bit elements, i.e. two
 * (real, imaginary) pairs.  For each pair, the real element of N is
 * multiplied by the M element chosen by sel_a, the imaginary element of
 * N by the M element chosen by sel_b, and the latter product is scaled
 * by sub_i before both are added to the accumulator.
 */
static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    int64_t acc = a;

    /* Each complex pair occupies 32 bits of the bundle. */
    for (int shift = 0; shift < 64; shift += 32) {
        int64_t n_re = (int16_t)(n >> (shift + 0));
        int64_t n_im = (int16_t)(n >> (shift + 16));
        int64_t m_a = (int16_t)(m >> (shift + 16 * sel_a));
        int64_t m_b = (int16_t)(m >> (shift + 16 * sel_b));

        acc += n_re * m_a + n_im * m_b * sub_i;
    }
    return acc;
}
1407 
1408 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1409                               void *va, uint32_t desc)
1410 {
1411     int opr_sz = simd_oprsz(desc);
1412     int rot = simd_data(desc);
1413     int sel_a = rot & 1;
1414     int sel_b = sel_a ^ 1;
1415     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1416     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1417 
1418     for (int e = 0; e < opr_sz / 4; e++) {
1419         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1420     }
1421 }
1422 
1423 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1424                               void *va, uint32_t desc)
1425 {
1426     int opr_sz = simd_oprsz(desc);
1427     int rot = simd_data(desc);
1428     int sel_a = rot & 1;
1429     int sel_b = sel_a ^ 1;
1430     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1431     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1432 
1433     for (int e = 0; e < opr_sz / 8; e++) {
1434         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1435     }
1436 }
1437 
1438 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1439                              void *va, uint32_t desc)
1440 {
1441     int opr_sz = simd_oprsz(desc);
1442     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1443     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1444     int sel_a = rot & 1;
1445     int sel_b = sel_a ^ 1;
1446     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1447     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1448 
1449     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1450         uint32_t seg_m = m[seg + idx];
1451         for (int e = 0; e < 4; e++) {
1452             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1453                                    sel_a, sel_b, sub_i);
1454         }
1455     }
1456 }
1457 
1458 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1459                              void *va, uint32_t desc)
1460 {
1461     int seg, opr_sz = simd_oprsz(desc);
1462     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1463     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1464     int sel_a = rot & 1;
1465     int sel_b = sel_a ^ 1;
1466     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1467     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1468 
1469     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1470         uint64_t seg_m = m[seg + idx];
1471         for (int e = 0; e < 2; e++) {
1472             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1473                                    sel_a, sel_b, sub_i);
1474         }
1475     }
1476 }
1477 
/*
 * Expander for indexed three-operand helpers with an accumulator.
 *
 * The vector is processed in 16-byte segments.  Within each segment a
 * single element of vm, located through H(idx) with idx taken from
 * simd_data(desc), is combined with every element of vn and va:
 *     d[k] = OP(n[k], mm, a[k])
 */
#define DO_ZZXZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    const intptr_t per_seg = 16 / sizeof(TYPE);                         \
    const intptr_t total = simd_oprsz(desc) / sizeof(TYPE);             \
    const intptr_t idx = simd_data(desc);                               \
    TYPE *d = vd, *n = vn, *a = va;                                     \
    TYPE *m_idx = (TYPE *)vm + H(idx);                                  \
    intptr_t seg, k;                                                    \
    for (seg = 0; seg < total; seg += per_seg) {                        \
        TYPE mm = m_idx[seg];                                           \
        for (k = seg; k < seg + per_seg; k++) {                         \
            d[k] = OP(n[k], mm, a[k]);                                  \
        }                                                               \
    }                                                                   \
}
1491 
/*
 * Element operations for the indexed SQRDMLAH helpers.  Each forwards to
 * do_sqrdmlah_{h,s,d} with the two flag arguments (false, true).  The 16-
 * and 32-bit forms also pass the address of a scratch word that is never
 * read back here.
 */
#define DO_SQRDMLAH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)

/* Indexed SQRDMLAH helpers for 16-, 32- and 64-bit elements. */
DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)

/*
 * Same as above except that the first flag argument is true.
 * NOTE(review): presumably that flag selects the subtracting (MLSH)
 * form inside do_sqrdmlah_* -- confirm against its definition.
 */
#define DO_SQRDMLSH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)

/* Indexed SQRDMLSH helpers for 16-, 32- and 64-bit elements. */
DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)

/* The expander is not needed past this point. */
#undef DO_ZZXZ
1513 
/*
 * Expander for indexed widening helpers with an accumulator.
 *
 * Descriptor data: bit 0 selects which narrow element of each wide slot
 * is read from vn; the next three bits give the index of the narrow
 * element of vm used for a whole 16-byte segment.  Both are scaled to
 * byte offsets.  For every wide slot:
 *     d = OP(widened n, widened m, a)
 */
#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    const intptr_t oprsz = simd_oprsz(desc);                              \
    const intptr_t sel =                                                  \
        extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);              \
    const intptr_t idx =                                                  \
        extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN);          \
    intptr_t seg, off;                                                    \
    for (seg = 0; seg < oprsz; seg += 16) {                               \
        TYPEW mm = *(TYPEN *)(vm + HN(seg + idx));                        \
        for (off = seg; off < seg + 16; off += sizeof(TYPEW)) {           \
            TYPEW nn = *(TYPEN *)(vn + HN(off + sel));                    \
            TYPEW aa = *(TYPEW *)(va + HW(off));                          \
            *(TYPEW *)(vd + HW(off)) = OP(nn, mm, aa);                    \
        }                                                                 \
    }                                                                     \
}
1529 
/*
 * Multiply-accumulate: A + N * M.  Every argument is parenthesised so
 * that expression arguments keep their intended grouping.
 */
#define DO_MLA(N, M, A)  ((A) + (N) * (M))
1531 
/*
 * Indexed multiply-add into wide elements (DO_MLA), in signed (smlal)
 * and unsigned (umlal) forms for 16->32 and 32->64 bit element widths.
 */
DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1536 
/*
 * Multiply-subtract: A - N * M.  Every argument is parenthesised so
 * that expression arguments keep their intended grouping.
 */
#define DO_MLS(N, M, A)  ((A) - (N) * (M))
1538 
/*
 * Indexed multiply-subtract from wide elements (DO_MLS), in signed
 * (smlsl) and unsigned (umlsl) forms.
 */
DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)

/* Accumulate the result of do_sqdmull_* into A through the sqadd op. */
#define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)

/* Remove the result of do_sqdmull_* from A through the sqsub op. */
#define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)

/* These helpers are local to the instantiations above. */
#undef DO_MLA
#undef DO_MLS
#undef DO_ZZXW
1559 
/*
 * Expander for indexed widening helpers without an accumulator.
 *
 * Descriptor data: bit 0 selects which narrow element of each wide slot
 * is read from vn; the next three bits give the index of the narrow
 * element of vm used for a whole 16-byte segment.  Both are scaled to
 * byte offsets.  For every wide slot:
 *     d = OP(widened n, widened m)
 */
#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
{                                                                         \
    const intptr_t oprsz = simd_oprsz(desc);                              \
    const intptr_t sel =                                                  \
        extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);              \
    const intptr_t idx =                                                  \
        extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN);          \
    intptr_t seg, off;                                                    \
    for (seg = 0; seg < oprsz; seg += 16) {                               \
        TYPEW mm = *(TYPEN *)(vm + HN(seg + idx));                        \
        for (off = seg; off < seg + 16; off += sizeof(TYPEW)) {           \
            TYPEW nn = *(TYPEN *)(vn + HN(off + sel));                    \
            *(TYPEW *)(vd + HW(off)) = OP(nn, mm);                        \
        }                                                                 \
    }                                                                     \
}
1574 
/* Indexed widening helpers built on do_sqdmull_{s,d}. */
DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

/* Indexed widening multiply on signed element types. */
DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

/* Indexed widening multiply on unsigned element types. */
DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

/* The expander is not needed past this point. */
#undef DO_ZZX
1585 
/*
 * Expander for the bit-permute helpers.  For each TYPE-sized element,
 * OP receives the element of vn, the matching element of vm and the
 * element width in bits; its result is stored to vd.
 */
#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    const intptr_t opr_sz = simd_oprsz(desc);                  \
    intptr_t off = 0;                                          \
    while (off < opr_sz) {                                     \
        TYPE nn = *(TYPE *)(vn + off);                         \
        TYPE mm = *(TYPE *)(vm + off);                         \
        *(TYPE *)(vd + off) = OP(nn, mm, sizeof(TYPE) * 8);    \
        off += sizeof(TYPE);                                   \
    }                                                          \
}
1596 
/*
 * Gather the bits of @data selected by @mask into the low end of the
 * result, preserving their order.  Only the low @n bits of @data and
 * @mask are examined; unused high result bits are zero.
 */
static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t packed = 0;
    int out = 0;

    for (int pos = 0; pos < n; pos++) {
        if (!((mask >> pos) & 1)) {
            continue;
        }
        /* Append this selected bit at the next free output position. */
        packed |= ((data >> pos) & 1) << out;
        out++;
    }
    return packed;
}
1610 
/* bitextract() applied per element, for 8/16/32/64-bit element sizes. */
DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1615 
/*
 * Scatter the low bits of @data, in order, to the positions of the set
 * bits of @mask.  Only the low @n bits of @mask are examined; result
 * bits at clear mask positions are zero.
 */
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
{
    uint64_t spread = 0;
    int src = 0;

    for (int dst = 0; dst < n; dst++) {
        if (!((mask >> dst) & 1)) {
            continue;
        }
        /* Consume the next source bit and place it at this mask slot. */
        spread |= ((data >> src) & 1) << dst;
        src++;
    }
    return spread;
}
1629 
/* bitdeposit() applied per element, for 8/16/32/64-bit element sizes. */
DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1634 
/*
 * Partition the low @n bits of @data by @mask: the bits at set mask
 * positions are packed, in order, into the low end of the result, and
 * the bits at clear mask positions are packed, in order, directly
 * above them.
 */
static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
{
    uint64_t resm = 0, resu = 0;
    int db, rbm = 0, rbu = 0;

    for (db = 0; db < n; ++db) {
        uint64_t val = (data >> db) & 1;
        if ((mask >> db) & 1) {
            resm |= val << rbm++;
        } else {
            resu |= val << rbu++;
        }
    }

    /*
     * When n is 64 and every mask bit is set, rbm reaches 64.  A shift
     * by the full width of the type is undefined in C, and in that case
     * there are no unselected bits to merge anyway (resu is 0).
     */
    if (rbm < 64) {
        resm |= resu << rbm;
    }
    return resm;
}
1651 
/* bitgroup() applied per element, for 8/16/32/64-bit element sizes. */
DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)

/* The expander is not needed past this point. */
#undef DO_BITPERM
1658 
/*
 * Expander for the complex-add helpers.  Elements are handled as
 * (real, imaginary) pairs.  With simd_data(desc) non-zero:
 *     d.re = ADD_OP(n.re, m.im);  d.im = SUB_OP(n.im, m.re)
 * otherwise:
 *     d.re = SUB_OP(n.re, m.im);  d.im = ADD_OP(n.im, m.re)
 * All four inputs of a pair are loaded before either result is stored.
 */
#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    const intptr_t opr_sz = simd_oprsz(desc);                   \
    const int sub_r = simd_data(desc);                          \
    intptr_t re;                                                \
    for (re = 0; re < opr_sz; re += 2 * sizeof(TYPE)) {         \
        const intptr_t im = re + sizeof(TYPE);                  \
        TYPE n_re = *(TYPE *)(vn + H(re));                      \
        TYPE n_im = *(TYPE *)(vn + H(im));                      \
        TYPE m_re = *(TYPE *)(vm + H(re));                      \
        TYPE m_im = *(TYPE *)(vm + H(im));                      \
        TYPE out_re, out_im;                                    \
        if (sub_r) {                                            \
            out_re = ADD_OP(n_re, m_im);                        \
            out_im = SUB_OP(n_im, m_re);                        \
        } else {                                                \
            out_re = SUB_OP(n_re, m_im);                        \
            out_im = ADD_OP(n_im, m_re);                        \
        }                                                       \
        *(TYPE *)(vd + H(re)) = out_re;                         \
        *(TYPE *)(vd + H(im)) = out_im;                         \
    }                                                           \
}
1688 
/* Complex add built on DO_ADD / DO_SUB, for each element size. */
DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)

/*
 * Complex add built on the DO_SQADD_* / DO_SQSUB_* operations
 * (do_sqadd_d / do_sqsub_d for 64-bit elements).
 */
DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)

/* The expander is not needed past this point. */
#undef DO_CADD
1700 
/*
 * Expander for widening shift-left by immediate.  Bit 0 of
 * simd_data(desc) selects which narrow element of each wide slot is
 * read from vn; the remaining bits hold the shift count.  The narrow
 * value is converted to TYPEW before it is shifted.
 */
#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
{                                                              \
    const intptr_t opr_sz = simd_oprsz(desc);                  \
    const intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);\
    const int shift = simd_data(desc) >> 1;                    \
    intptr_t off = 0;                                          \
    while (off < opr_sz) {                                     \
        TYPEW nn = *(TYPEN *)(vn + HN(off + sel));             \
        *(TYPEW *)(vd + HW(off)) = nn << shift;                \
        off += sizeof(TYPEW);                                  \
    }                                                          \
}
1712 
/* Widening shift-left with signed source and destination types. */
DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)

/* Widening shift-left with unsigned source and destination types. */
DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)

/* The expander is not needed past this point. */
#undef DO_ZZI_SHLL
1722 
/*
 * Predicated reduction expander: fold every active element of vn into
 * a single value with the two-operand OP, starting from INIT.
 *
 * TYPERED is the type the reduction is carried out in; TYPERET is the
 * type the result is cast to on return.  They differ because of sign
 * extension: for SMAX, say, TYPERED has to be signed, while TYPERET has
 * to be unsigned so that e.g. a 32-bit result is not sign-extended into
 * the uint64_t that the helper ABI returns.
 *
 * ??? Vectorizing this by hand would change the order in which the
 * elements are reduced.  For integer operands that is perfectly fine.
 *
 * One 16-bit predicate word is loaded per 16 bytes of vector; bit 0 of
 * the word is tested and the word is shifted right by the element size
 * after each element.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    const intptr_t opr_sz = simd_oprsz(desc);              \
    TYPERED acc = INIT;                                    \
    intptr_t off = 0;                                      \
    while (off < opr_sz) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(off >> 3));  \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEELT nn = *(TYPEELT *)(vn + H(off));    \
                acc = OP(acc, nn);                         \
            }                                              \
            off += sizeof(TYPEELT);                        \
            pg >>= sizeof(TYPEELT);                        \
        } while (off & 15);                                \
    }                                                      \
    return (TYPERET)acc;                                   \
}
1749 
/*
 * Predicated reduction expander for 64-bit elements.  Each element is
 * governed by bit 0 of its own predicate byte, so the predicate is
 * indexed per element instead of being shifted.
 */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    const intptr_t count = simd_oprsz(desc) / 8;           \
    TYPEE *n = vn;                                         \
    uint8_t *pg = vg;                                      \
    TYPER acc = INIT;                                      \
    intptr_t elt;                                          \
    for (elt = 0; elt < count; elt++) {                    \
        if (!(pg[H1(elt)] & 1)) {                          \
            continue;                                      \
        }                                                  \
        TYPEE nn = n[elt];                                 \
        acc = OP(acc, nn);                                 \
    }                                                      \
    return acc;                                            \
}
1765 
/* Bitwise OR reduction; the starting value is 0. */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

/* Bitwise exclusive-OR reduction; the starting value is 0. */
DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

/* Bitwise AND reduction; -1 converts to all-ones in the unsigned type. */
DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

/* Sum of signed elements, accumulated in a 64-bit value. */
DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

/* Sum of unsigned elements, accumulated in a 64-bit value. */
DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

/*
 * Signed maximum, starting from the type's minimum.  The reduction runs
 * in the signed type; the sub-64-bit forms return through the unsigned
 * type of the same width.
 */
DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

/* Unsigned maximum, starting from 0. */
DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

/*
 * Signed minimum, starting from the type's maximum.  Same signed /
 * unsigned split between reduction and return types as for smaxv.
 */
DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

/* Unsigned minimum; -1 converts to the largest value of the type. */
DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

/* The reduction expanders are not needed past this point. */
#undef DO_VPZ
#undef DO_VPZ_D
1812 
/*
 * Unpredicated expander taking one source vector and one scalar: the
 * 64-bit scalar is converted to TYPE once, then every element of vd is
 * set to OP(element of vn, scalar).
 */
#define DO_ZZI(NAME, TYPE, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
{                                                                    \
    const intptr_t count = simd_oprsz(desc) / sizeof(TYPE);          \
    TYPE *d = vd, *n = vn;                                           \
    TYPE s = s64;                                                    \
    intptr_t elt = 0;                                                \
    while (elt < count) {                                            \
        d[elt] = OP(n[elt], s);                                      \
        elt++;                                                       \
    }                                                                \
}
1823 
/*
 * Reversed subtract: Y - X.  Both arguments are parenthesised so that
 * expression arguments keep their intended grouping.
 */
#define DO_SUBR(X, Y)   ((Y) - (X))
1825 
/* Reversed subtract: each result is the scalar minus the element. */
DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

/* Maximum against the scalar, signed element types. */
DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

/* Minimum against the scalar, signed element types. */
DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

/* Maximum against the scalar, unsigned element types. */
DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

/* Minimum against the scalar, unsigned element types. */
DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

/* Drop the element-operation macros; none is used under these names below. */
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
#undef DO_SUBR
1868 
1869 /* Similar to the ARM LastActiveElement pseudocode function, except the
1870    result is multiplied by the element size.  This includes the not found
1871    indication; e.g. not found for esz=3 is -8.  */
1872 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1873 {
1874     uint64_t mask = pred_esz_masks[esz];
1875     intptr_t i = words;
1876 
1877     do {
1878         uint64_t this_g = g[--i] & mask;
1879         if (this_g) {
1880             return i * 64 + (63 - clz64(this_g));
1881         }
1882     } while (i > 0);
1883     return (intptr_t)-1 << esz;
1884 }
1885 
1886 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1887 {
1888     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1889     uint32_t flags = PREDTEST_INIT;
1890     uint64_t *d = vd, *g = vg;
1891     intptr_t i = 0;
1892 
1893     do {
1894         uint64_t this_d = d[i];
1895         uint64_t this_g = g[i];
1896 
1897         if (this_g) {
1898             if (!(flags & 4)) {
1899                 /* Set in D the first bit of G.  */
1900                 this_d |= this_g & -this_g;
1901                 d[i] = this_d;
1902             }
1903             flags = iter_predtest_fwd(this_d, this_g, flags);
1904         }
1905     } while (++i < words);
1906 
1907     return flags;
1908 }
1909 
1910 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1911 {
1912     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1913     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1914     uint32_t flags = PREDTEST_INIT;
1915     uint64_t *d = vd, *g = vg, esz_mask;
1916     intptr_t i, next;
1917 
1918     next = last_active_element(vd, words, esz) + (1 << esz);
1919     esz_mask = pred_esz_masks[esz];
1920 
1921     /* Similar to the pseudocode for pnext, but scaled by ESZ
1922        so that we find the correct bit.  */
1923     if (next < words * 64) {
1924         uint64_t mask = -1;
1925 
1926         if (next & 63) {
1927             mask = ~((1ull << (next & 63)) - 1);
1928             next &= -64;
1929         }
1930         do {
1931             uint64_t this_g = g[next / 64] & esz_mask & mask;
1932             if (this_g != 0) {
1933                 next = (next & -64) + ctz64(this_g);
1934                 break;
1935             }
1936             next += 64;
1937             mask = -1;
1938         } while (next < words * 64);
1939     }
1940 
1941     i = 0;
1942     do {
1943         uint64_t this_d = 0;
1944         if (i == next / 64) {
1945             this_d = 1ull << (next & 63);
1946         }
1947         d[i] = this_d;
1948         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1949     } while (++i < words);
1950 
1951     return flags;
1952 }
1953 
1954 /*
1955  * Copy Zn into Zd, and store zero into inactive elements.
1956  * If inv, store zeros into the active elements.
1957  */
1958 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1959 {
1960     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1961     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1962     uint64_t *d = vd, *n = vn;
1963     uint8_t *pg = vg;
1964 
1965     for (i = 0; i < opr_sz; i += 1) {
1966         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1967     }
1968 }
1969 
1970 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1971 {
1972     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1973     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1974     uint64_t *d = vd, *n = vn;
1975     uint8_t *pg = vg;
1976 
1977     for (i = 0; i < opr_sz; i += 1) {
1978         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1979     }
1980 }
1981 
1982 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1983 {
1984     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1985     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1986     uint64_t *d = vd, *n = vn;
1987     uint8_t *pg = vg;
1988 
1989     for (i = 0; i < opr_sz; i += 1) {
1990         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1991     }
1992 }
1993 
1994 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1995 {
1996     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1997     uint64_t *d = vd, *n = vn;
1998     uint8_t *pg = vg;
1999     uint8_t inv = simd_data(desc);
2000 
2001     for (i = 0; i < opr_sz; i += 1) {
2002         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2003     }
2004 }
2005 
/*
 * Predicated expander with an immediate second operand, taken from
 * simd_data(desc).  Active elements get OP(element of vn, imm);
 * inactive elements of vd are not written.
 *
 * One 16-bit predicate word is loaded per 16 bytes of vector; bit 0 of
 * the word is tested and the word is shifted right by the element size
 * after each element.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    const intptr_t opr_sz = simd_oprsz(desc);                   \
    TYPE imm = simd_data(desc);                                 \
    intptr_t off = 0;                                           \
    while (off < opr_sz) {                                      \
        uint16_t pg = *(uint16_t *)(vg + H1_2(off >> 3));       \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(off));               \
                *(TYPE *)(vd + H(off)) = OP(nn, imm);           \
            }                                                   \
            off += sizeof(TYPE);                                \
            pg >>= sizeof(TYPE);                                \
        } while (off & 15);                                     \
    }                                                           \
}
2024 
/*
 * The same predicated immediate expander, specialized for 64-bit
 * elements: each element is governed by bit 0 of its own predicate
 * byte, and inactive elements of vd are not written.
 */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    const intptr_t count = simd_oprsz(desc) / 8;                \
    TYPE *d = vd, *n = vn;                                      \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pg = vg;                                           \
    intptr_t elt;                                               \
    for (elt = 0; elt < count; elt++) {                         \
        if (!(pg[H1(elt)] & 1)) {                               \
            continue;                                           \
        }                                                       \
        TYPE nn = n[elt];                                       \
        d[elt] = OP(nn, imm);                                   \
    }                                                           \
}
2040 
2041 #define DO_SHR(N, M)  (N >> M)
2042 #define DO_SHL(N, M)  (N << M)
2043 
2044 /* Arithmetic shift right for division.  This rounds negative numbers
2045    toward zero as per signed division.  Therefore before shifting,
2046    when N is negative, add 2**M-1.  */
2047 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
2048 
/*
 * Unsigned rounding shift right: x >> sh, plus the last bit shifted
 * out.  A shift of exactly 64 leaves only that rounding bit (bit 63 of
 * x); larger shifts give 0.  The sh < 64 path evaluates x >> (sh - 1),
 * so callers are expected to pass sh >= 1.
 */
static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        const uint64_t round = (x >> (sh - 1)) & 1;

        return (x >> sh) + round;
    }
    return sh == 64 ? x >> 63 : 0;
}
2059 
/*
 * Signed rounding shift right: x >> sh, plus the last bit shifted out.
 * The sh < 64 path evaluates x >> (sh - 1), so callers are expected to
 * pass sh >= 1.
 */
static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        const int64_t round = (x >> (sh - 1)) & 1;

        return (x >> sh) + round;
    }
    /* Rounding the sign bit always produces 0. */
    return 0;
}
2069 
/* Shift right by immediate on signed element types. */
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

/* Shift right by immediate on unsigned element types. */
DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

/* Shift left by immediate. */
DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

/* Shift right for division (DO_ASRD), signed element types. */
DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

/* SVE2 bitwise shift by immediate */
DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)

DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)

/* Rounding shift right, signed (do_srshr) and unsigned (do_urshr). */
DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)

DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)

/*
 * Two-argument wrappers around do_suqrshl_bhs / do_suqrshl_d.  Each
 * passes false for the flag argument and the address of a scratch word
 * that is never read back here.  The _d wrapper reuses the name of the
 * routine it calls; this works because a macro name is not expanded
 * again inside its own replacement, so the inner four-argument call
 * reaches the do_suqrshl_d declared elsewhere.
 */
#define do_suqrshl_b(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_suqrshl_h(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_suqrshl_s(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
#define do_suqrshl_d(n, m) \
   ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })

/* SQSHLU by immediate, built on the wrappers above. */
DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)

/* DO_SHR and DO_SHL stay defined; they are used again further down. */
#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
2128 
/*
 * Expander for narrowing shifts that write the whole wide slot: each
 * TYPEW element of vn goes through OP with the shift from
 * simd_data(desc), the result is converted to TYPEN, and that value is
 * stored back as a full TYPEW element of vd.
 */
#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    const intptr_t opr_sz = simd_oprsz(desc);                \
    const int shift = simd_data(desc);                       \
    intptr_t off = 0;                                        \
    while (off < opr_sz) {                                   \
        TYPEW nn = *(TYPEW *)(vn + off);                     \
        TYPEN narrow = OP(nn, shift);                        \
        *(TYPEW *)(vd + off) = narrow;                       \
        off += sizeof(TYPEW);                                \
    }                                                        \
}
2139 
/*
 * Expander for narrowing shifts that write only one narrow element per
 * wide slot: each TYPEW element of vn goes through OP with the shift
 * from simd_data(desc), and the result is stored as the TYPEN element
 * at byte offset sizeof(TYPEN) within the slot of vd.  The other narrow
 * element of the slot is not written.
 */
#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    const intptr_t opr_sz = simd_oprsz(desc);                     \
    const int shift = simd_data(desc);                            \
    intptr_t off = 0;                                             \
    while (off < opr_sz) {                                        \
        TYPEW nn = *(TYPEW *)(vn + HW(off));                      \
        *(TYPEN *)(vd + HN(off + sizeof(TYPEN))) = OP(nn, shift); \
        off += sizeof(TYPEW);                                     \
    }                                                             \
}
2150 
/* Narrowing shift right: plain (DO_SHR) forms. */
DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)

DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)

/* Narrowing shift right: rounding (do_urshr) forms. */
DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)

DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)

/*
 * Signed input shifted right as a 64-bit value, then passed to
 * do_sat_bhs with the bounds of the unsigned narrow type.  The _D form
 * limits the shift count to 63 before shifting.
 */
#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
#define DO_SQSHRUN_D(x, sh) \
    do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)

DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)

DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)

/* As above, but the shift is the rounding do_srshr. */
#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)

DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)

DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)

/*
 * Signed input shifted right, then passed to do_sat_bhs with the
 * bounds of the signed narrow type.  The instantiations still name an
 * unsigned TYPEN, so DO_SHRNB zero-extends the narrowed value when it
 * stores the wide slot.
 */
#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)

DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2203 
2204 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2205 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2206 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2207 
2208 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2209 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2210 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2211 
2212 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2213 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2214 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2215 
2216 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2217 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2218 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2219 
2220 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2221 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2222 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2223 
2224 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2225 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2226 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2227 
2228 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2229 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2230 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2231 
2232 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2233 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2234 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2235 
2236 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2237 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2238 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2239 
2240 #undef DO_SHRNB
2241 #undef DO_SHRNT
2242 
/*
 * Expand a two-source narrowing helper (bottom form) called NAME.
 *
 * For each TYPEW-sized element, OP receives the element of vn, the
 * element of vm and the constant SHIFT.  The result is converted to
 * TYPEN and stored back as a whole TYPEW element of vd.
 */
#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
{                                                                           \
    const intptr_t total = simd_oprsz(desc);                                \
    intptr_t ofs = 0;                                                       \
    while (ofs < total) {                                                   \
        TYPEW lhs = *(TYPEW *)(vn + ofs);                                   \
        TYPEW rhs = *(TYPEW *)(vm + ofs);                                   \
        TYPEW *out = (TYPEW *)(vd + ofs);                                   \
        *out = (TYPEN)OP(lhs, rhs, SHIFT);                                  \
        ofs += sizeof(TYPEW);                                               \
    }                                                                       \
}
2253 
/*
 * Expand a two-source narrowing helper (top form) called NAME.
 *
 * For each TYPEW-sized element (addressed through HW), OP receives the
 * element of vn, the element of vm and the constant SHIFT.  Only a
 * TYPEN-sized store is made, at (element offset + sizeof(TYPEN))
 * adjusted through HN; the other TYPEN half of the destination element
 * is not written.
 */
#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
{                                                                           \
    const intptr_t total = simd_oprsz(desc);                                \
    intptr_t ofs = 0;                                                       \
    while (ofs < total) {                                                   \
        TYPEW lhs = *(TYPEW *)(vn + HW(ofs));                               \
        TYPEW rhs = *(TYPEW *)(vm + HW(ofs));                               \
        TYPEN *out = (TYPEN *)(vd + HN(ofs + sizeof(TYPEN)));               \
        *out = OP(lhs, rhs, SHIFT);                                         \
        ofs += sizeof(TYPEW);                                               \
    }                                                                       \
}
2264 
2265 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2266 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2267 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2268 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2269 
2270 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2271 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2272 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2273 
2274 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2275 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2276 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2277 
2278 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2279 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2280 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2281 
2282 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2283 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2284 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2285 
2286 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2287 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2288 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2289 
2290 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2291 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2292 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2293 
2294 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2295 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2296 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2297 
2298 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2299 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2300 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2301 
2302 #undef DO_RSUBHN
2303 #undef DO_SUBHN
2304 #undef DO_RADDHN
2305 #undef DO_ADDHN
2306 
2307 #undef DO_BINOPNB
2308 
/*
 * General predicated four-operand expander (vd, va, vn, vm under vg).
 *
 * The vector is walked in 16-byte segments.  For each segment a 16-bit
 * predicate word is loaded from vg (one predicate bit per vector byte);
 * its low bit governs the current element and it is shifted right by
 * sizeof(TYPE) after every element.  Active elements receive
 * OP(a, n, m); inactive elements of vd are not written.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    const intptr_t total = simd_oprsz(desc);                  \
    intptr_t ofs = 0;                                         \
    while (ofs < total) {                                     \
        uint16_t bits = *(uint16_t *)(vg + H1_2(ofs >> 3));   \
        do {                                                  \
            if (bits & 1) {                                   \
                TYPE acc = *(TYPE *)(va + H(ofs));            \
                TYPE lhs = *(TYPE *)(vn + H(ofs));            \
                TYPE rhs = *(TYPE *)(vm + H(ofs));            \
                *(TYPE *)(vd + H(ofs)) = OP(acc, lhs, rhs);   \
            }                                                 \
            ofs += sizeof(TYPE);                              \
            bits >>= sizeof(TYPE);                            \
        } while (ofs & 15);                                   \
    }                                                         \
}
2329 
/*
 * The 64-bit-element variant of the predicated four-operand expander.
 * Each element is governed by bit 0 of its own predicate byte; inactive
 * elements of vd are not written.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    const intptr_t nelem = simd_oprsz(desc) / 8;              \
    TYPE *dst = vd, *acc = va, *lhs = vn, *rhs = vm;          \
    uint8_t *pred = vg;                                       \
    intptr_t e;                                               \
    for (e = 0; e < nelem; e++) {                             \
        if (!(pred[H1(e)] & 1)) {                             \
            continue;                                         \
        }                                                     \
        TYPE aa = acc[e], nn = lhs[e], mm = rhs[e];           \
        dst[e] = OP(aa, nn, mm);                              \
    }                                                         \
}
2345 
2346 #define DO_MLA(A, N, M)  (A + N * M)
2347 #define DO_MLS(A, N, M)  (A - N * M)
2348 
2349 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2350 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2351 
2352 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2353 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2354 
2355 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2356 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2357 
2358 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2359 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2360 
2361 #undef DO_MLA
2362 #undef DO_MLS
2363 #undef DO_ZPZZZ
2364 #undef DO_ZPZZZ_D
2365 
2366 void HELPER(sve_index_b)(void *vd, uint32_t start,
2367                          uint32_t incr, uint32_t desc)
2368 {
2369     intptr_t i, opr_sz = simd_oprsz(desc);
2370     uint8_t *d = vd;
2371     for (i = 0; i < opr_sz; i += 1) {
2372         d[H1(i)] = start + i * incr;
2373     }
2374 }
2375 
2376 void HELPER(sve_index_h)(void *vd, uint32_t start,
2377                          uint32_t incr, uint32_t desc)
2378 {
2379     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2380     uint16_t *d = vd;
2381     for (i = 0; i < opr_sz; i += 1) {
2382         d[H2(i)] = start + i * incr;
2383     }
2384 }
2385 
2386 void HELPER(sve_index_s)(void *vd, uint32_t start,
2387                          uint32_t incr, uint32_t desc)
2388 {
2389     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2390     uint32_t *d = vd;
2391     for (i = 0; i < opr_sz; i += 1) {
2392         d[H4(i)] = start + i * incr;
2393     }
2394 }
2395 
2396 void HELPER(sve_index_d)(void *vd, uint64_t start,
2397                          uint64_t incr, uint32_t desc)
2398 {
2399     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2400     uint64_t *d = vd;
2401     for (i = 0; i < opr_sz; i += 1) {
2402         d[i] = start + i * incr;
2403     }
2404 }
2405 
2406 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2407 {
2408     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2409     uint32_t sh = simd_data(desc);
2410     uint32_t *d = vd, *n = vn, *m = vm;
2411     for (i = 0; i < opr_sz; i += 1) {
2412         d[i] = n[i] + (m[i] << sh);
2413     }
2414 }
2415 
2416 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2417 {
2418     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2419     uint64_t sh = simd_data(desc);
2420     uint64_t *d = vd, *n = vn, *m = vm;
2421     for (i = 0; i < opr_sz; i += 1) {
2422         d[i] = n[i] + (m[i] << sh);
2423     }
2424 }
2425 
2426 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2427 {
2428     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2429     uint64_t sh = simd_data(desc);
2430     uint64_t *d = vd, *n = vn, *m = vm;
2431     for (i = 0; i < opr_sz; i += 1) {
2432         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2433     }
2434 }
2435 
2436 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2437 {
2438     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2439     uint64_t sh = simd_data(desc);
2440     uint64_t *d = vd, *n = vn, *m = vm;
2441     for (i = 0; i < opr_sz; i += 1) {
2442         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2443     }
2444 }
2445 
2446 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2447 {
2448     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2449     static const uint16_t coeff[] = {
2450         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2451         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2452         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2453         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2454     };
2455     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2456     uint16_t *d = vd, *n = vn;
2457 
2458     for (i = 0; i < opr_sz; i++) {
2459         uint16_t nn = n[i];
2460         intptr_t idx = extract32(nn, 0, 5);
2461         uint16_t exp = extract32(nn, 5, 5);
2462         d[i] = coeff[idx] | (exp << 10);
2463     }
2464 }
2465 
2466 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2467 {
2468     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2469     static const uint32_t coeff[] = {
2470         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2471         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2472         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2473         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2474         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2475         0x1ef532, 0x20b051, 0x227043, 0x243516,
2476         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2477         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2478         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2479         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2480         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2481         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2482         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2483         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2484         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2485         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2486     };
2487     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2488     uint32_t *d = vd, *n = vn;
2489 
2490     for (i = 0; i < opr_sz; i++) {
2491         uint32_t nn = n[i];
2492         intptr_t idx = extract32(nn, 0, 6);
2493         uint32_t exp = extract32(nn, 6, 8);
2494         d[i] = coeff[idx] | (exp << 23);
2495     }
2496 }
2497 
2498 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2499 {
2500     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2501     static const uint64_t coeff[] = {
2502         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2503         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2504         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2505         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2506         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2507         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2508         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2509         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2510         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2511         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2512         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2513         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2514         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2515         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2516         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2517         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2518         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2519         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2520         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2521         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2522         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2523         0xFA7C1819E90D8ull,
2524     };
2525     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2526     uint64_t *d = vd, *n = vn;
2527 
2528     for (i = 0; i < opr_sz; i++) {
2529         uint64_t nn = n[i];
2530         intptr_t idx = extract32(nn, 0, 6);
2531         uint64_t exp = extract32(nn, 6, 11);
2532         d[i] = coeff[idx] | (exp << 52);
2533     }
2534 }
2535 
2536 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2537 {
2538     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2539     uint16_t *d = vd, *n = vn, *m = vm;
2540     for (i = 0; i < opr_sz; i += 1) {
2541         uint16_t nn = n[i];
2542         uint16_t mm = m[i];
2543         if (mm & 1) {
2544             nn = float16_one;
2545         }
2546         d[i] = nn ^ (mm & 2) << 14;
2547     }
2548 }
2549 
2550 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2551 {
2552     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2553     uint32_t *d = vd, *n = vn, *m = vm;
2554     for (i = 0; i < opr_sz; i += 1) {
2555         uint32_t nn = n[i];
2556         uint32_t mm = m[i];
2557         if (mm & 1) {
2558             nn = float32_one;
2559         }
2560         d[i] = nn ^ (mm & 2) << 30;
2561     }
2562 }
2563 
2564 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2565 {
2566     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2567     uint64_t *d = vd, *n = vn, *m = vm;
2568     for (i = 0; i < opr_sz; i += 1) {
2569         uint64_t nn = n[i];
2570         uint64_t mm = m[i];
2571         if (mm & 1) {
2572             nn = float64_one;
2573         }
2574         d[i] = nn ^ (mm & 2) << 62;
2575     }
2576 }
2577 
2578 /*
2579  * Signed saturating addition with scalar operand.
2580  */
2581 
2582 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2583 {
2584     intptr_t i, oprsz = simd_oprsz(desc);
2585 
2586     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2587         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2588     }
2589 }
2590 
2591 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2592 {
2593     intptr_t i, oprsz = simd_oprsz(desc);
2594 
2595     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2596         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2597     }
2598 }
2599 
2600 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2601 {
2602     intptr_t i, oprsz = simd_oprsz(desc);
2603 
2604     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2605         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2606     }
2607 }
2608 
2609 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2610 {
2611     intptr_t i, oprsz = simd_oprsz(desc);
2612 
2613     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2614         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2615     }
2616 }
2617 
2618 /*
2619  * Unsigned saturating addition with scalar operand.
2620  */
2621 
2622 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2623 {
2624     intptr_t i, oprsz = simd_oprsz(desc);
2625 
2626     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2627         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2628     }
2629 }
2630 
2631 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2632 {
2633     intptr_t i, oprsz = simd_oprsz(desc);
2634 
2635     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2636         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2637     }
2638 }
2639 
2640 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2641 {
2642     intptr_t i, oprsz = simd_oprsz(desc);
2643 
2644     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2645         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2646     }
2647 }
2648 
2649 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2650 {
2651     intptr_t i, oprsz = simd_oprsz(desc);
2652 
2653     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2654         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2655     }
2656 }
2657 
2658 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2659 {
2660     intptr_t i, oprsz = simd_oprsz(desc);
2661 
2662     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2663         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2664     }
2665 }
2666 
2667 /* Two operand predicated copy immediate with merge.  All valid immediates
2668  * can fit within 17 signed bits in the simd_data field.
2669  */
2670 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2671                          uint64_t mm, uint32_t desc)
2672 {
2673     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2674     uint64_t *d = vd, *n = vn;
2675     uint8_t *pg = vg;
2676 
2677     mm = dup_const(MO_8, mm);
2678     for (i = 0; i < opr_sz; i += 1) {
2679         uint64_t nn = n[i];
2680         uint64_t pp = expand_pred_b(pg[H1(i)]);
2681         d[i] = (mm & pp) | (nn & ~pp);
2682     }
2683 }
2684 
2685 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2686                          uint64_t mm, uint32_t desc)
2687 {
2688     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2689     uint64_t *d = vd, *n = vn;
2690     uint8_t *pg = vg;
2691 
2692     mm = dup_const(MO_16, mm);
2693     for (i = 0; i < opr_sz; i += 1) {
2694         uint64_t nn = n[i];
2695         uint64_t pp = expand_pred_h(pg[H1(i)]);
2696         d[i] = (mm & pp) | (nn & ~pp);
2697     }
2698 }
2699 
2700 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2701                          uint64_t mm, uint32_t desc)
2702 {
2703     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2704     uint64_t *d = vd, *n = vn;
2705     uint8_t *pg = vg;
2706 
2707     mm = dup_const(MO_32, mm);
2708     for (i = 0; i < opr_sz; i += 1) {
2709         uint64_t nn = n[i];
2710         uint64_t pp = expand_pred_s(pg[H1(i)]);
2711         d[i] = (mm & pp) | (nn & ~pp);
2712     }
2713 }
2714 
2715 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2716                          uint64_t mm, uint32_t desc)
2717 {
2718     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2719     uint64_t *d = vd, *n = vn;
2720     uint8_t *pg = vg;
2721 
2722     for (i = 0; i < opr_sz; i += 1) {
2723         uint64_t nn = n[i];
2724         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2725     }
2726 }
2727 
2728 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2729 {
2730     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2731     uint64_t *d = vd;
2732     uint8_t *pg = vg;
2733 
2734     val = dup_const(MO_8, val);
2735     for (i = 0; i < opr_sz; i += 1) {
2736         d[i] = val & expand_pred_b(pg[H1(i)]);
2737     }
2738 }
2739 
2740 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2741 {
2742     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2743     uint64_t *d = vd;
2744     uint8_t *pg = vg;
2745 
2746     val = dup_const(MO_16, val);
2747     for (i = 0; i < opr_sz; i += 1) {
2748         d[i] = val & expand_pred_h(pg[H1(i)]);
2749     }
2750 }
2751 
2752 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2753 {
2754     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2755     uint64_t *d = vd;
2756     uint8_t *pg = vg;
2757 
2758     val = dup_const(MO_32, val);
2759     for (i = 0; i < opr_sz; i += 1) {
2760         d[i] = val & expand_pred_s(pg[H1(i)]);
2761     }
2762 }
2763 
2764 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2765 {
2766     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2767     uint64_t *d = vd;
2768     uint8_t *pg = vg;
2769 
2770     for (i = 0; i < opr_sz; i += 1) {
2771         d[i] = (pg[H1(i)] & 1 ? val : 0);
2772     }
2773 }
2774 
/*
 * memmove for vector register contents.
 *
 * On big-endian hosts the byte indices within each 8-byte unit must be
 * adjusted, which is done by passing every address through the H1*
 * macro matching the largest unit (4, 2 or 1 bytes) that divides the
 * destination, source and length.  When all three are multiples of 8,
 * and always on little-endian hosts, a plain memmove is used.
 *
 * Overlap is handled like memmove: the copy ascends unless the
 * destination starts inside [vs, vs + n), in which case it descends.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    const uintptr_t dst = (uintptr_t)vd;
    const uintptr_t src = (uintptr_t)vs;
    const bool ascending = dst < src || dst >= src + n;
    uintptr_t low3;
    size_t k;

#if HOST_BIG_ENDIAN
    low3 = (dst | src | n) & 7;
#else
    low3 = 0;
#endif

    switch (low3) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        if (ascending) {
            for (k = 0; k < n; k += 4) {
                *(uint32_t *)H1_4(dst + k) = *(uint32_t *)H1_4(src + k);
            }
        } else {
            k = n;
            while (k > 0) {
                k -= 4;
                *(uint32_t *)H1_4(dst + k) = *(uint32_t *)H1_4(src + k);
            }
        }
        break;

    case 2:
    case 6:
        if (ascending) {
            for (k = 0; k < n; k += 2) {
                *(uint16_t *)H1_2(dst + k) = *(uint16_t *)H1_2(src + k);
            }
        } else {
            k = n;
            while (k > 0) {
                k -= 2;
                *(uint16_t *)H1_2(dst + k) = *(uint16_t *)H1_2(src + k);
            }
        }
        break;

    default:
        if (ascending) {
            for (k = 0; k < n; k++) {
                *(uint8_t *)H1(dst + k) = *(uint8_t *)H1(src + k);
            }
        } else {
            k = n;
            while (k > 0) {
                k -= 1;
                *(uint8_t *)H1(dst + k) = *(uint8_t *)H1(src + k);
            }
        }
        break;
    }
}
2834 
/*
 * memset-to-zero for vector register contents, with the same
 * big-endian address adjustment as swap_memmove: the store width
 * (4, 2 or 1 bytes) is the largest unit dividing both the destination
 * address and the length, and a plain memset is used when both are
 * multiples of 8 or the host is little-endian.
 */
static void swap_memzero(void *vd, size_t n)
{
    const uintptr_t dst = (uintptr_t)vd;
    uintptr_t low3;
    size_t k;

    /* Usually the first bit of a predicate is set, so n is 0. */
    if (likely(n == 0)) {
        return;
    }

#if HOST_BIG_ENDIAN
    low3 = (dst | n) & 7;
#else
    low3 = 0;
#endif

    switch (low3) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        k = 0;
        while (k < n) {
            *(uint32_t *)H1_4(dst + k) = 0;
            k += 4;
        }
        break;

    case 2:
    case 6:
        k = 0;
        while (k < n) {
            *(uint16_t *)H1_2(dst + k) = 0;
            k += 2;
        }
        break;

    default:
        k = 0;
        while (k < n) {
            *(uint8_t *)H1(dst + k) = 0;
            k += 1;
        }
        break;
    }
}
2875 
2876 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2877 {
2878     intptr_t opr_sz = simd_oprsz(desc);
2879     size_t n_ofs = simd_data(desc);
2880     size_t n_siz = opr_sz - n_ofs;
2881 
2882     if (vd != vm) {
2883         swap_memmove(vd, vn + n_ofs, n_siz);
2884         swap_memmove(vd + n_siz, vm, n_ofs);
2885     } else if (vd != vn) {
2886         swap_memmove(vd + n_siz, vd, n_ofs);
2887         swap_memmove(vd, vn + n_ofs, n_siz);
2888     } else {
2889         /* vd == vn == vm.  Need temp space.  */
2890         ARMVectorReg tmp;
2891         swap_memmove(&tmp, vm, n_ofs);
2892         swap_memmove(vd, vd + n_ofs, n_siz);
2893         memcpy(vd + n_siz, &tmp, n_ofs);
2894     }
2895 }
2896 
2897 #define DO_INSR(NAME, TYPE, H) \
2898 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2899 {                                                                  \
2900     intptr_t opr_sz = simd_oprsz(desc);                            \
2901     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2902     *(TYPE *)(vd + H(0)) = val;                                    \
2903 }
2904 
2905 DO_INSR(sve_insr_b, uint8_t, H1)
2906 DO_INSR(sve_insr_h, uint16_t, H1_2)
2907 DO_INSR(sve_insr_s, uint32_t, H1_4)
2908 DO_INSR(sve_insr_d, uint64_t, H1_8)
2909 
2910 #undef DO_INSR
2911 
2912 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2913 {
2914     intptr_t i, j, opr_sz = simd_oprsz(desc);
2915     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2916         uint64_t f = *(uint64_t *)(vn + i);
2917         uint64_t b = *(uint64_t *)(vn + j);
2918         *(uint64_t *)(vd + i) = bswap64(b);
2919         *(uint64_t *)(vd + j) = bswap64(f);
2920     }
2921 }
2922 
2923 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2924 {
2925     intptr_t i, j, opr_sz = simd_oprsz(desc);
2926     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2927         uint64_t f = *(uint64_t *)(vn + i);
2928         uint64_t b = *(uint64_t *)(vn + j);
2929         *(uint64_t *)(vd + i) = hswap64(b);
2930         *(uint64_t *)(vd + j) = hswap64(f);
2931     }
2932 }
2933 
2934 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2935 {
2936     intptr_t i, j, opr_sz = simd_oprsz(desc);
2937     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2938         uint64_t f = *(uint64_t *)(vn + i);
2939         uint64_t b = *(uint64_t *)(vn + j);
2940         *(uint64_t *)(vd + i) = rol64(b, 32);
2941         *(uint64_t *)(vd + j) = rol64(f, 32);
2942     }
2943 }
2944 
2945 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2946 {
2947     intptr_t i, j, opr_sz = simd_oprsz(desc);
2948     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2949         uint64_t f = *(uint64_t *)(vn + i);
2950         uint64_t b = *(uint64_t *)(vn + j);
2951         *(uint64_t *)(vd + i) = b;
2952         *(uint64_t *)(vd + j) = f;
2953     }
2954 }
2955 
2956 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2957 
2958 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2959                            bool is_tbx, tb_impl_fn *fn)
2960 {
2961     ARMVectorReg scratch;
2962     uintptr_t oprsz = simd_oprsz(desc);
2963 
2964     if (unlikely(vd == vn)) {
2965         vn = memcpy(&scratch, vn, oprsz);
2966     }
2967 
2968     fn(vd, vn, NULL, vm, oprsz, is_tbx);
2969 }
2970 
2971 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2972                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2973 {
2974     ARMVectorReg scratch;
2975     uintptr_t oprsz = simd_oprsz(desc);
2976 
2977     if (unlikely(vd == vn0)) {
2978         vn0 = memcpy(&scratch, vn0, oprsz);
2979         if (vd == vn1) {
2980             vn1 = vn0;
2981         }
2982     } else if (unlikely(vd == vn1)) {
2983         vn1 = memcpy(&scratch, vn1, oprsz);
2984     }
2985 
2986     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2987 }
2988 
2989 #define DO_TB(SUFF, TYPE, H)                                            \
2990 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
2991                                 void *vm, uintptr_t oprsz, bool is_tbx) \
2992 {                                                                       \
2993     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
2994     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
2995     for (i = 0; i < nelem; ++i) {                                       \
2996         TYPE index = indexes[H1(i)], val = 0;                           \
2997         if (index < nelem) {                                            \
2998             val = tbl0[H(index)];                                       \
2999         } else {                                                        \
3000             index -= nelem;                                             \
3001             if (tbl1 && index < nelem) {                                \
3002                 val = tbl1[H(index)];                                   \
3003             } else if (is_tbx) {                                        \
3004                 continue;                                               \
3005             }                                                           \
3006         }                                                               \
3007         d[H(i)] = val;                                                  \
3008     }                                                                   \
3009 }                                                                       \
3010 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3011 {                                                                       \
3012     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3013 }                                                                       \
3014 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3015                              void *vm, uint32_t desc)                   \
3016 {                                                                       \
3017     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3018 }                                                                       \
3019 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3020 {                                                                       \
3021     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3022 }
3023 
3024 DO_TB(b, uint8_t, H1)
3025 DO_TB(h, uint16_t, H2)
3026 DO_TB(s, uint32_t, H4)
3027 DO_TB(d, uint64_t, H8)
3028 
3029 #undef DO_TB
3030 
3031 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3032 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3033 {                                                              \
3034     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3035     TYPED *d = vd;                                             \
3036     TYPES *n = vn;                                             \
3037     ARMVectorReg tmp;                                          \
3038     if (unlikely(vn - vd < opr_sz)) {                          \
3039         n = memcpy(&tmp, n, opr_sz / 2);                       \
3040     }                                                          \
3041     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3042         d[HD(i)] = n[HS(i)];                                   \
3043     }                                                          \
3044 }
3045 
3046 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3047 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3048 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3049 
3050 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3051 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3052 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3053 
3054 #undef DO_UNPK
3055 
/*
 * Entry I selects every other group of 2**I bits, starting with the
 * lowest group.  Entries 0-3 are the bits of the even numbered predicate
 * elements of width esz; the pattern is continued to 16-bit groups
 * because expand_bits/compress_bits index the table up to 4.
 */
static const uint64_t even_bit_esz_masks[5] = {
    [0] = 0x5555555555555555ull,    /*  1-bit groups */
    [1] = 0x3333333333333333ull,    /*  2-bit groups */
    [2] = 0x0f0f0f0f0f0f0f0full,    /*  4-bit groups */
    [3] = 0x00ff00ff00ff00ffull,    /*  8-bit groups */
    [4] = 0x0000ffff0000ffffull,    /* 16-bit groups */
};
3067 
3068 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3069  * For N==0, this corresponds to the operation that in qemu/bitops.h
3070  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3071  * section 7-2 Shuffling Bits.
3072  */
3073 static uint64_t expand_bits(uint64_t x, int n)
3074 {
3075     int i;
3076 
3077     x &= 0xffffffffu;
3078     for (i = 4; i >= n; i--) {
3079         int sh = 1 << i;
3080         x = ((x << sh) | x) & even_bit_esz_masks[i];
3081     }
3082     return x;
3083 }
3084 
3085 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3086  * For N==0, this corresponds to the operation that in qemu/bitops.h
3087  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3088  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3089  */
3090 static uint64_t compress_bits(uint64_t x, int n)
3091 {
3092     int i;
3093 
3094     for (i = n; i <= 4; i++) {
3095         int sh = 1 << i;
3096         x &= even_bit_esz_masks[i];
3097         x = (x >> sh) | x;
3098     }
3099     return x & 0xffffffffu;
3100 }
3101 
3102 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3103 {
3104     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3105     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3106     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3107     int esize = 1 << esz;
3108     uint64_t *d = vd;
3109     intptr_t i;
3110 
3111     if (oprsz <= 8) {
3112         uint64_t nn = *(uint64_t *)vn;
3113         uint64_t mm = *(uint64_t *)vm;
3114         int half = 4 * oprsz;
3115 
3116         nn = extract64(nn, high * half, half);
3117         mm = extract64(mm, high * half, half);
3118         nn = expand_bits(nn, esz);
3119         mm = expand_bits(mm, esz);
3120         d[0] = nn | (mm << esize);
3121     } else {
3122         ARMPredicateReg tmp;
3123 
3124         /* We produce output faster than we consume input.
3125            Therefore we must be mindful of possible overlap.  */
3126         if (vd == vn) {
3127             vn = memcpy(&tmp, vn, oprsz);
3128             if (vd == vm) {
3129                 vm = vn;
3130             }
3131         } else if (vd == vm) {
3132             vm = memcpy(&tmp, vm, oprsz);
3133         }
3134         if (high) {
3135             high = oprsz >> 1;
3136         }
3137 
3138         if ((oprsz & 7) == 0) {
3139             uint32_t *n = vn, *m = vm;
3140             high >>= 2;
3141 
3142             for (i = 0; i < oprsz / 8; i++) {
3143                 uint64_t nn = n[H4(high + i)];
3144                 uint64_t mm = m[H4(high + i)];
3145 
3146                 nn = expand_bits(nn, esz);
3147                 mm = expand_bits(mm, esz);
3148                 d[i] = nn | (mm << esize);
3149             }
3150         } else {
3151             uint8_t *n = vn, *m = vm;
3152             uint16_t *d16 = vd;
3153 
3154             for (i = 0; i < oprsz / 2; i++) {
3155                 uint16_t nn = n[H1(high + i)];
3156                 uint16_t mm = m[H1(high + i)];
3157 
3158                 nn = expand_bits(nn, esz);
3159                 mm = expand_bits(mm, esz);
3160                 d16[H2(i)] = nn | (mm << esize);
3161             }
3162         }
3163     }
3164 }
3165 
3166 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3167 {
3168     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3169     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3170     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3171     uint64_t *d = vd, *n = vn, *m = vm;
3172     uint64_t l, h;
3173     intptr_t i;
3174 
3175     if (oprsz <= 8) {
3176         l = compress_bits(n[0] >> odd, esz);
3177         h = compress_bits(m[0] >> odd, esz);
3178         d[0] = l | (h << (4 * oprsz));
3179     } else {
3180         ARMPredicateReg tmp_m;
3181         intptr_t oprsz_16 = oprsz / 16;
3182 
3183         if ((vm - vd) < (uintptr_t)oprsz) {
3184             m = memcpy(&tmp_m, vm, oprsz);
3185         }
3186 
3187         for (i = 0; i < oprsz_16; i++) {
3188             l = n[2 * i + 0];
3189             h = n[2 * i + 1];
3190             l = compress_bits(l >> odd, esz);
3191             h = compress_bits(h >> odd, esz);
3192             d[i] = l | (h << 32);
3193         }
3194 
3195         /*
3196          * For VL which is not a multiple of 512, the results from M do not
3197          * align nicely with the uint64_t for D.  Put the aligned results
3198          * from M into TMP_M and then copy it into place afterward.
3199          */
3200         if (oprsz & 15) {
3201             int final_shift = (oprsz & 15) * 2;
3202 
3203             l = n[2 * i + 0];
3204             h = n[2 * i + 1];
3205             l = compress_bits(l >> odd, esz);
3206             h = compress_bits(h >> odd, esz);
3207             d[i] = l | (h << final_shift);
3208 
3209             for (i = 0; i < oprsz_16; i++) {
3210                 l = m[2 * i + 0];
3211                 h = m[2 * i + 1];
3212                 l = compress_bits(l >> odd, esz);
3213                 h = compress_bits(h >> odd, esz);
3214                 tmp_m.p[i] = l | (h << 32);
3215             }
3216             l = m[2 * i + 0];
3217             h = m[2 * i + 1];
3218             l = compress_bits(l >> odd, esz);
3219             h = compress_bits(h >> odd, esz);
3220             tmp_m.p[i] = l | (h << final_shift);
3221 
3222             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3223         } else {
3224             for (i = 0; i < oprsz_16; i++) {
3225                 l = m[2 * i + 0];
3226                 h = m[2 * i + 1];
3227                 l = compress_bits(l >> odd, esz);
3228                 h = compress_bits(h >> odd, esz);
3229                 d[oprsz_16 + i] = l | (h << 32);
3230             }
3231         }
3232     }
3233 }
3234 
3235 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3236 {
3237     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3238     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3239     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3240     uint64_t *d = vd, *n = vn, *m = vm;
3241     uint64_t mask;
3242     int shr, shl;
3243     intptr_t i;
3244 
3245     shl = 1 << esz;
3246     shr = 0;
3247     mask = even_bit_esz_masks[esz];
3248     if (odd) {
3249         mask <<= shl;
3250         shr = shl;
3251         shl = 0;
3252     }
3253 
3254     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3255         uint64_t nn = (n[i] & mask) >> shr;
3256         uint64_t mm = (m[i] & mask) << shl;
3257         d[i] = nn + mm;
3258     }
3259 }
3260 
3261 /* Reverse units of 2**N bits.  */
3262 static uint64_t reverse_bits_64(uint64_t x, int n)
3263 {
3264     int i, sh;
3265 
3266     x = bswap64(x);
3267     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3268         uint64_t mask = even_bit_esz_masks[i];
3269         x = ((x & mask) << sh) | ((x >> sh) & mask);
3270     }
3271     return x;
3272 }
3273 
/*
 * Reverse the order of the units of 2**N bits within a byte.
 * For N >= 3 the byte is returned unchanged.
 */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int level = 2;
    int sh = 4;

    /* Swap nibbles, then bit pairs, then bits, stopping at level N.  */
    while (level >= n) {
        uint8_t keep = mask[level];

        x = ((x & keep) << sh) | ((x >> sh) & keep);
        level--;
        sh >>= 1;
    }
    return x;
}
3284 
3285 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3286 {
3287     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3288     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3289     intptr_t i, oprsz_2 = oprsz / 2;
3290 
3291     if (oprsz <= 8) {
3292         uint64_t l = *(uint64_t *)vn;
3293         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3294         *(uint64_t *)vd = l;
3295     } else if ((oprsz & 15) == 0) {
3296         for (i = 0; i < oprsz_2; i += 8) {
3297             intptr_t ih = oprsz - 8 - i;
3298             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3299             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3300             *(uint64_t *)(vd + i) = h;
3301             *(uint64_t *)(vd + ih) = l;
3302         }
3303     } else {
3304         for (i = 0; i < oprsz_2; i += 1) {
3305             intptr_t il = H1(i);
3306             intptr_t ih = H1(oprsz - 1 - i);
3307             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3308             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3309             *(uint8_t *)(vd + il) = h;
3310             *(uint8_t *)(vd + ih) = l;
3311         }
3312     }
3313 }
3314 
3315 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3316 {
3317     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3318     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3319     uint64_t *d = vd;
3320     intptr_t i;
3321 
3322     if (oprsz <= 8) {
3323         uint64_t nn = *(uint64_t *)vn;
3324         int half = 4 * oprsz;
3325 
3326         nn = extract64(nn, high * half, half);
3327         nn = expand_bits(nn, 0);
3328         d[0] = nn;
3329     } else {
3330         ARMPredicateReg tmp_n;
3331 
3332         /* We produce output faster than we consume input.
3333            Therefore we must be mindful of possible overlap.  */
3334         if ((vn - vd) < (uintptr_t)oprsz) {
3335             vn = memcpy(&tmp_n, vn, oprsz);
3336         }
3337         if (high) {
3338             high = oprsz >> 1;
3339         }
3340 
3341         if ((oprsz & 7) == 0) {
3342             uint32_t *n = vn;
3343             high >>= 2;
3344 
3345             for (i = 0; i < oprsz / 8; i++) {
3346                 uint64_t nn = n[H4(high + i)];
3347                 d[i] = expand_bits(nn, 0);
3348             }
3349         } else {
3350             uint16_t *d16 = vd;
3351             uint8_t *n = vn;
3352 
3353             for (i = 0; i < oprsz / 2; i++) {
3354                 uint16_t nn = n[H1(high + i)];
3355                 d16[H2(i)] = expand_bits(nn, 0);
3356             }
3357         }
3358     }
3359 }
3360 
3361 #define DO_ZIP(NAME, TYPE, H) \
3362 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3363 {                                                                    \
3364     intptr_t oprsz = simd_oprsz(desc);                               \
3365     intptr_t odd_ofs = simd_data(desc);                              \
3366     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3367     ARMVectorReg tmp_n, tmp_m;                                       \
3368     /* We produce output faster than we consume input.               \
3369        Therefore we must be mindful of possible overlap.  */         \
3370     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3371         vn = memcpy(&tmp_n, vn, oprsz);                              \
3372     }                                                                \
3373     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3374         vm = memcpy(&tmp_m, vm, oprsz);                              \
3375     }                                                                \
3376     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3377         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3378         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3379             *(TYPE *)(vm + odd_ofs + H(i));                          \
3380     }                                                                \
3381     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3382         memset(vd + oprsz - 16, 0, 16);                              \
3383     }                                                                \
3384 }
3385 
3386 DO_ZIP(sve_zip_b, uint8_t, H1)
3387 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3388 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3389 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3390 DO_ZIP(sve2_zip_q, Int128, )
3391 
3392 #define DO_UZP(NAME, TYPE, H) \
3393 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3394 {                                                                      \
3395     intptr_t oprsz = simd_oprsz(desc);                                 \
3396     intptr_t odd_ofs = simd_data(desc);                                \
3397     intptr_t i, p;                                                     \
3398     ARMVectorReg tmp_m;                                                \
3399     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3400         vm = memcpy(&tmp_m, vm, oprsz);                                \
3401     }                                                                  \
3402     i = 0, p = odd_ofs;                                                \
3403     do {                                                               \
3404         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3405         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3406     } while (p < oprsz);                                               \
3407     p -= oprsz;                                                        \
3408     do {                                                               \
3409         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3410         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3411     } while (p < oprsz);                                               \
3412     tcg_debug_assert(i == oprsz);                                      \
3413 }
3414 
3415 DO_UZP(sve_uzp_b, uint8_t, H1)
3416 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3417 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3418 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3419 DO_UZP(sve2_uzp_q, Int128, )
3420 
3421 #define DO_TRN(NAME, TYPE, H) \
3422 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3423 {                                                                      \
3424     intptr_t oprsz = simd_oprsz(desc);                                 \
3425     intptr_t odd_ofs = simd_data(desc);                                \
3426     intptr_t i;                                                        \
3427     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3428         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3429         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3430         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3431         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3432     }                                                                  \
3433     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3434         memset(vd + oprsz - 16, 0, 16);                                \
3435     }                                                                  \
3436 }
3437 
3438 DO_TRN(sve_trn_b, uint8_t, H1)
3439 DO_TRN(sve_trn_h, uint16_t, H1_2)
3440 DO_TRN(sve_trn_s, uint32_t, H1_4)
3441 DO_TRN(sve_trn_d, uint64_t, H1_8)
3442 DO_TRN(sve2_trn_q, Int128, )
3443 
3444 #undef DO_ZIP
3445 #undef DO_UZP
3446 #undef DO_TRN
3447 
3448 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3449 {
3450     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3451     uint32_t *d = vd, *n = vn;
3452     uint8_t *pg = vg;
3453 
3454     for (i = j = 0; i < opr_sz; i++) {
3455         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3456             d[H4(j)] = n[H4(i)];
3457             j++;
3458         }
3459     }
3460     for (; j < opr_sz; j++) {
3461         d[H4(j)] = 0;
3462     }
3463 }
3464 
3465 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3466 {
3467     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3468     uint64_t *d = vd, *n = vn;
3469     uint8_t *pg = vg;
3470 
3471     for (i = j = 0; i < opr_sz; i++) {
3472         if (pg[H1(i)] & 1) {
3473             d[j] = n[i];
3474             j++;
3475         }
3476     }
3477     for (; j < opr_sz; j++) {
3478         d[j] = 0;
3479     }
3480 }
3481 
3482 /* Similar to the ARM LastActiveElement pseudocode function, except the
3483  * result is multiplied by the element size.  This includes the not found
3484  * indication; e.g. not found for esz=3 is -8.
3485  */
3486 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3487 {
3488     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3489     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3490 
3491     return last_active_element(vg, words, esz);
3492 }
3493 
3494 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3495 {
3496     intptr_t opr_sz = simd_oprsz(desc) / 8;
3497     int esz = simd_data(desc);
3498     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3499     intptr_t i, first_i, last_i;
3500     ARMVectorReg tmp;
3501 
3502     first_i = last_i = 0;
3503     first_g = last_g = 0;
3504 
3505     /* Find the extent of the active elements within VG.  */
3506     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3507         pg = *(uint64_t *)(vg + i) & mask;
3508         if (pg) {
3509             if (last_g == 0) {
3510                 last_g = pg;
3511                 last_i = i;
3512             }
3513             first_g = pg;
3514             first_i = i;
3515         }
3516     }
3517 
3518     len = 0;
3519     if (first_g != 0) {
3520         first_i = first_i * 8 + ctz64(first_g);
3521         last_i = last_i * 8 + 63 - clz64(last_g);
3522         len = last_i - first_i + (1 << esz);
3523         if (vd == vm) {
3524             vm = memcpy(&tmp, vm, opr_sz * 8);
3525         }
3526         swap_memmove(vd, vn + first_i, len);
3527     }
3528     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3529 }
3530 
3531 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3532                             void *vg, uint32_t desc)
3533 {
3534     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3535     uint64_t *d = vd, *n = vn, *m = vm;
3536     uint8_t *pg = vg;
3537 
3538     for (i = 0; i < opr_sz; i += 1) {
3539         uint64_t nn = n[i], mm = m[i];
3540         uint64_t pp = expand_pred_b(pg[H1(i)]);
3541         d[i] = (nn & pp) | (mm & ~pp);
3542     }
3543 }
3544 
3545 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3546                             void *vg, uint32_t desc)
3547 {
3548     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3549     uint64_t *d = vd, *n = vn, *m = vm;
3550     uint8_t *pg = vg;
3551 
3552     for (i = 0; i < opr_sz; i += 1) {
3553         uint64_t nn = n[i], mm = m[i];
3554         uint64_t pp = expand_pred_h(pg[H1(i)]);
3555         d[i] = (nn & pp) | (mm & ~pp);
3556     }
3557 }
3558 
3559 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3560                             void *vg, uint32_t desc)
3561 {
3562     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3563     uint64_t *d = vd, *n = vn, *m = vm;
3564     uint8_t *pg = vg;
3565 
3566     for (i = 0; i < opr_sz; i += 1) {
3567         uint64_t nn = n[i], mm = m[i];
3568         uint64_t pp = expand_pred_s(pg[H1(i)]);
3569         d[i] = (nn & pp) | (mm & ~pp);
3570     }
3571 }
3572 
3573 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3574                             void *vg, uint32_t desc)
3575 {
3576     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3577     uint64_t *d = vd, *n = vn, *m = vm;
3578     uint8_t *pg = vg;
3579 
3580     for (i = 0; i < opr_sz; i += 1) {
3581         uint64_t nn = n[i], mm = m[i];
3582         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3583     }
3584 }
3585 
3586 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3587                             void *vg, uint32_t desc)
3588 {
3589     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3590     Int128 *d = vd, *n = vn, *m = vm;
3591     uint16_t *pg = vg;
3592 
3593     for (i = 0; i < opr_sz; i += 1) {
3594         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3595     }
3596 }
3597 
3598 /* Two operand comparison controlled by a predicate.
3599  * ??? It is very tempting to want to be able to expand this inline
3600  * with x86 instructions, e.g.
3601  *
3602  *    vcmpeqw    zm, zn, %ymm0
3603  *    vpmovmskb  %ymm0, %eax
3604  *    and        $0x5555, %eax
3605  *    and        pg, %eax
3606  *
3607  * or even aarch64, e.g.
3608  *
3609  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3610  *    cmeq       v0.8h, zn, zm
3611  *    and        v0.8h, v0.8h, mask
3612  *    addv       h0, v0.8h
3613  *    and        v0.8b, pg
3614  *
3615  * However, coming up with an abstraction that allows vector inputs and
3616  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3617  * scalar outputs, is tricky.
3618  */
3619 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3620 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3621 {                                                                            \
3622     intptr_t opr_sz = simd_oprsz(desc);                                      \
3623     uint32_t flags = PREDTEST_INIT;                                          \
3624     intptr_t i = opr_sz;                                                     \
3625     do {                                                                     \
3626         uint64_t out = 0, pg;                                                \
3627         do {                                                                 \
3628             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3629             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3630             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3631             out |= nn OP mm;                                                 \
3632         } while (i & 63);                                                    \
3633         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3634         out &= pg;                                                           \
3635         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3636         flags = iter_predtest_bwd(out, pg, flags);                           \
3637     } while (i > 0);                                                         \
3638     return flags;                                                            \
3639 }
3640 
3641 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3642     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3643 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3644     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3645 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3646     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3647 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3648     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3649 
3650 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3651 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3652 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3653 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3654 
3655 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3656 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3657 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3658 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3659 
3660 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3661 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3662 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3663 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3664 
3665 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3666 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3667 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3668 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3669 
3670 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3671 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3672 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3673 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3674 
3675 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3676 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3677 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3678 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3679 
3680 #undef DO_CMP_PPZZ_B
3681 #undef DO_CMP_PPZZ_H
3682 #undef DO_CMP_PPZZ_S
3683 #undef DO_CMP_PPZZ_D
3684 #undef DO_CMP_PPZZ
3685 
3686 /* Similar, but the second source is "wide".  */
3687 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3688 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3689 {                                                                            \
3690     intptr_t opr_sz = simd_oprsz(desc);                                      \
3691     uint32_t flags = PREDTEST_INIT;                                          \
3692     intptr_t i = opr_sz;                                                     \
3693     do {                                                                     \
3694         uint64_t out = 0, pg;                                                \
3695         do {                                                                 \
3696             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3697             do {                                                             \
3698                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3699                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3700                 out |= nn OP mm;                                             \
3701             } while (i & 7);                                                 \
3702         } while (i & 63);                                                    \
3703         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3704         out &= pg;                                                           \
3705         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3706         flags = iter_predtest_bwd(out, pg, flags);                           \
3707     } while (i > 0);                                                         \
3708     return flags;                                                            \
3709 }
3710 
/*
 * Per-element-size wrappers around DO_CMP_PPZW.  Each wrapper fixes the
 * H argument (H1, H1_2, H1_4) and the MASK argument: all bits for byte
 * elements, every second bit for halfwords, every fourth bit for words.
 */
#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

/* Equal: signed narrow element type, uint64_t wide type.  */
DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

/* Not equal: same type selection as the equality helpers.  */
DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

/* Signed greater-than.  */
DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)

/* Signed greater-than-or-equal.  */
DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)

/* Unsigned greater-than.  */
DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

/* Unsigned greater-than-or-equal.  */
DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

/* Signed less-than.  */
DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)

/* Signed less-than-or-equal.  */
DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)

/* Unsigned less-than.  */
DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

/* Unsigned less-than-or-equal.  */
DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

/* The generator macros are not needed past this point.  */
#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
3762 
/*
 * Similar, but the second source is immediate.
 *
 * Expands to HELPER(NAME), which compares every TYPE element of VN
 * against the immediate taken from simd_data(desc) using OP, and
 * returns the accumulated predicate-test flags.
 *
 * The vector is walked from byte offset opr_sz down to 0.  Within each
 * inner loop, OUT is shifted left by sizeof(TYPE) before the next
 * comparison result is OR-ed into bit 0, so the result for the element
 * at byte offset K of the chunk ends up in bit K of OUT.  The inner loop
 * stops when I reaches a multiple of 64, i.e. one 64-bit predicate word
 * per 64 bytes of vector data (predicate byte offset is I >> 3).
 *
 * The governing predicate word is masked with MASK, applied to OUT,
 * the result is stored to VD, and the flags are updated through
 * iter_predtest_bwd().
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
{                                                                    \
    intptr_t opr_sz = simd_oprsz(desc);                              \
    uint32_t flags = PREDTEST_INIT;                                  \
    TYPE mm = simd_data(desc);                                       \
    intptr_t i = opr_sz;                                             \
    do {                                                             \
        uint64_t out = 0, pg;                                        \
        do {                                                         \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
            TYPE nn = *(TYPE *)(vn + H(i));                          \
            out |= nn OP mm;                                         \
        } while (i & 63);                                            \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
        out &= pg;                                                   \
        *(uint64_t *)(vd + (i >> 3)) = out;                          \
        flags = iter_predtest_bwd(out, pg, flags);                   \
    } while (i > 0);                                                 \
    return flags;                                                    \
}
3785 
/*
 * Per-element-size wrappers around DO_CMP_PPZI.  Each wrapper fixes the
 * H argument and the MASK argument: one predicate bit per 1, 2, 4 or 8
 * bits for byte, halfword, word and doubleword elements respectively.
 */
#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

/* Equal to immediate.  */
DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

/* Not equal to immediate.  */
DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

/* Signed greater-than immediate.  */
DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

/* Signed greater-than-or-equal immediate.  */
DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

/* Unsigned greater-than immediate.  */
DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

/* Unsigned greater-than-or-equal immediate.  */
DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

/* Signed less-than immediate.  */
DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

/* Signed less-than-or-equal immediate.  */
DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

/* Unsigned less-than immediate.  */
DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

/* Unsigned less-than-or-equal immediate.  */
DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

/* The generator macros are not needed past this point.  */
#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
3850 
/*
 * Similar to the ARM LastActive pseudocode function.
 *
 * Scan the guard predicate VG one 64-bit word at a time, starting with
 * the word at the highest offset below QEMU_ALIGN_UP(oprsz, 8).  For the
 * first non-zero guard word found, return whether the matching word of
 * VD intersects pow2floor(guard) (presumably the highest set guard bit).
 * Returns false when every guard word is zero.
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t off = QEMU_ALIGN_UP(oprsz, 8);

    while ((off -= 8) >= 0) {
        uint64_t guard = *(uint64_t *)(vg + off);

        if (guard != 0) {
            uint64_t data = *(uint64_t *)(vd + off);

            return (data & pow2floor(guard)) != 0;
        }
    }
    return false;
}
3864 
/*
 * Compute a mask into RETB that is true for all G, up to and including
 * (if AFTER) or excluding (if !AFTER) the first bit set in both G and N.
 *
 * BRK carries the "break already found" state from earlier words: when
 * it is set on entry the mask is zero.  When no bit of N is set under G,
 * the mask is G itself and no break is reported.
 *
 * Returns true if a break has been found, in this word or before.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t hits = g & n;      /* guard true, pred true */
    uint64_t first, below;

    if (brk) {
        /* Break was located in an earlier word.  */
        *retb = 0;
        return true;
    }
    if (hits == 0) {
        /* For all G, no N are set; break not found.  */
        *retb = g;
        return false;
    }

    first = hits & -hits;       /* lowest such bit */
    below = first - 1;          /* every bit beneath it */
    *retb = after ? (first | below) : below;
    return true;
}
3894 
/*
 * Compute a zeroing BRK.
 *
 * For each 64-bit word covering OPRSZ bytes (rounded up), store the
 * break mask from compute_brk() restricted to the guard word G into D.
 * Bits outside the guard are written as zero.
 */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    const intptr_t words = DIV_ROUND_UP(oprsz, 8);
    bool found = false;
    intptr_t w;

    for (w = 0; w < words; w++) {
        uint64_t guard = g[w];
        uint64_t mask;

        found = compute_brk(&mask, n[w], guard, found, after);
        d[w] = mask & guard;
    }
}
3909 
3910 /* Likewise, but also compute flags.  */
3911 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3912                                intptr_t oprsz, bool after)
3913 {
3914     uint32_t flags = PREDTEST_INIT;
3915     bool brk = false;
3916     intptr_t i;
3917 
3918     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3919         uint64_t this_b, this_d, this_g = g[i];
3920 
3921         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3922         d[i] = this_d = this_b & this_g;
3923         flags = iter_predtest_fwd(this_d, this_g, flags);
3924     }
3925     return flags;
3926 }
3927 
/*
 * Compute a merging BRK.
 *
 * For each 64-bit word covering OPRSZ bytes (rounded up), bits selected
 * by the guard word G receive the break mask from compute_brk(); bits
 * not selected by G keep their previous value in D.
 */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    const intptr_t words = DIV_ROUND_UP(oprsz, 8);
    bool found = false;
    intptr_t w;

    for (w = 0; w < words; w++) {
        uint64_t guard = g[w];
        uint64_t mask, kept;

        found = compute_brk(&mask, n[w], guard, found, after);
        kept = d[w] & ~guard;
        d[w] = (mask & guard) | kept;
    }
}
3942 
3943 /* Likewise, but also compute flags.  */
3944 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3945                                intptr_t oprsz, bool after)
3946 {
3947     uint32_t flags = PREDTEST_INIT;
3948     bool brk = false;
3949     intptr_t i;
3950 
3951     for (i = 0; i < oprsz / 8; ++i) {
3952         uint64_t this_b, this_d = d[i], this_g = g[i];
3953 
3954         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3955         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3956         flags = iter_predtest_fwd(this_d, this_g, flags);
3957     }
3958     return flags;
3959 }
3960 
3961 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3962 {
3963     /* It is quicker to zero the whole predicate than loop on OPRSZ.
3964      * The compiler should turn this into 4 64-bit integer stores.
3965      */
3966     memset(d, 0, sizeof(ARMPredicateReg));
3967     return PREDTEST_INIT;
3968 }
3969 
3970 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3971                        uint32_t pred_desc)
3972 {
3973     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3974     if (last_active_pred(vn, vg, oprsz)) {
3975         compute_brk_z(vd, vm, vg, oprsz, true);
3976     } else {
3977         do_zero(vd, oprsz);
3978     }
3979 }
3980 
3981 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3982                             uint32_t pred_desc)
3983 {
3984     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3985     if (last_active_pred(vn, vg, oprsz)) {
3986         return compute_brks_z(vd, vm, vg, oprsz, true);
3987     } else {
3988         return do_zero(vd, oprsz);
3989     }
3990 }
3991 
3992 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3993                        uint32_t pred_desc)
3994 {
3995     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3996     if (last_active_pred(vn, vg, oprsz)) {
3997         compute_brk_z(vd, vm, vg, oprsz, false);
3998     } else {
3999         do_zero(vd, oprsz);
4000     }
4001 }
4002 
4003 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4004                             uint32_t pred_desc)
4005 {
4006     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4007     if (last_active_pred(vn, vg, oprsz)) {
4008         return compute_brks_z(vd, vm, vg, oprsz, false);
4009     } else {
4010         return do_zero(vd, oprsz);
4011     }
4012 }
4013 
4014 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4015 {
4016     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4017     compute_brk_z(vd, vn, vg, oprsz, true);
4018 }
4019 
4020 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4021 {
4022     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4023     return compute_brks_z(vd, vn, vg, oprsz, true);
4024 }
4025 
4026 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4027 {
4028     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4029     compute_brk_z(vd, vn, vg, oprsz, false);
4030 }
4031 
4032 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4033 {
4034     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4035     return compute_brks_z(vd, vn, vg, oprsz, false);
4036 }
4037 
4038 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4039 {
4040     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4041     compute_brk_m(vd, vn, vg, oprsz, true);
4042 }
4043 
4044 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4045 {
4046     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4047     return compute_brks_m(vd, vn, vg, oprsz, true);
4048 }
4049 
4050 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4051 {
4052     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4053     compute_brk_m(vd, vn, vg, oprsz, false);
4054 }
4055 
4056 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4057 {
4058     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4059     return compute_brks_m(vd, vn, vg, oprsz, false);
4060 }
4061 
4062 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4063 {
4064     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4065     if (!last_active_pred(vn, vg, oprsz)) {
4066         do_zero(vd, oprsz);
4067     }
4068 }
4069 
4070 /* As if PredTest(Ones(PL), D, esz).  */
4071 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4072                               uint64_t esz_mask)
4073 {
4074     uint32_t flags = PREDTEST_INIT;
4075     intptr_t i;
4076 
4077     for (i = 0; i < oprsz / 8; i++) {
4078         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4079     }
4080     if (oprsz & 7) {
4081         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4082         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4083     }
4084     return flags;
4085 }
4086 
4087 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4088 {
4089     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4090     if (last_active_pred(vn, vg, oprsz)) {
4091         return predtest_ones(vd, oprsz, -1);
4092     } else {
4093         return do_zero(vd, oprsz);
4094     }
4095 }
4096 
4097 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4098 {
4099     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4100     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4101     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4102     intptr_t i;
4103 
4104     for (i = 0; i < words; ++i) {
4105         uint64_t t = n[i] & g[i] & mask;
4106         sum += ctpop64(t);
4107     }
4108     return sum;
4109 }
4110 
4111 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4112 {
4113     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4114     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4115     uint64_t esz_mask = pred_esz_masks[esz];
4116     ARMPredicateReg *d = vd;
4117     uint32_t flags;
4118     intptr_t i;
4119 
4120     /* Begin with a zero predicate register.  */
4121     flags = do_zero(d, oprsz);
4122     if (count == 0) {
4123         return flags;
4124     }
4125 
4126     /* Set all of the requested bits.  */
4127     for (i = 0; i < count / 64; ++i) {
4128         d->p[i] = esz_mask;
4129     }
4130     if (count & 63) {
4131         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4132     }
4133 
4134     return predtest_ones(d, oprsz, esz_mask);
4135 }
4136 
4137 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4138 {
4139     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4140     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4141     uint64_t esz_mask = pred_esz_masks[esz];
4142     ARMPredicateReg *d = vd;
4143     intptr_t i, invcount, oprbits;
4144     uint64_t bits;
4145 
4146     if (count == 0) {
4147         return do_zero(d, oprsz);
4148     }
4149 
4150     oprbits = oprsz * 8;
4151     tcg_debug_assert(count <= oprbits);
4152 
4153     bits = esz_mask;
4154     if (oprbits & 63) {
4155         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4156     }
4157 
4158     invcount = oprbits - count;
4159     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4160         d->p[i] = bits;
4161         bits = esz_mask;
4162     }
4163 
4164     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4165 
4166     while (--i >= 0) {
4167         d->p[i] = 0;
4168     }
4169 
4170     return predtest_ones(d, oprsz, esz_mask);
4171 }
4172 
/* Recursive reduction on a function;
 * C.f. the ARM ARM function ReducePredicated.
 *
 * While it would be possible to write this without the DATA temporary,
 * it is much simpler to process the predicate register this way.
 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
 * little to gain with a more complex non-recursive form.
 *
 * The macro expands to two functions:
 *
 * NAME##_reduce(data, status, n) splits the N elements at DATA into two
 * halves of n / 2 elements, reduces each half recursively, and combines
 * the two results with TYPE##_##FUNC.  A single element is returned
 * as-is.  NOTE(review): both halves use n / 2, so N is presumably
 * always a power of two -- confirm against the callers' MAXSZ.
 *
 * HELPER(NAME)(vn, vg, vs, desc) copies the elements of VN into the
 * DATA temporary, substituting IDENT for every element whose predicate
 * bit in VG is clear.  One 16-bit predicate chunk is loaded for each
 * 16 bytes of vector data.  Elements from OPRSZ (simd_oprsz) up to
 * MAXSZ (simd_data) are filled with IDENT, and the reduction is then
 * run over MAXSZ / sizeof(TYPE) elements with VS as the float_status.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                     \
    if (n == 1) {                                                     \
        return *data;                                                 \
    } else {                                                          \
        uintptr_t half = n / 2;                                       \
        TYPE lo = NAME##_reduce(data, status, half);                  \
        TYPE hi = NAME##_reduce(data + half, status, half);           \
        return TYPE##_##FUNC(lo, hi, status);                         \
    }                                                                 \
}                                                                     \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
{                                                                     \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
    for (i = 0; i < oprsz; ) {                                        \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
        do {                                                          \
            TYPE nn = *(TYPE *)(vn + H(i));                           \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
        } while (i & 15);                                             \
    }                                                                 \
    for (; i < maxsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)((void *)data + i) = IDENT;                          \
    }                                                                 \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
}
4210 
/* Add reduction: inactive elements are replaced by floatN_zero.  */
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)

/* Identity is floatN_default_nan, without the function call.  */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)

/* Same identity constants as the minnum reductions above.  */
DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)

/* Min reduction: inactive elements are replaced by floatN_infinity.  */
DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)

/* Max reduction: inactive elements are the sign-changed infinity.  */
DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))

#undef DO_REDUCE
4233 
4234 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4235                              void *status, uint32_t desc)
4236 {
4237     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4238     float16 result = nn;
4239 
4240     do {
4241         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4242         do {
4243             if (pg & 1) {
4244                 float16 mm = *(float16 *)(vm + H1_2(i));
4245                 result = float16_add(result, mm, status);
4246             }
4247             i += sizeof(float16), pg >>= sizeof(float16);
4248         } while (i & 15);
4249     } while (i < opr_sz);
4250 
4251     return result;
4252 }
4253 
4254 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4255                              void *status, uint32_t desc)
4256 {
4257     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4258     float32 result = nn;
4259 
4260     do {
4261         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4262         do {
4263             if (pg & 1) {
4264                 float32 mm = *(float32 *)(vm + H1_2(i));
4265                 result = float32_add(result, mm, status);
4266             }
4267             i += sizeof(float32), pg >>= sizeof(float32);
4268         } while (i & 15);
4269     } while (i < opr_sz);
4270 
4271     return result;
4272 }
4273 
4274 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4275                              void *status, uint32_t desc)
4276 {
4277     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4278     uint64_t *m = vm;
4279     uint8_t *pg = vg;
4280 
4281     for (i = 0; i < opr_sz; i++) {
4282         if (pg[H1(i)] & 1) {
4283             nn = float64_add(nn, m[i], status);
4284         }
4285     }
4286 
4287     return nn;
4288 }
4289 
/* Fully general three-operand expander, controlled by a predicate,
 * With the extra float_status parameter.
 *
 * Expands to HELPER(NAME), which computes OP(nn, mm, status) for each
 * active TYPE element of VN and VM and stores the result into VD.
 * Elements whose predicate bit is clear are not written.
 *
 * The vector is walked from byte offset simd_oprsz(desc) down to 0.
 * One 64-bit predicate word is loaded per 64 bytes of vector data, and
 * bit (I & 63) of that word governs the element at byte offset I.
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}
4311 
/* Predicated vector-vector addition.  */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)

/* Predicated vector-vector subtraction.  */
DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)

/* Predicated vector-vector multiplication.  */
DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)

/* Predicated vector-vector division.  */
DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)

/* Predicated minimum, using floatN_min.  */
DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)

/* Predicated maximum, using floatN_max.  */
DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)

/* Predicated minimum, using floatN_minnum.  */
DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)

/* Predicated maximum, using floatN_maxnum.  */
DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4343 
4344 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4345 {
4346     return float16_abs(float16_sub(a, b, s));
4347 }
4348 
4349 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4350 {
4351     return float32_abs(float32_sub(a, b, s));
4352 }
4353 
4354 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4355 {
4356     return float64_abs(float64_sub(a, b, s));
4357 }
4358 
/* Predicated absolute difference, built on the abd_* helpers.  */
DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4362 
4363 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4364 {
4365     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4366     return float64_scalbn(a, b_int, s);
4367 }
4368 
/*
 * Predicated scalbn.  The element type is a signed integer here, so the
 * second operand reaches the OP as a signed value; the 64-bit form goes
 * through scalbn_d to clamp it to the int range.
 */
DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)

/* Predicated mulx, using the existing AdvSIMD/VFP mulx helpers.  */
DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)

#undef DO_ZPZZ_FP
4378 
/* Three-operand expander, with one scalar operand, controlled by
 * a predicate, with the extra float_status parameter.
 *
 * Expands to HELPER(NAME), which computes OP(nn, mm, status) for each
 * active TYPE element NN of VN, where MM is SCALAR converted to TYPE,
 * and stores the result into VD.  Elements whose predicate bit is
 * clear are not written.
 *
 * The vector is walked from byte offset simd_oprsz(desc) down to 0.
 * One 64-bit predicate word is loaded per 64 bytes of vector data, and
 * bit (I & 63) of that word governs the element at byte offset I.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                    \
{                                                                 \
    intptr_t i = simd_oprsz(desc);                                \
    uint64_t *g = vg;                                             \
    TYPE mm = scalar;                                             \
    do {                                                          \
        uint64_t pg = g[(i - 1) >> 6];                            \
        do {                                                      \
            i -= sizeof(TYPE);                                    \
            if (likely((pg >> (i & 63)) & 1)) {                   \
                TYPE nn = *(TYPE *)(vn + H(i));                   \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
            }                                                     \
        } while (i & 63);                                         \
    } while (i != 0);                                             \
}
4400 
/* Predicated vector + scalar.  */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)

/* Predicated vector - scalar.  */
DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)

/* Predicated vector * scalar.  */
DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4412 
4413 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4414 {
4415     return float16_sub(b, a, s);
4416 }
4417 
4418 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4419 {
4420     return float32_sub(b, a, s);
4421 }
4422 
4423 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4424 {
4425     return float64_sub(b, a, s);
4426 }
4427 
/* Predicated scalar - vector, via the subr_* helpers.  */
DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)

/* Predicated maximum against a scalar, using floatN_maxnum.  */
DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)

/* Predicated minimum against a scalar, using floatN_minnum.  */
DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)

/* Predicated maximum against a scalar, using floatN_max.  */
DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)

/* Predicated minimum against a scalar, using floatN_min.  */
DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4447 
/* Fully general two-operand expander, controlled by a predicate,
 * With the extra float_status parameter.
 *
 * Expands to HELPER(NAME), which computes OP(nn, status) for each
 * active TYPE element NN of VN and stores the result into VD.
 * Elements whose predicate bit is clear are not written.
 *
 * The vector is walked from byte offset simd_oprsz(desc) down to 0.
 * One 64-bit predicate word is loaded per 64 bytes of vector data, and
 * bit (I & 63) of that word governs the element at byte offset I.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                     \
    intptr_t i = simd_oprsz(desc);                                    \
    uint64_t *g = vg;                                                 \
    do {                                                              \
        uint64_t pg = g[(i - 1) >> 6];                                \
        do {                                                          \
            i -= sizeof(TYPE);                                        \
            if (likely((pg >> (i & 63)) & 1)) {                       \
                TYPE nn = *(TYPE *)(vn + H(i));                       \
                *(TYPE *)(vd + H(i)) = OP(nn, status);                \
            }                                                         \
        } while (i & 63);                                             \
    } while (i != 0);                                                 \
}
4467 
4468 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4469  * FZ16.  When converting from fp16, this affects flushing input denormals;
4470  * when converting to fp16, this affects flushing output denormals.
4471  */
4472 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4473 {
4474     bool save = get_flush_inputs_to_zero(fpst);
4475     float32 ret;
4476 
4477     set_flush_inputs_to_zero(false, fpst);
4478     ret = float16_to_float32(f, true, fpst);
4479     set_flush_inputs_to_zero(save, fpst);
4480     return ret;
4481 }
4482 
4483 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4484 {
4485     bool save = get_flush_inputs_to_zero(fpst);
4486     float64 ret;
4487 
4488     set_flush_inputs_to_zero(false, fpst);
4489     ret = float16_to_float64(f, true, fpst);
4490     set_flush_inputs_to_zero(save, fpst);
4491     return ret;
4492 }
4493 
4494 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4495 {
4496     bool save = get_flush_to_zero(fpst);
4497     float16 ret;
4498 
4499     set_flush_to_zero(false, fpst);
4500     ret = float32_to_float16(f, true, fpst);
4501     set_flush_to_zero(save, fpst);
4502     return ret;
4503 }
4504 
4505 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4506 {
4507     bool save = get_flush_to_zero(fpst);
4508     float16 ret;
4509 
4510     set_flush_to_zero(false, fpst);
4511     ret = float64_to_float16(f, true, fpst);
4512     set_flush_to_zero(save, fpst);
4513     return ret;
4514 }
4515 
4516 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4517 {
4518     if (float16_is_any_nan(f)) {
4519         float_raise(float_flag_invalid, s);
4520         return 0;
4521     }
4522     return float16_to_int16_round_to_zero(f, s);
4523 }
4524 
4525 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4526 {
4527     if (float16_is_any_nan(f)) {
4528         float_raise(float_flag_invalid, s);
4529         return 0;
4530     }
4531     return float16_to_int64_round_to_zero(f, s);
4532 }
4533 
4534 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4535 {
4536     if (float32_is_any_nan(f)) {
4537         float_raise(float_flag_invalid, s);
4538         return 0;
4539     }
4540     return float32_to_int64_round_to_zero(f, s);
4541 }
4542 
4543 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4544 {
4545     if (float64_is_any_nan(f)) {
4546         float_raise(float_flag_invalid, s);
4547         return 0;
4548     }
4549     return float64_to_int64_round_to_zero(f, s);
4550 }
4551 
4552 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4553 {
4554     if (float16_is_any_nan(f)) {
4555         float_raise(float_flag_invalid, s);
4556         return 0;
4557     }
4558     return float16_to_uint16_round_to_zero(f, s);
4559 }
4560 
4561 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4562 {
4563     if (float16_is_any_nan(f)) {
4564         float_raise(float_flag_invalid, s);
4565         return 0;
4566     }
4567     return float16_to_uint64_round_to_zero(f, s);
4568 }
4569 
4570 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4571 {
4572     if (float32_is_any_nan(f)) {
4573         float_raise(float_flag_invalid, s);
4574         return 0;
4575     }
4576     return float32_to_uint64_round_to_zero(f, s);
4577 }
4578 
4579 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4580 {
4581     if (float64_is_any_nan(f)) {
4582         float_raise(float_flag_invalid, s);
4583         return 0;
4584     }
4585     return float64_to_uint64_round_to_zero(f, s);
4586 }
4587 
4588 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4589 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4590 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4591 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4592 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4593 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4594 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4595 
4596 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4597 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4598 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4599 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4600 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4601 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4602 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4603 
4604 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4605 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4606 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4607 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4608 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4609 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4610 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4611 
4612 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4613 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4614 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4615 
4616 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4617 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4618 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4619 
4620 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4621 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4622 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4623 
4624 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4625 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4626 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4627 
4628 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4629 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4630 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4631 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4632 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4633 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4634 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4635 
4636 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4637 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4638 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4639 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4640 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4641 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4642 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4643 
4644 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4645 {
4646     /* Extract frac to the top of the uint32_t. */
4647     uint32_t frac = (uint32_t)a << (16 + 6);
4648     int16_t exp = extract32(a, 10, 5);
4649 
4650     if (unlikely(exp == 0)) {
4651         if (frac != 0) {
4652             if (!get_flush_inputs_to_zero(s)) {
4653                 /* denormal: bias - fractional_zeros */
4654                 return -15 - clz32(frac);
4655             }
4656             /* flush to zero */
4657             float_raise(float_flag_input_denormal, s);
4658         }
4659     } else if (unlikely(exp == 0x1f)) {
4660         if (frac == 0) {
4661             return INT16_MAX; /* infinity */
4662         }
4663     } else {
4664         /* normal: exp - bias */
4665         return exp - 15;
4666     }
4667     /* nan or zero */
4668     float_raise(float_flag_invalid, s);
4669     return INT16_MIN;
4670 }
4671 
4672 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4673 {
4674     /* Extract frac to the top of the uint32_t. */
4675     uint32_t frac = a << 9;
4676     int32_t exp = extract32(a, 23, 8);
4677 
4678     if (unlikely(exp == 0)) {
4679         if (frac != 0) {
4680             if (!get_flush_inputs_to_zero(s)) {
4681                 /* denormal: bias - fractional_zeros */
4682                 return -127 - clz32(frac);
4683             }
4684             /* flush to zero */
4685             float_raise(float_flag_input_denormal, s);
4686         }
4687     } else if (unlikely(exp == 0xff)) {
4688         if (frac == 0) {
4689             return INT32_MAX; /* infinity */
4690         }
4691     } else {
4692         /* normal: exp - bias */
4693         return exp - 127;
4694     }
4695     /* nan or zero */
4696     float_raise(float_flag_invalid, s);
4697     return INT32_MIN;
4698 }
4699 
4700 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4701 {
4702     /* Extract frac to the top of the uint64_t. */
4703     uint64_t frac = a << 12;
4704     int64_t exp = extract64(a, 52, 11);
4705 
4706     if (unlikely(exp == 0)) {
4707         if (frac != 0) {
4708             if (!get_flush_inputs_to_zero(s)) {
4709                 /* denormal: bias - fractional_zeros */
4710                 return -1023 - clz64(frac);
4711             }
4712             /* flush to zero */
4713             float_raise(float_flag_input_denormal, s);
4714         }
4715     } else if (unlikely(exp == 0x7ff)) {
4716         if (frac == 0) {
4717             return INT64_MAX; /* infinity */
4718         }
4719     } else {
4720         /* normal: exp - bias */
4721         return exp - 1023;
4722     }
4723     /* nan or zero */
4724     float_raise(float_flag_invalid, s);
4725     return INT64_MIN;
4726 }
4727 
4728 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4729 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4730 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4731 
4732 #undef DO_ZPZ_FP
4733 
4734 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4735                             float_status *status, uint32_t desc,
4736                             uint16_t neg1, uint16_t neg3)
4737 {
4738     intptr_t i = simd_oprsz(desc);
4739     uint64_t *g = vg;
4740 
4741     do {
4742         uint64_t pg = g[(i - 1) >> 6];
4743         do {
4744             i -= 2;
4745             if (likely((pg >> (i & 63)) & 1)) {
4746                 float16 e1, e2, e3, r;
4747 
4748                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4749                 e2 = *(uint16_t *)(vm + H1_2(i));
4750                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4751                 r = float16_muladd(e1, e2, e3, 0, status);
4752                 *(uint16_t *)(vd + H1_2(i)) = r;
4753             }
4754         } while (i & 63);
4755     } while (i != 0);
4756 }
4757 
4758 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4759                               void *vg, void *status, uint32_t desc)
4760 {
4761     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4762 }
4763 
4764 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4765                               void *vg, void *status, uint32_t desc)
4766 {
4767     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4768 }
4769 
4770 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4771                                void *vg, void *status, uint32_t desc)
4772 {
4773     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4774 }
4775 
4776 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4777                                void *vg, void *status, uint32_t desc)
4778 {
4779     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4780 }
4781 
4782 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4783                             float_status *status, uint32_t desc,
4784                             uint32_t neg1, uint32_t neg3)
4785 {
4786     intptr_t i = simd_oprsz(desc);
4787     uint64_t *g = vg;
4788 
4789     do {
4790         uint64_t pg = g[(i - 1) >> 6];
4791         do {
4792             i -= 4;
4793             if (likely((pg >> (i & 63)) & 1)) {
4794                 float32 e1, e2, e3, r;
4795 
4796                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4797                 e2 = *(uint32_t *)(vm + H1_4(i));
4798                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4799                 r = float32_muladd(e1, e2, e3, 0, status);
4800                 *(uint32_t *)(vd + H1_4(i)) = r;
4801             }
4802         } while (i & 63);
4803     } while (i != 0);
4804 }
4805 
4806 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4807                               void *vg, void *status, uint32_t desc)
4808 {
4809     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4810 }
4811 
4812 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4813                               void *vg, void *status, uint32_t desc)
4814 {
4815     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4816 }
4817 
4818 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4819                                void *vg, void *status, uint32_t desc)
4820 {
4821     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4822 }
4823 
4824 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4825                                void *vg, void *status, uint32_t desc)
4826 {
4827     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4828 }
4829 
4830 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4831                             float_status *status, uint32_t desc,
4832                             uint64_t neg1, uint64_t neg3)
4833 {
4834     intptr_t i = simd_oprsz(desc);
4835     uint64_t *g = vg;
4836 
4837     do {
4838         uint64_t pg = g[(i - 1) >> 6];
4839         do {
4840             i -= 8;
4841             if (likely((pg >> (i & 63)) & 1)) {
4842                 float64 e1, e2, e3, r;
4843 
4844                 e1 = *(uint64_t *)(vn + i) ^ neg1;
4845                 e2 = *(uint64_t *)(vm + i);
4846                 e3 = *(uint64_t *)(va + i) ^ neg3;
4847                 r = float64_muladd(e1, e2, e3, 0, status);
4848                 *(uint64_t *)(vd + i) = r;
4849             }
4850         } while (i & 63);
4851     } while (i != 0);
4852 }
4853 
4854 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4855                               void *vg, void *status, uint32_t desc)
4856 {
4857     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4858 }
4859 
4860 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4861                               void *vg, void *status, uint32_t desc)
4862 {
4863     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4864 }
4865 
4866 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4867                                void *vg, void *status, uint32_t desc)
4868 {
4869     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4870 }
4871 
4872 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4873                                void *vg, void *status, uint32_t desc)
4874 {
4875     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4876 }
4877 
/*
 * Two-operand floating-point comparison governed by a predicate,
 * producing a predicate result.
 *
 * Unlike the integer comparisons, the operands may only be compared
 * for active elements: a floating-point compare can have side effects
 * on the FPSR, so comparing inactive elements speculatively is not
 * permitted.
 *
 * The vector is walked from the top down.  For each 64-bit predicate
 * word the result is accumulated by shifting left one element width
 * per step and ORing in the comparison outcome; inactive elements
 * therefore contribute zero.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t off = simd_oprsz(desc);                                    \
    intptr_t word = (off - 1) >> 6;                                     \
    uint64_t *res = vd;                                                 \
    uint64_t *gov = vg;                                                 \
                                                                        \
    do {                                                                \
        uint64_t acc = 0;                                               \
        uint64_t bits = gov[word];                                      \
                                                                        \
        do {                                                            \
            off -= sizeof(TYPE);                                        \
            acc <<= sizeof(TYPE);                                       \
            if (likely((bits >> (off & 63)) & 1)) {                     \
                TYPE lhs = *(TYPE *)(vn + H(off));                      \
                TYPE rhs = *(TYPE *)(vm + H(off));                      \
                acc |= OP(TYPE, lhs, rhs, status);                      \
            }                                                           \
        } while (off & 63);                                             \
        res[word--] = acc;                                              \
    } while (off > 0);                                                  \
}

/* Per-element-size instantiation shorthands. */
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)

/* Instantiate one comparison for all three element sizes. */
#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)
4914 
4915 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
4916 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
4917 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
4918 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
4919 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
4920 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
4921 #define DO_FCMUO(TYPE, X, Y, ST)  \
4922     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4923 #define DO_FACGE(TYPE, X, Y, ST)  \
4924     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4925 #define DO_FACGT(TYPE, X, Y, ST)  \
4926     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4927 
4928 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4929 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4930 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4931 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4932 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4933 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4934 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4935 
4936 #undef DO_FPCMP_PPZZ_ALL
4937 #undef DO_FPCMP_PPZZ_D
4938 #undef DO_FPCMP_PPZZ_S
4939 #undef DO_FPCMP_PPZZ_H
4940 #undef DO_FPCMP_PPZZ
4941 
4942 /* One operand floating-point comparison against zero, controlled
4943  * by a predicate.
4944  */
4945 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
4946 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
4947                   void *status, uint32_t desc)             \
4948 {                                                          \
4949     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
4950     uint64_t *d = vd, *g = vg;                             \
4951     do {                                                   \
4952         uint64_t out = 0, pg = g[j];                       \
4953         do {                                               \
4954             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
4955             if ((pg >> (i & 63)) & 1) {                    \
4956                 TYPE nn = *(TYPE *)(vn + H(i));            \
4957                 out |= OP(TYPE, nn, 0, status);            \
4958             }                                              \
4959         } while (i & 63);                                  \
4960         d[j--] = out;                                      \
4961     } while (i > 0);                                       \
4962 }
4963 
4964 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4965     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4966 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4967     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4968 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4969     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4970 
4971 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4972     DO_FPCMP_PPZ0_H(NAME, OP)   \
4973     DO_FPCMP_PPZ0_S(NAME, OP)   \
4974     DO_FPCMP_PPZ0_D(NAME, OP)
4975 
4976 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4977 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4978 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4979 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4980 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4981 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4982 
4983 /* FP Trig Multiply-Add. */
4984 
4985 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4986 {
4987     static const float16 coeff[16] = {
4988         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4989         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4990     };
4991     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4992     intptr_t x = simd_data(desc);
4993     float16 *d = vd, *n = vn, *m = vm;
4994     for (i = 0; i < opr_sz; i++) {
4995         float16 mm = m[i];
4996         intptr_t xx = x;
4997         if (float16_is_neg(mm)) {
4998             mm = float16_abs(mm);
4999             xx += 8;
5000         }
5001         d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5002     }
5003 }
5004 
5005 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5006 {
5007     static const float32 coeff[16] = {
5008         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5009         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5010         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5011         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5012     };
5013     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5014     intptr_t x = simd_data(desc);
5015     float32 *d = vd, *n = vn, *m = vm;
5016     for (i = 0; i < opr_sz; i++) {
5017         float32 mm = m[i];
5018         intptr_t xx = x;
5019         if (float32_is_neg(mm)) {
5020             mm = float32_abs(mm);
5021             xx += 8;
5022         }
5023         d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5024     }
5025 }
5026 
5027 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5028 {
5029     static const float64 coeff[16] = {
5030         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5031         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5032         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5033         0x3de5d8408868552full, 0x0000000000000000ull,
5034         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5035         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5036         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5037         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5038     };
5039     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5040     intptr_t x = simd_data(desc);
5041     float64 *d = vd, *n = vn, *m = vm;
5042     for (i = 0; i < opr_sz; i++) {
5043         float64 mm = m[i];
5044         intptr_t xx = x;
5045         if (float64_is_neg(mm)) {
5046             mm = float64_abs(mm);
5047             xx += 8;
5048         }
5049         d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5050     }
5051 }
5052 
5053 /*
5054  * FP Complex Add
5055  */
5056 
5057 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5058                          void *vs, uint32_t desc)
5059 {
5060     intptr_t j, i = simd_oprsz(desc);
5061     uint64_t *g = vg;
5062     float16 neg_imag = float16_set_sign(0, simd_data(desc));
5063     float16 neg_real = float16_chs(neg_imag);
5064 
5065     do {
5066         uint64_t pg = g[(i - 1) >> 6];
5067         do {
5068             float16 e0, e1, e2, e3;
5069 
5070             /* I holds the real index; J holds the imag index.  */
5071             j = i - sizeof(float16);
5072             i -= 2 * sizeof(float16);
5073 
5074             e0 = *(float16 *)(vn + H1_2(i));
5075             e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5076             e2 = *(float16 *)(vn + H1_2(j));
5077             e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5078 
5079             if (likely((pg >> (i & 63)) & 1)) {
5080                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5081             }
5082             if (likely((pg >> (j & 63)) & 1)) {
5083                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5084             }
5085         } while (i & 63);
5086     } while (i != 0);
5087 }
5088 
5089 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5090                          void *vs, uint32_t desc)
5091 {
5092     intptr_t j, i = simd_oprsz(desc);
5093     uint64_t *g = vg;
5094     float32 neg_imag = float32_set_sign(0, simd_data(desc));
5095     float32 neg_real = float32_chs(neg_imag);
5096 
5097     do {
5098         uint64_t pg = g[(i - 1) >> 6];
5099         do {
5100             float32 e0, e1, e2, e3;
5101 
5102             /* I holds the real index; J holds the imag index.  */
5103             j = i - sizeof(float32);
5104             i -= 2 * sizeof(float32);
5105 
5106             e0 = *(float32 *)(vn + H1_2(i));
5107             e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5108             e2 = *(float32 *)(vn + H1_2(j));
5109             e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5110 
5111             if (likely((pg >> (i & 63)) & 1)) {
5112                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5113             }
5114             if (likely((pg >> (j & 63)) & 1)) {
5115                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5116             }
5117         } while (i & 63);
5118     } while (i != 0);
5119 }
5120 
5121 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5122                          void *vs, uint32_t desc)
5123 {
5124     intptr_t j, i = simd_oprsz(desc);
5125     uint64_t *g = vg;
5126     float64 neg_imag = float64_set_sign(0, simd_data(desc));
5127     float64 neg_real = float64_chs(neg_imag);
5128 
5129     do {
5130         uint64_t pg = g[(i - 1) >> 6];
5131         do {
5132             float64 e0, e1, e2, e3;
5133 
5134             /* I holds the real index; J holds the imag index.  */
5135             j = i - sizeof(float64);
5136             i -= 2 * sizeof(float64);
5137 
5138             e0 = *(float64 *)(vn + H1_2(i));
5139             e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5140             e2 = *(float64 *)(vn + H1_2(j));
5141             e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5142 
5143             if (likely((pg >> (i & 63)) & 1)) {
5144                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5145             }
5146             if (likely((pg >> (j & 63)) & 1)) {
5147                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5148             }
5149         } while (i & 63);
5150     } while (i != 0);
5151 }
5152 
5153 /*
5154  * FP Complex Multiply
5155  */
5156 
5157 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5158                                void *vg, void *status, uint32_t desc)
5159 {
5160     intptr_t j, i = simd_oprsz(desc);
5161     unsigned rot = simd_data(desc);
5162     bool flip = rot & 1;
5163     float16 neg_imag, neg_real;
5164     uint64_t *g = vg;
5165 
5166     neg_imag = float16_set_sign(0, (rot & 2) != 0);
5167     neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5168 
5169     do {
5170         uint64_t pg = g[(i - 1) >> 6];
5171         do {
5172             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5173 
5174             /* I holds the real index; J holds the imag index.  */
5175             j = i - sizeof(float16);
5176             i -= 2 * sizeof(float16);
5177 
5178             nr = *(float16 *)(vn + H1_2(i));
5179             ni = *(float16 *)(vn + H1_2(j));
5180             mr = *(float16 *)(vm + H1_2(i));
5181             mi = *(float16 *)(vm + H1_2(j));
5182 
5183             e2 = (flip ? ni : nr);
5184             e1 = (flip ? mi : mr) ^ neg_real;
5185             e4 = e2;
5186             e3 = (flip ? mr : mi) ^ neg_imag;
5187 
5188             if (likely((pg >> (i & 63)) & 1)) {
5189                 d = *(float16 *)(va + H1_2(i));
5190                 d = float16_muladd(e2, e1, d, 0, status);
5191                 *(float16 *)(vd + H1_2(i)) = d;
5192             }
5193             if (likely((pg >> (j & 63)) & 1)) {
5194                 d = *(float16 *)(va + H1_2(j));
5195                 d = float16_muladd(e4, e3, d, 0, status);
5196                 *(float16 *)(vd + H1_2(j)) = d;
5197             }
5198         } while (i & 63);
5199     } while (i != 0);
5200 }
5201 
5202 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5203                                void *vg, void *status, uint32_t desc)
5204 {
5205     intptr_t j, i = simd_oprsz(desc);
5206     unsigned rot = simd_data(desc);
5207     bool flip = rot & 1;
5208     float32 neg_imag, neg_real;
5209     uint64_t *g = vg;
5210 
5211     neg_imag = float32_set_sign(0, (rot & 2) != 0);
5212     neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5213 
5214     do {
5215         uint64_t pg = g[(i - 1) >> 6];
5216         do {
5217             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5218 
5219             /* I holds the real index; J holds the imag index.  */
5220             j = i - sizeof(float32);
5221             i -= 2 * sizeof(float32);
5222 
5223             nr = *(float32 *)(vn + H1_2(i));
5224             ni = *(float32 *)(vn + H1_2(j));
5225             mr = *(float32 *)(vm + H1_2(i));
5226             mi = *(float32 *)(vm + H1_2(j));
5227 
5228             e2 = (flip ? ni : nr);
5229             e1 = (flip ? mi : mr) ^ neg_real;
5230             e4 = e2;
5231             e3 = (flip ? mr : mi) ^ neg_imag;
5232 
5233             if (likely((pg >> (i & 63)) & 1)) {
5234                 d = *(float32 *)(va + H1_2(i));
5235                 d = float32_muladd(e2, e1, d, 0, status);
5236                 *(float32 *)(vd + H1_2(i)) = d;
5237             }
5238             if (likely((pg >> (j & 63)) & 1)) {
5239                 d = *(float32 *)(va + H1_2(j));
5240                 d = float32_muladd(e4, e3, d, 0, status);
5241                 *(float32 *)(vd + H1_2(j)) = d;
5242             }
5243         } while (i & 63);
5244     } while (i != 0);
5245 }
5246 
5247 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5248                                void *vg, void *status, uint32_t desc)
5249 {
5250     intptr_t j, i = simd_oprsz(desc);
5251     unsigned rot = simd_data(desc);
5252     bool flip = rot & 1;
5253     float64 neg_imag, neg_real;
5254     uint64_t *g = vg;
5255 
5256     neg_imag = float64_set_sign(0, (rot & 2) != 0);
5257     neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5258 
5259     do {
5260         uint64_t pg = g[(i - 1) >> 6];
5261         do {
5262             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5263 
5264             /* I holds the real index; J holds the imag index.  */
5265             j = i - sizeof(float64);
5266             i -= 2 * sizeof(float64);
5267 
5268             nr = *(float64 *)(vn + H1_2(i));
5269             ni = *(float64 *)(vn + H1_2(j));
5270             mr = *(float64 *)(vm + H1_2(i));
5271             mi = *(float64 *)(vm + H1_2(j));
5272 
5273             e2 = (flip ? ni : nr);
5274             e1 = (flip ? mi : mr) ^ neg_real;
5275             e4 = e2;
5276             e3 = (flip ? mr : mi) ^ neg_imag;
5277 
5278             if (likely((pg >> (i & 63)) & 1)) {
5279                 d = *(float64 *)(va + H1_2(i));
5280                 d = float64_muladd(e2, e1, d, 0, status);
5281                 *(float64 *)(vd + H1_2(i)) = d;
5282             }
5283             if (likely((pg >> (j & 63)) & 1)) {
5284                 d = *(float64 *)(va + H1_2(j));
5285                 d = float64_muladd(e4, e3, d, 0, status);
5286                 *(float64 *)(vd + H1_2(j)) = d;
5287             }
5288         } while (i & 63);
5289     } while (i != 0);
5290 }
5291 
5292 /*
5293  * Load contiguous data, protected by a governing predicate.
5294  */
5295 
5296 /*
5297  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5298  * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
5299  * element >= @reg_off, or @reg_max if there were no active elements at all.
5300  */
5301 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5302                                  intptr_t reg_max, int esz)
5303 {
5304     uint64_t pg_mask = pred_esz_masks[esz];
5305     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5306 
5307     /* In normal usage, the first element is active.  */
5308     if (likely(pg & 1)) {
5309         return reg_off;
5310     }
5311 
5312     if (pg == 0) {
5313         reg_off &= -64;
5314         do {
5315             reg_off += 64;
5316             if (unlikely(reg_off >= reg_max)) {
5317                 /* The entire predicate was false.  */
5318                 return reg_max;
5319             }
5320             pg = vg[reg_off >> 6] & pg_mask;
5321         } while (pg == 0);
5322     }
5323     reg_off += ctz64(pg);
5324 
5325     /* We should never see an out of range predicate bit set.  */
5326     tcg_debug_assert(reg_off < reg_max);
5327     return reg_off;
5328 }
5329 
5330 /*
5331  * Resolve the guest virtual address to info->host and info->flags.
5332  * If @nofault, return false if the page is invalid, otherwise
5333  * exit via page fault exception.
5334  */
5335 
5336 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5337                     target_ulong addr, int mem_off, MMUAccessType access_type,
5338                     int mmu_idx, uintptr_t retaddr)
5339 {
5340     int flags;
5341 
5342     addr += mem_off;
5343 
5344     /*
5345      * User-only currently always issues with TBI.  See the comment
5346      * above useronly_clean_ptr.  Usually we clean this top byte away
5347      * during translation, but we can't do that for e.g. vector + imm
5348      * addressing modes.
5349      *
5350      * We currently always enable TBI for user-only, and do not provide
5351      * a way to turn it off.  So clean the pointer unconditionally here,
5352      * rather than look it up here, or pass it down from above.
5353      */
5354     addr = useronly_clean_ptr(addr);
5355 
5356 #ifdef CONFIG_USER_ONLY
5357     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5358                                &info->host, retaddr);
5359 #else
5360     CPUTLBEntryFull *full;
5361     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5362                               &info->host, &full, retaddr);
5363 #endif
5364     info->flags = flags;
5365 
5366     if (flags & TLB_INVALID_MASK) {
5367         g_assert(nofault);
5368         return false;
5369     }
5370 
5371 #ifdef CONFIG_USER_ONLY
5372     memset(&info->attrs, 0, sizeof(info->attrs));
5373     /* Require both ANON and MTE; see allocation_tag_mem(). */
5374     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5375 #else
5376     info->attrs = full->attrs;
5377     info->tagged = full->extra.arm.pte_attrs == 0xf0;
5378 #endif
5379 
5380     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5381     info->host -= mem_off;
5382     return true;
5383 }
5384 
5385 /*
5386  * Find first active element on each page, and a loose bound for the
5387  * final element on each page.  Identify any single element that spans
5388  * the page boundary.  Return true if there are any active elements.
5389  */
5390 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5391                             intptr_t reg_max, int esz, int msize)
5392 {
5393     const int esize = 1 << esz;
5394     const uint64_t pg_mask = pred_esz_masks[esz];
5395     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5396     intptr_t mem_off_last, mem_off_split;
5397     intptr_t page_split, elt_split;
5398     intptr_t i;
5399 
5400     /* Set all of the element indices to -1, and the TLB data to 0. */
5401     memset(info, -1, offsetof(SVEContLdSt, page));
5402     memset(info->page, 0, sizeof(info->page));
5403 
5404     /* Gross scan over the entire predicate to find bounds. */
5405     i = 0;
5406     do {
5407         uint64_t pg = vg[i] & pg_mask;
5408         if (pg) {
5409             reg_off_last = i * 64 + 63 - clz64(pg);
5410             if (reg_off_first < 0) {
5411                 reg_off_first = i * 64 + ctz64(pg);
5412             }
5413         }
5414     } while (++i * 64 < reg_max);
5415 
5416     if (unlikely(reg_off_first < 0)) {
5417         /* No active elements, no pages touched. */
5418         return false;
5419     }
5420     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5421 
5422     info->reg_off_first[0] = reg_off_first;
5423     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5424     mem_off_last = (reg_off_last >> esz) * msize;
5425 
5426     page_split = -(addr | TARGET_PAGE_MASK);
5427     if (likely(mem_off_last + msize <= page_split)) {
5428         /* The entire operation fits within a single page. */
5429         info->reg_off_last[0] = reg_off_last;
5430         return true;
5431     }
5432 
5433     info->page_split = page_split;
5434     elt_split = page_split / msize;
5435     reg_off_split = elt_split << esz;
5436     mem_off_split = elt_split * msize;
5437 
5438     /*
5439      * This is the last full element on the first page, but it is not
5440      * necessarily active.  If there is no full element, i.e. the first
5441      * active element is the one that's split, this value remains -1.
5442      * It is useful as iteration bounds.
5443      */
5444     if (elt_split != 0) {
5445         info->reg_off_last[0] = reg_off_split - esize;
5446     }
5447 
5448     /* Determine if an unaligned element spans the pages.  */
5449     if (page_split % msize != 0) {
5450         /* It is helpful to know if the split element is active. */
5451         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5452             info->reg_off_split = reg_off_split;
5453             info->mem_off_split = mem_off_split;
5454 
5455             if (reg_off_split == reg_off_last) {
5456                 /* The page crossing element is last. */
5457                 return true;
5458             }
5459         }
5460         reg_off_split += esize;
5461         mem_off_split += msize;
5462     }
5463 
5464     /*
5465      * We do want the first active element on the second page, because
5466      * this may affect the address reported in an exception.
5467      */
5468     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5469     tcg_debug_assert(reg_off_split <= reg_off_last);
5470     info->reg_off_first[1] = reg_off_split;
5471     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5472     info->reg_off_last[1] = reg_off_last;
5473     return true;
5474 }
5475 
5476 /*
5477  * Resolve the guest virtual addresses to info->page[].
5478  * Control the generation of page faults with @fault.  Return false if
5479  * there is no work to do, which can only happen with @fault == FAULT_NO.
5480  */
5481 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5482                          CPUARMState *env, target_ulong addr,
5483                          MMUAccessType access_type, uintptr_t retaddr)
5484 {
5485     int mmu_idx = arm_env_mmu_index(env);
5486     int mem_off = info->mem_off_first[0];
5487     bool nofault = fault == FAULT_NO;
5488     bool have_work = true;
5489 
5490     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5491                         access_type, mmu_idx, retaddr)) {
5492         /* No work to be done. */
5493         return false;
5494     }
5495 
5496     if (likely(info->page_split < 0)) {
5497         /* The entire operation was on the one page. */
5498         return true;
5499     }
5500 
5501     /*
5502      * If the second page is invalid, then we want the fault address to be
5503      * the first byte on that page which is accessed.
5504      */
5505     if (info->mem_off_split >= 0) {
5506         /*
5507          * There is an element split across the pages.  The fault address
5508          * should be the first byte of the second page.
5509          */
5510         mem_off = info->page_split;
5511         /*
5512          * If the split element is also the first active element
5513          * of the vector, then:  For first-fault we should continue
5514          * to generate faults for the second page.  For no-fault,
5515          * we have work only if the second page is valid.
5516          */
5517         if (info->mem_off_first[0] < info->mem_off_split) {
5518             nofault = FAULT_FIRST;
5519             have_work = false;
5520         }
5521     } else {
5522         /*
5523          * There is no element split across the pages.  The fault address
5524          * should be the first active element on the second page.
5525          */
5526         mem_off = info->mem_off_first[1];
5527         /*
5528          * There must have been one active element on the first page,
5529          * so we're out of first-fault territory.
5530          */
5531         nofault = fault != FAULT_ALL;
5532     }
5533 
5534     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5535                                 access_type, mmu_idx, retaddr);
5536     return have_work;
5537 }
5538 
5539 #ifndef CONFIG_USER_ONLY
5540 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5541                                uint64_t *vg, target_ulong addr,
5542                                int esize, int msize, int wp_access,
5543                                uintptr_t retaddr)
5544 {
5545     intptr_t mem_off, reg_off, reg_last;
5546     int flags0 = info->page[0].flags;
5547     int flags1 = info->page[1].flags;
5548 
5549     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5550         return;
5551     }
5552 
5553     /* Indicate that watchpoints are handled. */
5554     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5555     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5556 
5557     if (flags0 & TLB_WATCHPOINT) {
5558         mem_off = info->mem_off_first[0];
5559         reg_off = info->reg_off_first[0];
5560         reg_last = info->reg_off_last[0];
5561 
5562         while (reg_off <= reg_last) {
5563             uint64_t pg = vg[reg_off >> 6];
5564             do {
5565                 if ((pg >> (reg_off & 63)) & 1) {
5566                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5567                                          msize, info->page[0].attrs,
5568                                          wp_access, retaddr);
5569                 }
5570                 reg_off += esize;
5571                 mem_off += msize;
5572             } while (reg_off <= reg_last && (reg_off & 63));
5573         }
5574     }
5575 
5576     mem_off = info->mem_off_split;
5577     if (mem_off >= 0) {
5578         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5579                              info->page[0].attrs, wp_access, retaddr);
5580     }
5581 
5582     mem_off = info->mem_off_first[1];
5583     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5584         reg_off = info->reg_off_first[1];
5585         reg_last = info->reg_off_last[1];
5586 
5587         do {
5588             uint64_t pg = vg[reg_off >> 6];
5589             do {
5590                 if ((pg >> (reg_off & 63)) & 1) {
5591                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5592                                          msize, info->page[1].attrs,
5593                                          wp_access, retaddr);
5594                 }
5595                 reg_off += esize;
5596                 mem_off += msize;
5597             } while (reg_off & 63);
5598         } while (reg_off <= reg_last);
5599     }
5600 }
5601 #endif
5602 
5603 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5604                              uint64_t *vg, target_ulong addr, int esize,
5605                              int msize, uint32_t mtedesc, uintptr_t ra)
5606 {
5607     intptr_t mem_off, reg_off, reg_last;
5608 
5609     /* Process the page only if MemAttr == Tagged. */
5610     if (info->page[0].tagged) {
5611         mem_off = info->mem_off_first[0];
5612         reg_off = info->reg_off_first[0];
5613         reg_last = info->reg_off_split;
5614         if (reg_last < 0) {
5615             reg_last = info->reg_off_last[0];
5616         }
5617 
5618         do {
5619             uint64_t pg = vg[reg_off >> 6];
5620             do {
5621                 if ((pg >> (reg_off & 63)) & 1) {
5622                     mte_check(env, mtedesc, addr, ra);
5623                 }
5624                 reg_off += esize;
5625                 mem_off += msize;
5626             } while (reg_off <= reg_last && (reg_off & 63));
5627         } while (reg_off <= reg_last);
5628     }
5629 
5630     mem_off = info->mem_off_first[1];
5631     if (mem_off >= 0 && info->page[1].tagged) {
5632         reg_off = info->reg_off_first[1];
5633         reg_last = info->reg_off_last[1];
5634 
5635         do {
5636             uint64_t pg = vg[reg_off >> 6];
5637             do {
5638                 if ((pg >> (reg_off & 63)) & 1) {
5639                     mte_check(env, mtedesc, addr, ra);
5640                 }
5641                 reg_off += esize;
5642                 mem_off += msize;
5643             } while (reg_off & 63);
5644         } while (reg_off <= reg_last);
5645     }
5646 }
5647 
5648 /*
5649  * Common helper for all contiguous 1,2,3,4-register predicated stores.
5650  */
5651 static inline QEMU_ALWAYS_INLINE
5652 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5653                uint32_t desc, const uintptr_t retaddr,
5654                const int esz, const int msz, const int N, uint32_t mtedesc,
5655                sve_ldst1_host_fn *host_fn,
5656                sve_ldst1_tlb_fn *tlb_fn)
5657 {
5658     const unsigned rd = simd_data(desc);
5659     const intptr_t reg_max = simd_oprsz(desc);
5660     intptr_t reg_off, reg_last, mem_off;
5661     SVEContLdSt info;
5662     void *host;
5663     int flags, i;
5664 
5665     /* Find the active elements.  */
5666     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5667         /* The entire predicate was false; no load occurs.  */
5668         for (i = 0; i < N; ++i) {
5669             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5670         }
5671         return;
5672     }
5673 
5674     /* Probe the page(s).  Exit with exception for any invalid page. */
5675     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5676 
5677     /* Handle watchpoints for all active elements. */
5678     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5679                               BP_MEM_READ, retaddr);
5680 
5681     /*
5682      * Handle mte checks for all active elements.
5683      * Since TBI must be set for MTE, !mtedesc => !mte_active.
5684      */
5685     if (mtedesc) {
5686         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5687                                 mtedesc, retaddr);
5688     }
5689 
5690     flags = info.page[0].flags | info.page[1].flags;
5691     if (unlikely(flags != 0)) {
5692         /*
5693          * At least one page includes MMIO.
5694          * Any bus operation can fail with cpu_transaction_failed,
5695          * which for ARM will raise SyncExternal.  Perform the load
5696          * into scratch memory to preserve register state until the end.
5697          */
5698         ARMVectorReg scratch[4] = { };
5699 
5700         mem_off = info.mem_off_first[0];
5701         reg_off = info.reg_off_first[0];
5702         reg_last = info.reg_off_last[1];
5703         if (reg_last < 0) {
5704             reg_last = info.reg_off_split;
5705             if (reg_last < 0) {
5706                 reg_last = info.reg_off_last[0];
5707             }
5708         }
5709 
5710         do {
5711             uint64_t pg = vg[reg_off >> 6];
5712             do {
5713                 if ((pg >> (reg_off & 63)) & 1) {
5714                     for (i = 0; i < N; ++i) {
5715                         tlb_fn(env, &scratch[i], reg_off,
5716                                addr + mem_off + (i << msz), retaddr);
5717                     }
5718                 }
5719                 reg_off += 1 << esz;
5720                 mem_off += N << msz;
5721             } while (reg_off & 63);
5722         } while (reg_off <= reg_last);
5723 
5724         for (i = 0; i < N; ++i) {
5725             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5726         }
5727         return;
5728     }
5729 
5730     /* The entire operation is in RAM, on valid pages. */
5731 
5732     for (i = 0; i < N; ++i) {
5733         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5734     }
5735 
5736     mem_off = info.mem_off_first[0];
5737     reg_off = info.reg_off_first[0];
5738     reg_last = info.reg_off_last[0];
5739     host = info.page[0].host;
5740 
5741     set_helper_retaddr(retaddr);
5742 
5743     while (reg_off <= reg_last) {
5744         uint64_t pg = vg[reg_off >> 6];
5745         do {
5746             if ((pg >> (reg_off & 63)) & 1) {
5747                 for (i = 0; i < N; ++i) {
5748                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5749                             host + mem_off + (i << msz));
5750                 }
5751             }
5752             reg_off += 1 << esz;
5753             mem_off += N << msz;
5754         } while (reg_off <= reg_last && (reg_off & 63));
5755     }
5756 
5757     clear_helper_retaddr();
5758 
5759     /*
5760      * Use the slow path to manage the cross-page misalignment.
5761      * But we know this is RAM and cannot trap.
5762      */
5763     mem_off = info.mem_off_split;
5764     if (unlikely(mem_off >= 0)) {
5765         reg_off = info.reg_off_split;
5766         for (i = 0; i < N; ++i) {
5767             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5768                    addr + mem_off + (i << msz), retaddr);
5769         }
5770     }
5771 
5772     mem_off = info.mem_off_first[1];
5773     if (unlikely(mem_off >= 0)) {
5774         reg_off = info.reg_off_first[1];
5775         reg_last = info.reg_off_last[1];
5776         host = info.page[1].host;
5777 
5778         set_helper_retaddr(retaddr);
5779 
5780         do {
5781             uint64_t pg = vg[reg_off >> 6];
5782             do {
5783                 if ((pg >> (reg_off & 63)) & 1) {
5784                     for (i = 0; i < N; ++i) {
5785                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5786                                 host + mem_off + (i << msz));
5787                     }
5788                 }
5789                 reg_off += 1 << esz;
5790                 mem_off += N << msz;
5791             } while (reg_off & 63);
5792         } while (reg_off <= reg_last);
5793 
5794         clear_helper_retaddr();
5795     }
5796 }
5797 
5798 static inline QEMU_ALWAYS_INLINE
5799 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5800                    uint32_t desc, const uintptr_t ra,
5801                    const int esz, const int msz, const int N,
5802                    sve_ldst1_host_fn *host_fn,
5803                    sve_ldst1_tlb_fn *tlb_fn)
5804 {
5805     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5806     int bit55 = extract64(addr, 55, 1);
5807 
5808     /* Remove mtedesc from the normal sve descriptor. */
5809     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5810 
5811     /* Perform gross MTE suppression early. */
5812     if (!tbi_check(mtedesc, bit55) ||
5813         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
5814         mtedesc = 0;
5815     }
5816 
5817     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5818 }
5819 
5820 #define DO_LD1_1(NAME, ESZ)                                             \
5821 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
5822                             target_ulong addr, uint32_t desc)           \
5823 {                                                                       \
5824     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
5825               sve_##NAME##_host, sve_##NAME##_tlb);                     \
5826 }                                                                       \
5827 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
5828                                 target_ulong addr, uint32_t desc)       \
5829 {                                                                       \
5830     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
5831                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
5832 }
5833 
5834 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
5835 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
5836                                target_ulong addr, uint32_t desc)        \
5837 {                                                                       \
5838     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5839               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
5840 }                                                                       \
5841 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
5842                                target_ulong addr, uint32_t desc)        \
5843 {                                                                       \
5844     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5845               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
5846 }                                                                       \
5847 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
5848                                    target_ulong addr, uint32_t desc)    \
5849 {                                                                       \
5850     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5851                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
5852 }                                                                       \
5853 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
5854                                    target_ulong addr, uint32_t desc)    \
5855 {                                                                       \
5856     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5857                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
5858 }
5859 
5860 DO_LD1_1(ld1bb,  MO_8)
5861 DO_LD1_1(ld1bhu, MO_16)
5862 DO_LD1_1(ld1bhs, MO_16)
5863 DO_LD1_1(ld1bsu, MO_32)
5864 DO_LD1_1(ld1bss, MO_32)
5865 DO_LD1_1(ld1bdu, MO_64)
5866 DO_LD1_1(ld1bds, MO_64)
5867 
5868 DO_LD1_2(ld1hh,  MO_16, MO_16)
5869 DO_LD1_2(ld1hsu, MO_32, MO_16)
5870 DO_LD1_2(ld1hss, MO_32, MO_16)
5871 DO_LD1_2(ld1hdu, MO_64, MO_16)
5872 DO_LD1_2(ld1hds, MO_64, MO_16)
5873 
5874 DO_LD1_2(ld1ss,  MO_32, MO_32)
5875 DO_LD1_2(ld1sdu, MO_64, MO_32)
5876 DO_LD1_2(ld1sds, MO_64, MO_32)
5877 
5878 DO_LD1_2(ld1dd,  MO_64, MO_64)
5879 
5880 #undef DO_LD1_1
5881 #undef DO_LD1_2
5882 
5883 #define DO_LDN_1(N)                                                     \
5884 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
5885                              target_ulong addr, uint32_t desc)          \
5886 {                                                                       \
5887     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
5888               sve_ld1bb_host, sve_ld1bb_tlb);                           \
5889 }                                                                       \
5890 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
5891                                  target_ulong addr, uint32_t desc)      \
5892 {                                                                       \
5893     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
5894                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
5895 }
5896 
5897 #define DO_LDN_2(N, SUFF, ESZ)                                          \
5898 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
5899                                     target_ulong addr, uint32_t desc)   \
5900 {                                                                       \
5901     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5902               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
5903 }                                                                       \
5904 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
5905                                     target_ulong addr, uint32_t desc)   \
5906 {                                                                       \
5907     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5908               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
5909 }                                                                       \
5910 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
5911                                         target_ulong addr, uint32_t desc) \
5912 {                                                                       \
5913     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5914                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
5915 }                                                                       \
5916 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
5917                                         target_ulong addr, uint32_t desc) \
5918 {                                                                       \
5919     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5920                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
5921 }
5922 
5923 DO_LDN_1(2)
5924 DO_LDN_1(3)
5925 DO_LDN_1(4)
5926 
5927 DO_LDN_2(2, hh, MO_16)
5928 DO_LDN_2(3, hh, MO_16)
5929 DO_LDN_2(4, hh, MO_16)
5930 
5931 DO_LDN_2(2, ss, MO_32)
5932 DO_LDN_2(3, ss, MO_32)
5933 DO_LDN_2(4, ss, MO_32)
5934 
5935 DO_LDN_2(2, dd, MO_64)
5936 DO_LDN_2(3, dd, MO_64)
5937 DO_LDN_2(4, dd, MO_64)
5938 
5939 #undef DO_LDN_1
5940 #undef DO_LDN_2
5941 
5942 /*
5943  * Load contiguous data, first-fault and no-fault.
5944  *
5945  * For user-only, we control the race between page_check_range and
5946  * another thread's munmap by using set/clear_helper_retaddr.  Any
5947  * SEGV that occurs between those markers is assumed to be because
5948  * the guest page vanished.  Keep that block as small as possible
5949  * so that unrelated QEMU bugs are not blamed on the guest.
5950  */
5951 
5952 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
5953  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5954  * option, which leaves subsequent data unchanged.
5955  */
5956 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5957 {
5958     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5959 
5960     if (i & 63) {
5961         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5962         i = ROUND_UP(i, 64);
5963     }
5964     for (; i < oprsz; i += 64) {
5965         ffr[i / 64] = 0;
5966     }
5967 }
5968 
5969 /*
5970  * Common helper for all contiguous no-fault and first-fault loads.
5971  */
5972 static inline QEMU_ALWAYS_INLINE
5973 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5974                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5975                    const int esz, const int msz, const SVEContFault fault,
5976                    sve_ldst1_host_fn *host_fn,
5977                    sve_ldst1_tlb_fn *tlb_fn)
5978 {
5979     const unsigned rd = simd_data(desc);
5980     void *vd = &env->vfp.zregs[rd];
5981     const intptr_t reg_max = simd_oprsz(desc);
5982     intptr_t reg_off, mem_off, reg_last;
5983     SVEContLdSt info;
5984     int flags;
5985     void *host;
5986 
5987     /* Find the active elements.  */
5988     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5989         /* The entire predicate was false; no load occurs.  */
5990         memset(vd, 0, reg_max);
5991         return;
5992     }
5993     reg_off = info.reg_off_first[0];
5994 
5995     /* Probe the page(s). */
5996     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5997         /* Fault on first element. */
5998         tcg_debug_assert(fault == FAULT_NO);
5999         memset(vd, 0, reg_max);
6000         goto do_fault;
6001     }
6002 
6003     mem_off = info.mem_off_first[0];
6004     flags = info.page[0].flags;
6005 
6006     /*
6007      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6008      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6009      */
6010     if (!info.page[0].tagged) {
6011         mtedesc = 0;
6012     }
6013 
6014     if (fault == FAULT_FIRST) {
6015         /* Trapping mte check for the first-fault element.  */
6016         if (mtedesc) {
6017             mte_check(env, mtedesc, addr + mem_off, retaddr);
6018         }
6019 
6020         /*
6021          * Special handling of the first active element,
6022          * if it crosses a page boundary or is MMIO.
6023          */
6024         bool is_split = mem_off == info.mem_off_split;
6025         if (unlikely(flags != 0) || unlikely(is_split)) {
6026             /*
6027              * Use the slow path for cross-page handling.
6028              * Might trap for MMIO or watchpoints.
6029              */
6030             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6031 
6032             /* After any fault, zero the other elements. */
6033             swap_memzero(vd, reg_off);
6034             reg_off += 1 << esz;
6035             mem_off += 1 << msz;
6036             swap_memzero(vd + reg_off, reg_max - reg_off);
6037 
6038             if (is_split) {
6039                 goto second_page;
6040             }
6041         } else {
6042             memset(vd, 0, reg_max);
6043         }
6044     } else {
6045         memset(vd, 0, reg_max);
6046         if (unlikely(mem_off == info.mem_off_split)) {
6047             /* The first active element crosses a page boundary. */
6048             flags |= info.page[1].flags;
6049             if (unlikely(flags & TLB_MMIO)) {
6050                 /* Some page is MMIO, see below. */
6051                 goto do_fault;
6052             }
6053             if (unlikely(flags & TLB_WATCHPOINT) &&
6054                 (cpu_watchpoint_address_matches
6055                  (env_cpu(env), addr + mem_off, 1 << msz)
6056                  & BP_MEM_READ)) {
6057                 /* Watchpoint hit, see below. */
6058                 goto do_fault;
6059             }
6060             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6061                 goto do_fault;
6062             }
6063             /*
6064              * Use the slow path for cross-page handling.
6065              * This is RAM, without a watchpoint, and will not trap.
6066              */
6067             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6068             goto second_page;
6069         }
6070     }
6071 
6072     /*
6073      * From this point on, all memory operations are MemSingleNF.
6074      *
6075      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6076      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6077      *
6078      * Unfortuately we do not have access to the memory attributes from the
6079      * PTE to tell Device memory from Normal memory.  So we make a mostly
6080      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6081      * This gives the right answer for the common cases of "Normal memory,
6082      * backed by host RAM" and "Device memory, backed by MMIO".
6083      * The architecture allows us to suppress an NF load and return
6084      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6085      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6086      * get wrong is "Device memory, backed by host RAM", for which we
6087      * should return (UNKNOWN, FAULT) for but do not.
6088      *
6089      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6090      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6091      * architectural breakpoints the same.
6092      */
6093     if (unlikely(flags & TLB_MMIO)) {
6094         goto do_fault;
6095     }
6096 
6097     reg_last = info.reg_off_last[0];
6098     host = info.page[0].host;
6099 
6100     set_helper_retaddr(retaddr);
6101 
6102     do {
6103         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6104         do {
6105             if ((pg >> (reg_off & 63)) & 1) {
6106                 if (unlikely(flags & TLB_WATCHPOINT) &&
6107                     (cpu_watchpoint_address_matches
6108                      (env_cpu(env), addr + mem_off, 1 << msz)
6109                      & BP_MEM_READ)) {
6110                     clear_helper_retaddr();
6111                     goto do_fault;
6112                 }
6113                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6114                     clear_helper_retaddr();
6115                     goto do_fault;
6116                 }
6117                 host_fn(vd, reg_off, host + mem_off);
6118             }
6119             reg_off += 1 << esz;
6120             mem_off += 1 << msz;
6121         } while (reg_off <= reg_last && (reg_off & 63));
6122     } while (reg_off <= reg_last);
6123 
6124     clear_helper_retaddr();
6125 
6126     /*
6127      * MemSingleNF is allowed to fail for any reason.  We have special
6128      * code above to handle the first element crossing a page boundary.
6129      * As an implementation choice, decline to handle a cross-page element
6130      * in any other position.
6131      */
6132     reg_off = info.reg_off_split;
6133     if (reg_off >= 0) {
6134         goto do_fault;
6135     }
6136 
6137  second_page:
6138     reg_off = info.reg_off_first[1];
6139     if (likely(reg_off < 0)) {
6140         /* No active elements on the second page.  All done. */
6141         return;
6142     }
6143 
6144     /*
6145      * MemSingleNF is allowed to fail for any reason.  As an implementation
6146      * choice, decline to handle elements on the second page.  This should
6147      * be low frequency as the guest walks through memory -- the next
6148      * iteration of the guest's loop should be aligned on the page boundary,
6149      * and then all following iterations will stay aligned.
6150      */
6151 
6152  do_fault:
6153     record_fault(env, reg_off, reg_max);
6154 }
6155 
6156 static inline QEMU_ALWAYS_INLINE
6157 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6158                        uint32_t desc, const uintptr_t retaddr,
6159                        const int esz, const int msz, const SVEContFault fault,
6160                        sve_ldst1_host_fn *host_fn,
6161                        sve_ldst1_tlb_fn *tlb_fn)
6162 {
6163     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6164     int bit55 = extract64(addr, 55, 1);
6165 
6166     /* Remove mtedesc from the normal sve descriptor. */
6167     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6168 
6169     /* Perform gross MTE suppression early. */
6170     if (!tbi_check(mtedesc, bit55) ||
6171         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6172         mtedesc = 0;
6173     }
6174 
6175     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6176                   esz, msz, fault, host_fn, tlb_fn);
6177 }
6178 
/*
 * Define the contiguous first-fault (ldff1) and no-fault (ldnf1) load
 * helpers for loads whose memory element is a single byte (MO_8), so
 * that no endianness variants are required.  Four helpers are emitted:
 *
 *   sve_ldff1<PART>_r       first-fault, no MTE (mtedesc passed as 0)
 *   sve_ldnf1<PART>_r       no-fault,    no MTE (mtedesc passed as 0)
 *   sve_ldff1<PART>_r_mte   first-fault, MTE descriptor taken from desc
 *   sve_ldnf1<PART>_r_mte   no-fault,    MTE descriptor taken from desc
 *
 * PART selects the sve_ld1<PART>_host / sve_ld1<PART>_tlb element
 * accessors and ESZ is the log2 register element size.  GETPC() supplies
 * the retaddr argument of the common helper.
 */
#define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
}                                                                       \
void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}
6204 
/*
 * Define the contiguous first-fault (ldff1) and no-fault (ldnf1) load
 * helpers for multi-byte memory elements.  For each of little-endian
 * (_le) and big-endian (_be) accessors, four helpers are emitted:
 * ldff1 and ldnf1, each without MTE (mtedesc passed as 0) and with MTE
 * (the _mte suffix, routed through sve_ldnfff1_r_mte()).
 *
 * PART selects the sve_ld1<PART>_{le,be}_host / _tlb element accessors,
 * ESZ is the log2 register element size and MSZ the log2 memory element
 * size.  GETPC() supplies the retaddr argument of the common helper.
 */
#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}                                                                       \
void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}
6254 
/*
 * Instantiate the contiguous first-fault / no-fault load helpers.
 *
 * In each PART name the first letter tracks the memory element size
 * (b/h/s/d for MO_8/MO_16/MO_32/MO_64) and the second letter the register
 * element size given as ESZ.  A trailing 'u' or 's' presumably selects
 * zero- or sign-extension in the sve_ld1* accessors -- confirm against
 * their definitions.
 */

/* Byte memory elements: no endianness variants. */
DO_LDFF1_LDNF1_1(bb,  MO_8)
DO_LDFF1_LDNF1_1(bhu, MO_16)
DO_LDFF1_LDNF1_1(bhs, MO_16)
DO_LDFF1_LDNF1_1(bsu, MO_32)
DO_LDFF1_LDNF1_1(bss, MO_32)
DO_LDFF1_LDNF1_1(bdu, MO_64)
DO_LDFF1_LDNF1_1(bds, MO_64)

/* Halfword memory elements. */
DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

/* Word memory elements. */
DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

/* Doubleword memory elements. */
DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)

/* The generator macros are not needed past this point. */
#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
6277 
/*
 * Common helper for all contiguous 1,2,3,4-register predicated stores.
 *
 * @env:     CPU state; the data stored comes from env->vfp.zregs[].
 * @vg:      governing predicate, read as 64-bit words with one bit per
 *           byte of vector register offset.
 * @addr:    guest address corresponding to register offset 0.
 * @desc:    simd descriptor: simd_oprsz() gives the vector size in bytes,
 *           simd_data() the number of the first source register.
 * @retaddr: host return address, forwarded to the probe/check helpers
 *           and to @tlb_fn.
 * @esz:     log2 of the register element size in bytes.
 * @msz:     log2 of the memory element size in bytes.
 * @N:       number of registers stored; each active element slot
 *           occupies N << msz bytes of memory, with register i at
 *           offset i << msz within the slot.
 * @mtedesc: MTE descriptor, or 0 to skip the MTE checks.
 * @host_fn: stores one element through a host pointer (fast path).
 * @tlb_fn:  stores one element through a guest address (slow path).
 *
 * All page probes, watchpoint checks and MTE checks are performed before
 * the first byte is written.
 */

static inline QEMU_ALWAYS_INLINE
void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int i, flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no store occurs.  */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_WRITE, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    /* Any flag left set on either page forces the per-element slow path. */
    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        /* User-only builds are not expected to see any page flags here. */
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        /*
         * Pick the last active element overall: the last one on page 1
         * if there is one, else the page-crossing element, else the
         * last one on page 0.  Negative values mean "none".
         */
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        /* Store every active element through the guest-address path. */
        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    /* Fast path, first page: write directly through the host pointer. */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    set_helper_retaddr(retaddr);

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
            /*
             * Unlike the other loops, stop exactly at reg_last: later
             * predicate bits in this word presumably belong to the split
             * element or to the second page, which are handled below.
             */
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    clear_helper_retaddr();

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Fast path, second page: only entered if it holds active elements. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        set_helper_retaddr(retaddr);

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        clear_helper_retaddr();
    }
}
6417 
6418 static inline QEMU_ALWAYS_INLINE
6419 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6420                    uint32_t desc, const uintptr_t ra,
6421                    const int esz, const int msz, const int N,
6422                    sve_ldst1_host_fn *host_fn,
6423                    sve_ldst1_tlb_fn *tlb_fn)
6424 {
6425     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6426     int bit55 = extract64(addr, 55, 1);
6427 
6428     /* Remove mtedesc from the normal sve descriptor. */
6429     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6430 
6431     /* Perform gross MTE suppression early. */
6432     if (!tbi_check(mtedesc, bit55) ||
6433         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6434         mtedesc = 0;
6435     }
6436 
6437     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6438 }
6439 
/*
 * Define the contiguous store helpers for byte memory elements (MO_8),
 * which need no endianness variants:
 *
 *   sve_st<N><NAME>_r       no MTE (mtedesc passed as 0)
 *   sve_st<N><NAME>_r_mte   MTE descriptor taken from desc
 *
 * N is the number of registers stored, NAME selects the
 * sve_st1<NAME>_host / sve_st1<NAME>_tlb element accessors and ESZ is
 * the log2 register element size.
 */
#define DO_STN_1(N, NAME, ESZ)                                          \
void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
              sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
}                                                                       \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
                  sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
}
6453 
/*
 * Define the contiguous store helpers for multi-byte memory elements.
 * For each of the little-endian (_le) and big-endian (_be) accessors a
 * plain helper (mtedesc passed as 0) and an _mte helper (routed through
 * sve_stN_r_mte()) is emitted.
 *
 * N is the number of registers stored, NAME selects the
 * sve_st1<NAME>_{le,be}_host / _tlb element accessors, ESZ is the log2
 * register element size and MSZ the log2 memory element size.
 */
#define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
}
6479 
/*
 * Instantiate the contiguous store helpers.  The first argument is the
 * register count N.  Single-register stores exist for every listed
 * (memory size, element size) pairing; the 2/3/4-register forms are
 * instantiated only where memory and element sizes are equal.
 */

/* Byte memory elements. */
DO_STN_1(1, bb, MO_8)
DO_STN_1(1, bh, MO_16)
DO_STN_1(1, bs, MO_32)
DO_STN_1(1, bd, MO_64)
DO_STN_1(2, bb, MO_8)
DO_STN_1(3, bb, MO_8)
DO_STN_1(4, bb, MO_8)

/* Halfword memory elements. */
DO_STN_2(1, hh, MO_16, MO_16)
DO_STN_2(1, hs, MO_32, MO_16)
DO_STN_2(1, hd, MO_64, MO_16)
DO_STN_2(2, hh, MO_16, MO_16)
DO_STN_2(3, hh, MO_16, MO_16)
DO_STN_2(4, hh, MO_16, MO_16)

/* Word memory elements. */
DO_STN_2(1, ss, MO_32, MO_32)
DO_STN_2(1, sd, MO_64, MO_32)
DO_STN_2(2, ss, MO_32, MO_32)
DO_STN_2(3, ss, MO_32, MO_32)
DO_STN_2(4, ss, MO_32, MO_32)

/* Doubleword memory elements. */
DO_STN_2(1, dd, MO_64, MO_64)
DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)

/* The generator macros are not needed past this point. */
#undef DO_STN_1
#undef DO_STN_2
6508 
6509 /*
6510  * Loads with a vector index.
6511  */
6512 
/*
 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
 *
 * This is the signature of the off_* accessors that follow.  The gather
 * helpers call one of them on the offset vector to obtain each element's
 * offset, which they shift left by the descriptor's scale and add to
 * the base address.
 */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6517 
6518 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6519 {
6520     return *(uint32_t *)(reg + H1_4(reg_ofs));
6521 }
6522 
6523 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6524 {
6525     return *(int32_t *)(reg + H1_4(reg_ofs));
6526 }
6527 
6528 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6529 {
6530     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6531 }
6532 
6533 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6534 {
6535     return (int32_t)*(uint64_t *)(reg + reg_ofs);
6536 }
6537 
6538 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6539 {
6540     return *(uint64_t *)(reg + reg_ofs);
6541 }
6542 
6543 static inline QEMU_ALWAYS_INLINE
6544 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6545                target_ulong base, uint32_t desc, uintptr_t retaddr,
6546                uint32_t mtedesc, int esize, int msize,
6547                zreg_off_fn *off_fn,
6548                sve_ldst1_host_fn *host_fn,
6549                sve_ldst1_tlb_fn *tlb_fn)
6550 {
6551     const int mmu_idx = arm_env_mmu_index(env);
6552     const intptr_t reg_max = simd_oprsz(desc);
6553     const int scale = simd_data(desc);
6554     ARMVectorReg scratch;
6555     intptr_t reg_off;
6556     SVEHostPage info, info2;
6557 
6558     memset(&scratch, 0, reg_max);
6559     reg_off = 0;
6560     do {
6561         uint64_t pg = vg[reg_off >> 6];
6562         do {
6563             if (likely(pg & 1)) {
6564                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6565                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6566 
6567                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6568                                mmu_idx, retaddr);
6569 
6570                 if (likely(in_page >= msize)) {
6571                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
6572                         cpu_check_watchpoint(env_cpu(env), addr, msize,
6573                                              info.attrs, BP_MEM_READ, retaddr);
6574                     }
6575                     if (mtedesc && info.tagged) {
6576                         mte_check(env, mtedesc, addr, retaddr);
6577                     }
6578                     if (unlikely(info.flags & TLB_MMIO)) {
6579                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
6580                     } else {
6581                         set_helper_retaddr(retaddr);
6582                         host_fn(&scratch, reg_off, info.host);
6583                         clear_helper_retaddr();
6584                     }
6585                 } else {
6586                     /* Element crosses the page boundary. */
6587                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6588                                    MMU_DATA_LOAD, mmu_idx, retaddr);
6589                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6590                         cpu_check_watchpoint(env_cpu(env), addr,
6591                                              msize, info.attrs,
6592                                              BP_MEM_READ, retaddr);
6593                     }
6594                     if (mtedesc && info.tagged) {
6595                         mte_check(env, mtedesc, addr, retaddr);
6596                     }
6597                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
6598                 }
6599             }
6600             reg_off += esize;
6601             pg >>= esize;
6602         } while (reg_off & 63);
6603     } while (reg_off < reg_max);
6604 
6605     /* Wait until all exceptions have been raised to write back.  */
6606     memcpy(vd, &scratch, reg_max);
6607 }
6608 
6609 static inline QEMU_ALWAYS_INLINE
6610 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6611                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6612                    int esize, int msize, zreg_off_fn *off_fn,
6613                    sve_ldst1_host_fn *host_fn,
6614                    sve_ldst1_tlb_fn *tlb_fn)
6615 {
6616     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6617     /* Remove mtedesc from the normal sve descriptor. */
6618     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6619 
6620     /*
6621      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6622      * offset base entirely over the address space hole to change the
6623      * pointer tag, or change the bit55 selector.  So we could here
6624      * examine TBI + TCMA like we do for sve_ldN_r_mte().
6625      */
6626     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6627               esize, msize, off_fn, host_fn, tlb_fn);
6628 }
6629 
/*
 * Define the gather-load helpers for 4-byte vector elements:
 *
 *   sve_ld<MEM>_<OFS>       no MTE (mtedesc passed as 0)
 *   sve_ld<MEM>_<OFS>_mte   MTE descriptor taken from desc
 *
 * MEM selects the sve_ld1<MEM>_host / sve_ld1<MEM>_tlb element accessors,
 * OFS selects the off_<OFS>_s offset accessor, and MSZ is the log2 memory
 * element size (passed on as a byte count, 1 << MSZ).
 */
#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
}                                                                            \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc)                             \
{                                                                            \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
}
6643 
/*
 * Define the gather-load helpers for 8-byte vector elements:
 *
 *   sve_ld<MEM>_<OFS>       no MTE (mtedesc passed as 0)
 *   sve_ld<MEM>_<OFS>_mte   MTE descriptor taken from desc
 *
 * MEM selects the sve_ld1<MEM>_host / sve_ld1<MEM>_tlb element accessors,
 * OFS selects the off_<OFS>_d offset accessor, and MSZ is the log2 memory
 * element size (passed on as a byte count, 1 << MSZ).
 */
#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
}                                                                            \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
    void *vm, target_ulong base, uint32_t desc)                              \
{                                                                            \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
}
6657 
/*
 * Instantiate the gather-load helpers.
 *
 * The OFS argument names the offset form, matching the off_* accessors:
 *   zsu - 32-bit offset, zero-extended
 *   zss - 32-bit offset, sign-extended
 *   zd  - 64-bit offset (8-byte elements only)
 * The _S macro produces the 4-byte-element helpers and the _D macro the
 * 8-byte-element helpers.  Groups below are ordered by memory element
 * size and, for multi-byte memory elements, by endianness.
 */
DO_LD1_ZPZ_S(bsu, zsu, MO_8)
DO_LD1_ZPZ_S(bsu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zsu, MO_8)
DO_LD1_ZPZ_D(bdu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zd, MO_8)

DO_LD1_ZPZ_S(bss, zsu, MO_8)
DO_LD1_ZPZ_S(bss, zss, MO_8)
DO_LD1_ZPZ_D(bds, zsu, MO_8)
DO_LD1_ZPZ_D(bds, zss, MO_8)
DO_LD1_ZPZ_D(bds, zd, MO_8)

DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zd, MO_16)

DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zd, MO_16)

DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
DO_LD1_ZPZ_S(hss_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
DO_LD1_ZPZ_D(hds_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zd, MO_16)

DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
DO_LD1_ZPZ_S(hss_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
DO_LD1_ZPZ_D(hds_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zd, MO_16)

DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
DO_LD1_ZPZ_S(ss_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zd, MO_32)

DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
DO_LD1_ZPZ_S(ss_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zd, MO_32)

DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
DO_LD1_ZPZ_D(sds_le, zss, MO_32)
DO_LD1_ZPZ_D(sds_le, zd, MO_32)

DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
DO_LD1_ZPZ_D(sds_be, zss, MO_32)
DO_LD1_ZPZ_D(sds_be, zd, MO_32)

DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
DO_LD1_ZPZ_D(dd_le, zss, MO_64)
DO_LD1_ZPZ_D(dd_le, zd, MO_64)

DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)

/* The generator macros are not needed past this point. */
#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
6724 
6725 /* First fault loads with a vector index.  */
6726 
6727 /*
6728  * Common helpers for all gather first-faulting loads.
6729  */
6730 
6731 static inline QEMU_ALWAYS_INLINE
6732 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6733                  target_ulong base, uint32_t desc, uintptr_t retaddr,
6734                  uint32_t mtedesc, const int esz, const int msz,
6735                  zreg_off_fn *off_fn,
6736                  sve_ldst1_host_fn *host_fn,
6737                  sve_ldst1_tlb_fn *tlb_fn)
6738 {
6739     const int mmu_idx = arm_env_mmu_index(env);
6740     const intptr_t reg_max = simd_oprsz(desc);
6741     const int scale = simd_data(desc);
6742     const int esize = 1 << esz;
6743     const int msize = 1 << msz;
6744     intptr_t reg_off;
6745     SVEHostPage info;
6746     target_ulong addr, in_page;
6747     ARMVectorReg scratch;
6748 
6749     /* Skip to the first true predicate.  */
6750     reg_off = find_next_active(vg, 0, reg_max, esz);
6751     if (unlikely(reg_off >= reg_max)) {
6752         /* The entire predicate was false; no load occurs.  */
6753         memset(vd, 0, reg_max);
6754         return;
6755     }
6756 
6757     /* Protect against overlap between vd and vm. */
6758     if (unlikely(vd == vm)) {
6759         vm = memcpy(&scratch, vm, reg_max);
6760     }
6761 
6762     /*
6763      * Probe the first element, allowing faults.
6764      */
6765     addr = base + (off_fn(vm, reg_off) << scale);
6766     if (mtedesc) {
6767         mte_check(env, mtedesc, addr, retaddr);
6768     }
6769     tlb_fn(env, vd, reg_off, addr, retaddr);
6770 
6771     /* After any fault, zero the other elements. */
6772     swap_memzero(vd, reg_off);
6773     reg_off += esize;
6774     swap_memzero(vd + reg_off, reg_max - reg_off);
6775 
6776     /*
6777      * Probe the remaining elements, not allowing faults.
6778      */
6779     while (reg_off < reg_max) {
6780         uint64_t pg = vg[reg_off >> 6];
6781         do {
6782             if (likely((pg >> (reg_off & 63)) & 1)) {
6783                 addr = base + (off_fn(vm, reg_off) << scale);
6784                 in_page = -(addr | TARGET_PAGE_MASK);
6785 
6786                 if (unlikely(in_page < msize)) {
6787                     /* Stop if the element crosses a page boundary. */
6788                     goto fault;
6789                 }
6790 
6791                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6792                                mmu_idx, retaddr);
6793                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6794                     goto fault;
6795                 }
6796                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6797                     (cpu_watchpoint_address_matches
6798                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6799                     goto fault;
6800                 }
6801                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
6802                     goto fault;
6803                 }
6804 
6805                 set_helper_retaddr(retaddr);
6806                 host_fn(vd, reg_off, info.host);
6807                 clear_helper_retaddr();
6808             }
6809             reg_off += esize;
6810         } while (reg_off & 63);
6811     }
6812     return;
6813 
6814  fault:
6815     record_fault(env, reg_off, reg_max);
6816 }
6817 
6818 static inline QEMU_ALWAYS_INLINE
6819 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6820                      target_ulong base, uint32_t desc, uintptr_t retaddr,
6821                      const int esz, const int msz,
6822                      zreg_off_fn *off_fn,
6823                      sve_ldst1_host_fn *host_fn,
6824                      sve_ldst1_tlb_fn *tlb_fn)
6825 {
6826     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6827     /* Remove mtedesc from the normal sve descriptor. */
6828     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6829 
6830     /*
6831      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6832      * offset base entirely over the address space hole to change the
6833      * pointer tag, or change the bit55 selector.  So we could here
6834      * examine TBI + TCMA like we do for sve_ldN_r_mte().
6835      */
6836     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6837                 esz, msz, off_fn, host_fn, tlb_fn);
6838 }
6839 
/*
 * Define the gather first-fault load helpers for 32-bit vector elements
 * (esz = MO_32):
 *
 *   sve_ldff<MEM>_<OFS>       no MTE (mtedesc passed as 0)
 *   sve_ldff<MEM>_<OFS>_mte   MTE descriptor taken from desc
 *
 * MEM selects the sve_ld1<MEM>_host / sve_ld1<MEM>_tlb element accessors,
 * OFS selects the off_<OFS>_s offset accessor, and MSZ is the log2 memory
 * element size.
 */
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
6855 
/*
 * Define a pair of helpers, HELPER(sve_ldff<MEM>_<OFS>) and its "_mte"
 * twin, for vectors of 64-bit elements (esz is MO_64).
 *
 *   MEM  selects the accessor pair sve_ld1<MEM>_host / sve_ld1<MEM>_tlb.
 *   OFS  selects the offset extractor off_<OFS>_d.
 *   MSZ  is the memory access size, passed through as the msz argument.
 *
 * The plain helper calls sve_ldff1_z() with mtedesc == 0; the "_mte"
 * helper goes through sve_ldff1_z_mte(), which takes the MTE descriptor
 * from the upper bits of desc.
 */
#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
6871 
6872 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6873 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6874 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6875 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6876 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6877 
6878 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6879 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6880 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6881 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6882 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6883 
6884 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6885 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6886 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6887 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6888 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6889 
6890 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6891 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6892 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6893 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6894 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6895 
6896 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6897 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6898 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6899 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6900 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6901 
6902 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6903 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6904 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6905 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6906 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6907 
6908 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
6909 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
6910 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6911 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6912 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6913 
6914 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
6915 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
6916 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6917 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6918 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6919 
6920 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6921 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6922 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6923 
6924 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6925 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6926 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6927 
6928 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6929 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6930 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6931 
6932 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6933 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6934 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6935 
6936 /* Stores with a vector index.  */
6937 
/*
 * Common implementation of the vector-indexed stores.
 *
 * The work is done in two passes: every enabled element is probed first
 * (raising any page fault, watchpoint hit or MTE check failure before
 * memory is modified), and only then are the stores performed.
 *
 * @vd:       source vector register, handed to @host_fn / @tlb_fn
 * @vg:       governing predicate, read as 64-bit words; the bit at index
 *            reg_off enables the element at byte offset reg_off
 * @vm:       vector of offsets, decoded by @off_fn
 * @base:     base address added to each scaled offset
 * @desc:     descriptor; simd_oprsz() gives the vector length in bytes and
 *            simd_data() the left shift applied to each offset
 * @retaddr:  host return address used for unwinding on a fault
 * @mtedesc:  MTE descriptor; zero disables the MTE check
 * @esize:    step, in bytes, between consecutive vector elements
 * @msize:    size, in bytes, of each memory access
 * @off_fn:   extracts the offset for the element at a given reg_off
 * @host_fn:  stores one element directly through a host pointer
 * @tlb_fn:   stores one element through the slow path, by guest address
 */
static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = arm_env_mmu_index(env);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    /* One slot per element visited; indexed by i, not by reg_off. */
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Probe all of the elements for host addresses and flags.
     */
    i = reg_off = 0;
    do {
        /* One predicate word governs 64 bytes of the vector. */
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            /*
             * Bytes from addr up to the end of its page (assuming
             * TARGET_PAGE_MASK has the usual ~(page_size - 1) form).
             */
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            /* NULL means "inactive, or must use the slow path". */
            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    /* The whole access lies within a single page. */
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    /* Only non-MMIO pages may be written via host pointer. */
                    if (!(info.flags & TLB_MMIO)) {
                        host[i] = info.host;
                    }
                } else {
                    /*
                     * Element crosses the page boundary.
                     * Probe both pages, but do not record the host address,
                     * so that we use the slow path.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    /* Merge flags so the checks below see both pages. */
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                /* Note: info.tagged reflects the first page only. */
                if (mtedesc && info.tagged) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Now that we have recognized all exceptions except SyncExternal
     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
     *
     * Note for the common case of an element in RAM, not crossing a page
     * boundary, we have stored the host address in host[].  This doubles
     * as a first-level check against the predicate, since only enabled
     * elements have non-null host addresses.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            /* Fast path: write straight to host memory. */
            set_helper_retaddr(retaddr);
            host_fn(vd, reg_off, h);
            clear_helper_retaddr();
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            /* Enabled but no host pointer: MMIO or page-crossing access. */
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
7022 
7023 static inline QEMU_ALWAYS_INLINE
7024 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7025                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7026                    int esize, int msize, zreg_off_fn *off_fn,
7027                    sve_ldst1_host_fn *host_fn,
7028                    sve_ldst1_tlb_fn *tlb_fn)
7029 {
7030     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7031     /* Remove mtedesc from the normal sve descriptor. */
7032     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7033 
7034     /*
7035      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7036      * offset base entirely over the address space hole to change the
7037      * pointer tag, or change the bit55 selector.  So we could here
7038      * examine TBI + TCMA like we do for sve_ldN_r_mte().
7039      */
7040     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7041               esize, msize, off_fn, host_fn, tlb_fn);
7042 }
7043 
/*
 * Define a pair of helpers, HELPER(sve_st<MEM>_<OFS>) and its "_mte" twin,
 * for vectors with a 4-byte element stride (esize == 4).
 *
 *   MEM  selects the accessor pair sve_st1<MEM>_host / sve_st1<MEM>_tlb.
 *   OFS  selects the offset extractor off_<OFS>_s.
 *   MSZ  is the log2 memory access size; 1 << MSZ bytes is passed as msize.
 *
 * The plain helper calls sve_st1_z() with mtedesc == 0; the "_mte" helper
 * goes through sve_st1_z_mte(), which takes the MTE descriptor from the
 * upper bits of desc.
 */
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
    void *vm, target_ulong base, uint32_t desc)                         \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
7057 
/*
 * Define a pair of helpers, HELPER(sve_st<MEM>_<OFS>) and its "_mte" twin,
 * for vectors with an 8-byte element stride (esize == 8).
 *
 *   MEM  selects the accessor pair sve_st1<MEM>_host / sve_st1<MEM>_tlb.
 *   OFS  selects the offset extractor off_<OFS>_d.
 *   MSZ  is the log2 memory access size; 1 << MSZ bytes is passed as msize.
 *
 * The plain helper calls sve_st1_z() with mtedesc == 0; the "_mte" helper
 * goes through sve_st1_z_mte(), which takes the MTE descriptor from the
 * upper bits of desc.
 */
#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
    void *vm, target_ulong base, uint32_t desc)                         \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
7071 
/*
 * Instantiate the sve_st* helpers.  Each line yields a plain helper and an
 * "_mte" helper.  The arguments are: the sve_st1<MEM>_host/_tlb accessor
 * suffix, the off_<OFS>_s/_d offset extractor suffix, and the log2 memory
 * access size.
 */

/* 4-byte element stride, off_zsu_s offsets. */
DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

/* 4-byte element stride, off_zss_s offsets. */
DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

/* 8-byte element stride, off_zsu_d offsets. */
DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

/* 8-byte element stride, off_zss_d offsets. */
DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

/* 8-byte element stride, off_zd_d offsets. */
DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

/* The generator macros are not needed past this point. */
#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
7110 
7111 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7112 {
7113     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7114     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7115 
7116     for (i = 0; i < opr_sz; ++i) {
7117         d[i] = n[i] ^ m[i] ^ k[i];
7118     }
7119 }
7120 
7121 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7122 {
7123     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7124     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7125 
7126     for (i = 0; i < opr_sz; ++i) {
7127         d[i] = n[i] ^ (m[i] & ~k[i]);
7128     }
7129 }
7130 
7131 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7132 {
7133     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7134     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7135 
7136     for (i = 0; i < opr_sz; ++i) {
7137         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7138     }
7139 }
7140 
7141 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7142 {
7143     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7144     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7145 
7146     for (i = 0; i < opr_sz; ++i) {
7147         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7148     }
7149 }
7150 
7151 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7152 {
7153     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7154     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7155 
7156     for (i = 0; i < opr_sz; ++i) {
7157         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7158     }
7159 }
7160 
/*
 * Test whether the lowest element of @n (uint8_t for MO_8, uint16_t for
 * MO_16) occurs anywhere in @m0 or @m1.
 *
 * The element is replicated across a word and XORed with each of m0/m1,
 * so a matching lane becomes zero; zero lanes are then detected with the
 * hasless(v,1) trick from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    const int elt_bits = 8 << esz;
    /* A 1 in the least significant bit of every element ... */
    const uint64_t lsb_each = dup_const(esz, 1);
    /* ... and in the most significant bit of every element. */
    const uint64_t msb_each = lsb_each << (elt_bits - 1);
    const uint64_t needle = dup_const(esz, n);
    const uint64_t diff0 = needle ^ m0;
    const uint64_t diff1 = needle ^ m1;
    const uint64_t zero0 = (diff0 - lsb_each) & ~diff0;
    const uint64_t zero1 = (diff1 - lsb_each) & ~diff1;

    return ((zero0 | zero1) & msb_each) != 0;
}
7180 
7181 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7182                                 uint32_t desc, int esz, bool nmatch)
7183 {
7184     uint16_t esz_mask = pred_esz_masks[esz];
7185     intptr_t opr_sz = simd_oprsz(desc);
7186     uint32_t flags = PREDTEST_INIT;
7187     intptr_t i, j, k;
7188 
7189     for (i = 0; i < opr_sz; i += 16) {
7190         uint64_t m0 = *(uint64_t *)(vm + i);
7191         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7192         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7193         uint16_t out = 0;
7194 
7195         for (j = 0; j < 16; j += 8) {
7196             uint64_t n = *(uint64_t *)(vn + i + j);
7197 
7198             for (k = 0; k < 8; k += 1 << esz) {
7199                 if (pg & (1 << (j + k))) {
7200                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
7201                     out |= (o ^ nmatch) << (j + k);
7202                 }
7203             }
7204         }
7205         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7206         flags = iter_predtest_fwd(out, pg, flags);
7207     }
7208     return flags;
7209 }
7210 
/*
 * Define HELPER(NAME) as a thin wrapper around do_match().
 *
 *   ESZ  is the element size (MO_8 or MO_16) passed as esz.
 *   INV  is passed as nmatch: false keeps the match result, true inverts it.
 *
 * The helper returns the flags value computed by do_match().
 */
#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                             \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
}

/* Non-inverted variants. */
DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

/* Inverted variants. */
DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH
7224 
7225 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7226                             uint32_t desc)
7227 {
7228     ARMVectorReg scratch;
7229     intptr_t i, j;
7230     intptr_t opr_sz = simd_oprsz(desc);
7231     uint32_t *d = vd, *n = vn, *m = vm;
7232     uint8_t *pg = vg;
7233 
7234     if (d == n) {
7235         n = memcpy(&scratch, n, opr_sz);
7236         if (d == m) {
7237             m = n;
7238         }
7239     } else if (d == m) {
7240         m = memcpy(&scratch, m, opr_sz);
7241     }
7242 
7243     for (i = 0; i < opr_sz; i += 4) {
7244         uint64_t count = 0;
7245         uint8_t pred;
7246 
7247         pred = pg[H1(i >> 3)] >> (i & 7);
7248         if (pred & 1) {
7249             uint32_t nn = n[H4(i >> 2)];
7250 
7251             for (j = 0; j <= i; j += 4) {
7252                 pred = pg[H1(j >> 3)] >> (j & 7);
7253                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7254                     ++count;
7255                 }
7256             }
7257         }
7258         d[H4(i >> 2)] = count;
7259     }
7260 }
7261 
7262 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7263                             uint32_t desc)
7264 {
7265     ARMVectorReg scratch;
7266     intptr_t i, j;
7267     intptr_t opr_sz = simd_oprsz(desc);
7268     uint64_t *d = vd, *n = vn, *m = vm;
7269     uint8_t *pg = vg;
7270 
7271     if (d == n) {
7272         n = memcpy(&scratch, n, opr_sz);
7273         if (d == m) {
7274             m = n;
7275         }
7276     } else if (d == m) {
7277         m = memcpy(&scratch, m, opr_sz);
7278     }
7279 
7280     for (i = 0; i < opr_sz / 8; ++i) {
7281         uint64_t count = 0;
7282         if (pg[H1(i)] & 1) {
7283             uint64_t nn = n[i];
7284             for (j = 0; j <= i; ++j) {
7285                 if ((pg[H1(j)] & 1) && nn == m[j]) {
7286                     ++count;
7287                 }
7288             }
7289         }
7290         d[i] = count;
7291     }
7292 }
7293 
7294 /*
7295  * Returns the number of bytes in m0 and m1 that match n.
7296  * Unlike do_match2 we don't just need true/false, we need an exact count.
7297  * This requires two extra logical operations.
7298  */
7299 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7300 {
7301     const uint64_t mask = dup_const(MO_8, 0x7f);
7302     uint64_t cmp0, cmp1;
7303 
7304     cmp1 = dup_const(MO_8, n);
7305     cmp0 = cmp1 ^ m0;
7306     cmp1 = cmp1 ^ m1;
7307 
7308     /*
7309      * 1: clear msb of each byte to avoid carry to next byte (& mask)
7310      * 2: carry in to msb if byte != 0 (+ mask)
7311      * 3: set msb if cmp has msb set (| cmp)
7312      * 4: set ~msb to ignore them (| mask)
7313      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7314      * 5: invert, resulting in 0x80 if and only if byte == 0.
7315      */
7316     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7317     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7318 
7319     /*
7320      * Combine the two compares in a way that the bits do
7321      * not overlap, and so preserves the count of set bits.
7322      * If the host has an efficient instruction for ctpop,
7323      * then ctpop(x) + ctpop(y) has the same number of
7324      * operations as ctpop(x | (y >> 1)).  If the host does
7325      * not have an efficient ctpop, then we only want to
7326      * use it once.
7327      */
7328     return ctpop64(cmp0 | (cmp1 >> 1));
7329 }
7330 
7331 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7332 {
7333     intptr_t i, j;
7334     intptr_t opr_sz = simd_oprsz(desc);
7335 
7336     for (i = 0; i < opr_sz; i += 16) {
7337         uint64_t n0 = *(uint64_t *)(vn + i);
7338         uint64_t m0 = *(uint64_t *)(vm + i);
7339         uint64_t n1 = *(uint64_t *)(vn + i + 8);
7340         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7341         uint64_t out0 = 0;
7342         uint64_t out1 = 0;
7343 
7344         for (j = 0; j < 64; j += 8) {
7345             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7346             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7347             out0 |= cnt0 << j;
7348             out1 |= cnt1 << j;
7349         }
7350 
7351         *(uint64_t *)(vd + i) = out0;
7352         *(uint64_t *)(vd + i + 8) = out1;
7353     }
7354 }
7355 
7356 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7357 {
7358     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7359     int shr = simd_data(desc);
7360     int shl = 8 - shr;
7361     uint64_t mask = dup_const(MO_8, 0xff >> shr);
7362     uint64_t *d = vd, *n = vn, *m = vm;
7363 
7364     for (i = 0; i < opr_sz; ++i) {
7365         uint64_t t = n[i] ^ m[i];
7366         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7367     }
7368 }
7369 
7370 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7371 {
7372     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7373     int shr = simd_data(desc);
7374     int shl = 16 - shr;
7375     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7376     uint64_t *d = vd, *n = vn, *m = vm;
7377 
7378     for (i = 0; i < opr_sz; ++i) {
7379         uint64_t t = n[i] ^ m[i];
7380         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7381     }
7382 }
7383 
7384 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7385 {
7386     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7387     int shr = simd_data(desc);
7388     uint32_t *d = vd, *n = vn, *m = vm;
7389 
7390     for (i = 0; i < opr_sz; ++i) {
7391         d[i] = ror32(n[i] ^ m[i], shr);
7392     }
7393 }
7394 
7395 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7396                      void *status, uint32_t desc)
7397 {
7398     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7399 
7400     for (s = 0; s < opr_sz; ++s) {
7401         float32 *n = vn + s * sizeof(float32) * 4;
7402         float32 *m = vm + s * sizeof(float32) * 4;
7403         float32 *a = va + s * sizeof(float32) * 4;
7404         float32 *d = vd + s * sizeof(float32) * 4;
7405         float32 n00 = n[H4(0)], n01 = n[H4(1)];
7406         float32 n10 = n[H4(2)], n11 = n[H4(3)];
7407         float32 m00 = m[H4(0)], m01 = m[H4(1)];
7408         float32 m10 = m[H4(2)], m11 = m[H4(3)];
7409         float32 p0, p1;
7410 
7411         /* i = 0, j = 0 */
7412         p0 = float32_mul(n00, m00, status);
7413         p1 = float32_mul(n01, m01, status);
7414         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7415 
7416         /* i = 0, j = 1 */
7417         p0 = float32_mul(n00, m10, status);
7418         p1 = float32_mul(n01, m11, status);
7419         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7420 
7421         /* i = 1, j = 0 */
7422         p0 = float32_mul(n10, m00, status);
7423         p1 = float32_mul(n11, m01, status);
7424         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7425 
7426         /* i = 1, j = 1 */
7427         p0 = float32_mul(n10, m10, status);
7428         p1 = float32_mul(n11, m11, status);
7429         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7430     }
7431 }
7432 
7433 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7434                      void *status, uint32_t desc)
7435 {
7436     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7437 
7438     for (s = 0; s < opr_sz; ++s) {
7439         float64 *n = vn + s * sizeof(float64) * 4;
7440         float64 *m = vm + s * sizeof(float64) * 4;
7441         float64 *a = va + s * sizeof(float64) * 4;
7442         float64 *d = vd + s * sizeof(float64) * 4;
7443         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7444         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7445         float64 p0, p1;
7446 
7447         /* i = 0, j = 0 */
7448         p0 = float64_mul(n00, m00, status);
7449         p1 = float64_mul(n01, m01, status);
7450         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7451 
7452         /* i = 0, j = 1 */
7453         p0 = float64_mul(n00, m10, status);
7454         p1 = float64_mul(n01, m11, status);
7455         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7456 
7457         /* i = 1, j = 0 */
7458         p0 = float64_mul(n10, m00, status);
7459         p1 = float64_mul(n11, m01, status);
7460         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7461 
7462         /* i = 1, j = 1 */
7463         p0 = float64_mul(n10, m10, status);
7464         p1 = float64_mul(n11, m11, status);
7465         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7466     }
7467 }
7468 
/*
 * Define HELPER(NAME): a predicated narrowing conversion that writes only
 * the upper half of each wide destination slot.
 *
 *   TYPEW / HW  wide source element type and its host-endian offset fixup
 *   TYPEN / HN  narrow result element type and its host-endian offset fixup
 *   OP          conversion function, called as OP(value, status)
 *
 * The vector is walked from the highest byte offset down to 0 in steps of
 * sizeof(TYPEW), reloading the 64-bit predicate word once per 64 bytes.
 * For each element whose predicate bit (the bit for byte offset i) is set,
 * the wide value at offset i of vn is converted and stored at offset
 * i + sizeof(TYPEN) of vd.  All other bytes of vd are left untouched.
 */
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

/* 32-bit to 16-bit conversions, then 64-bit to 32-bit. */
DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7489 
/*
 * Define HELPER(NAME): a predicated widening conversion that reads the
 * upper half of each wide source slot.
 *
 *   TYPEW / HW  wide result element type and its host-endian offset fixup
 *   TYPEN / HN  narrow source element type and its host-endian offset fixup
 *   OP          conversion function, called as OP(value, status)
 *
 * The vector is walked from the highest byte offset down to 0 in steps of
 * sizeof(TYPEW), reloading the 64-bit predicate word once per 64 bytes.
 * For each element whose predicate bit (the bit for byte offset i) is set,
 * the narrow value at offset i + sizeof(TYPEN) of vn is converted and
 * stored as a wide value at offset i of vd.  Elements whose predicate bit
 * is clear are left untouched.
 */
#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

/* 16-bit to 32-bit conversion, then 32-bit to 64-bit. */
DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

/* Both conversion generators are finished with. */
#undef DO_FCVTLT
#undef DO_FCVTNT
7512