xref: /openbmc/qemu/target/arm/tcg/sve_helper.c (revision a820a85ac83180ed597a01d05450b6f656a8c206)
1  /*
2   * ARM SVE Operations
3   *
4   * Copyright (c) 2018 Linaro, Ltd.
5   *
6   * This library is free software; you can redistribute it and/or
7   * modify it under the terms of the GNU Lesser General Public
8   * License as published by the Free Software Foundation; either
9   * version 2.1 of the License, or (at your option) any later version.
10   *
11   * This library is distributed in the hope that it will be useful,
12   * but WITHOUT ANY WARRANTY; without even the implied warranty of
13   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   * Lesser General Public License for more details.
15   *
16   * You should have received a copy of the GNU Lesser General Public
17   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18   */
19  
20  #include "qemu/osdep.h"
21  #include "cpu.h"
22  #include "internals.h"
23  #include "exec/exec-all.h"
24  #include "exec/page-protection.h"
25  #include "exec/helper-proto.h"
26  #include "tcg/tcg-gvec-desc.h"
27  #include "fpu/softfloat.h"
28  #include "tcg/tcg.h"
29  #include "vec_internal.h"
30  #include "sve_ldst_internal.h"
31  #include "hw/core/tcg-cpu-ops.h"
32  
33  
34  /* Return a value for NZCV as per the ARM PredTest pseudofunction.
35   *
36   * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
37   * and bit 0 set if C is set.  Compare the definitions of these variables
38   * within CPUARMState.
39   */
40  
41  /* For no G bits set, NZCV = C.  */
42  #define PREDTEST_INIT  1
43  
44  /* This is an iterative function, called for each Pd and Pg word
45   * moving forward.
46   */
47  static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
48  {
49      if (likely(g)) {
50          /* Compute N from first D & G.
51             Use bit 2 to signal first G bit seen.  */
52          if (!(flags & 4)) {
53              flags |= ((d & (g & -g)) != 0) << 31;
54              flags |= 4;
55          }
56  
57          /* Accumulate Z from each D & G.  */
58          flags |= ((d & g) != 0) << 1;
59  
60          /* Compute C from last !(D & G).  Replace previous.  */
61          flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
62      }
63      return flags;
64  }
65  
66  /* This is an iterative function, called for each Pd and Pg word
67   * moving backward.
68   */
69  static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
70  {
71      if (likely(g)) {
72          /* Compute C from first (i.e. last) !(D & G).
73             Use bit 2 to signal first G bit seen.  */
74          if (!(flags & 4)) {
75              flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
76              flags |= (d & pow2floor(g)) == 0;
77          }
78  
79          /* Accumulate Z from each D & G.  */
80          flags |= ((d & g) != 0) << 1;
81  
82          /* Compute N from last (i.e. first) D & G.  Replace previous.  */
83          flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
84      }
85      return flags;
86  }
87  
88  /* The same for a single word predicate.  */
89  uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
90  {
91      return iter_predtest_fwd(d, g, PREDTEST_INIT);
92  }
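/*
 * Worked example (a sketch, not part of the original helpers): with
 * G = 0b0111 and D = 0b0101, the active elements of D, reading from
 * bit 0 upward, are {1, 0, 1}.  PredTest then gives N=1 (first active
 * element set), Z clear (not every active element is zero) and C=0
 * (last active element set), so bits 31 and 1 of the returned value
 * are set and bit 0 is clear.  Bit 2 may also be set, but it is only
 * the internal "first G bit seen" marker and is not part of the NZCV
 * encoding described above.
 */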
93  
94  /* The same for a multi-word predicate.  */
95  uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
96  {
97      uint32_t flags = PREDTEST_INIT;
98      uint64_t *d = vd, *g = vg;
99      uintptr_t i = 0;
100  
101      do {
102          flags = iter_predtest_fwd(d[i], g[i], flags);
103      } while (++i < words);
104  
105      return flags;
106  }
107  
108  /* Similarly for single word elements.  */
109  static inline uint64_t expand_pred_s(uint8_t byte)
110  {
111      static const uint64_t word[] = {
112          [0x01] = 0x00000000ffffffffull,
113          [0x10] = 0xffffffff00000000ull,
114          [0x11] = 0xffffffffffffffffull,
115      };
116      return word[byte & 0x11];
117  }
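/*
 * For .S elements only bits 0 and 4 of a predicate byte can be active
 * (one predicate bit per byte of data, four bytes per element), hence
 * the mask with 0x11 and the sparse three-entry table: e.g. a predicate
 * byte of 0x10 expands to a mask covering only the second 32-bit
 * element of the pair.
 */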
118  
119  #define LOGICAL_PPPP(NAME, FUNC) \
120  void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
121  {                                                                         \
122      uintptr_t opr_sz = simd_oprsz(desc);                                  \
123      uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
124      uintptr_t i;                                                          \
125      for (i = 0; i < opr_sz / 8; ++i) {                                    \
126          d[i] = FUNC(n[i], m[i], g[i]);                                    \
127      }                                                                     \
128  }
129  
130  #define DO_AND(N, M, G)  (((N) & (M)) & (G))
131  #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
132  #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
133  #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
134  #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
135  #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
136  #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
137  #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
138  
139  LOGICAL_PPPP(sve_and_pppp, DO_AND)
140  LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
141  LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
142  LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
143  LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
144  LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
145  LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
146  LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
147  
148  #undef DO_AND
149  #undef DO_BIC
150  #undef DO_EOR
151  #undef DO_ORR
152  #undef DO_ORN
153  #undef DO_NOR
154  #undef DO_NAND
155  #undef DO_SEL
156  #undef LOGICAL_PPPP
157  
158  /* Fully general three-operand expander, controlled by a predicate.
159   * This is complicated by the host-endian storage of the register file.
160   */
161  /* ??? I don't expect the compiler could ever vectorize this itself.
162   * With some tables we can convert bit masks to byte masks, and with
163   * extra care wrt byte/word ordering we could use gcc generic vectors
164   * and do 16 bytes at a time.
165   */
166  #define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
167  void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
168  {                                                                       \
169      intptr_t i, opr_sz = simd_oprsz(desc);                              \
170      for (i = 0; i < opr_sz; ) {                                         \
171          uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
172          do {                                                            \
173              if (pg & 1) {                                               \
174                  TYPE nn = *(TYPE *)(vn + H(i));                         \
175                  TYPE mm = *(TYPE *)(vm + H(i));                         \
176                  *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
177              }                                                           \
178              i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
179          } while (i & 15);                                               \
180      }                                                                   \
181  }
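/*
 * As a sketch (not literal preprocessor output),
 * DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) expands to roughly:
 *
 *   void HELPER(sve_add_zpzz_b)(void *vd, void *vn, void *vm,
 *                               void *vg, uint32_t desc)
 *   {
 *       intptr_t i, opr_sz = simd_oprsz(desc);
 *       for (i = 0; i < opr_sz; ) {
 *           uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
 *           do {
 *               if (pg & 1) {
 *                   uint8_t nn = *(uint8_t *)(vn + H1(i));
 *                   uint8_t mm = *(uint8_t *)(vm + H1(i));
 *                   *(uint8_t *)(vd + H1(i)) = nn + mm;
 *               }
 *               i += 1, pg >>= 1;
 *           } while (i & 15);
 *       }
 *   }
 *
 * Each 16-bit chunk of the governing predicate covers 16 bytes of
 * data.  Predicate bits are allocated one per byte of the vector, so
 * "pg >>= sizeof(TYPE)" steps to the bit that governs the next
 * element, and only bit 0 of the remaining value is tested.  Inactive
 * elements are skipped, which gives the merging behaviour of the
 * predicated forms.
 */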
182  
183  /* Similarly, specialized for 64-bit operands.  */
184  #define DO_ZPZZ_D(NAME, TYPE, OP)                                \
185  void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
186  {                                                               \
187      intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
188      TYPE *d = vd, *n = vn, *m = vm;                             \
189      uint8_t *pg = vg;                                           \
190      for (i = 0; i < opr_sz; i += 1) {                           \
191          if (pg[H1(i)] & 1) {                                    \
192              TYPE nn = n[i], mm = m[i];                          \
193              d[i] = OP(nn, mm);                                  \
194          }                                                       \
195      }                                                           \
196  }
197  
198  #define DO_AND(N, M)  (N & M)
199  #define DO_EOR(N, M)  (N ^ M)
200  #define DO_ORR(N, M)  (N | M)
201  #define DO_BIC(N, M)  (N & ~M)
202  #define DO_ADD(N, M)  (N + M)
203  #define DO_SUB(N, M)  (N - M)
204  #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
205  #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
206  #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
207  #define DO_MUL(N, M)  (N * M)
208  
209  
210  /*
211   * We must avoid the C undefined behaviour cases: division by
212   * zero and signed division of INT_MIN by -1. Both of these
213   * have architecturally defined required results for Arm.
214   * We special case all signed divisions by -1 to avoid having
215   * to deduce the minimum integer for the type involved.
216   */
217  #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
218  #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
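/*
 * E.g. for 32-bit SDIV, N = INT32_MIN with M = -1 takes the M == -1
 * path and returns -N, which wraps back to INT32_MIN (QEMU is built
 * with -fwrapv, so the negation is well defined), and M == 0 returns
 * 0; both match the architected Arm results.
 */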
219  
220  DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
221  DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
222  DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
223  DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
224  
225  DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
226  DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
227  DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
228  DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
229  
230  DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
231  DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
232  DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
233  DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
234  
235  DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
236  DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
237  DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
238  DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
239  
240  DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
241  DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
242  DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
243  DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
244  
245  DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
246  DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
247  DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
248  DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
249  
250  DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
251  DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
252  DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
253  DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
254  
255  DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
256  DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
257  DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
258  DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
259  
260  DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
261  DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
262  DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
263  DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
264  
265  DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
266  DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
267  DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
268  DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
269  
270  DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
271  DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
272  DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
273  DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
274  
275  DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
276  DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
277  DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
278  DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
279  
280  /* Because the computation type is at least twice as large as required,
281     these work for both signed and unsigned source types.  */
282  static inline uint8_t do_mulh_b(int32_t n, int32_t m)
283  {
284      return (n * m) >> 8;
285  }
286  
287  static inline uint16_t do_mulh_h(int32_t n, int32_t m)
288  {
289      return (n * m) >> 16;
290  }
291  
292  static inline uint32_t do_mulh_s(int64_t n, int64_t m)
293  {
294      return (n * m) >> 32;
295  }
296  
297  static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
298  {
299      uint64_t lo, hi;
300      muls64(&lo, &hi, n, m);
301      return hi;
302  }
303  
304  static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
305  {
306      uint64_t lo, hi;
307      mulu64(&lo, &hi, n, m);
308      return hi;
309  }
310  
311  DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
312  DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
313  DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
314  DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
315  
316  DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
317  DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
318  DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
319  DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
320  
321  DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
322  DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
323  DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
324  DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
325  
326  DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
327  DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
328  
329  DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
330  DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
331  
332  /* Note that all bits of the shift are significant
333     and are not taken modulo the element size.  */
334  #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
335  #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
336  #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
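/*
 * E.g. an ASR of an int8_t by 200 is clamped to a shift by 7 and so
 * yields 0 or -1 depending on the sign bit, while LSR/LSL by any
 * amount >= the element width yield 0; the clamping also keeps the C
 * shift itself within its defined range.
 */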
337  
338  DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
339  DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
340  DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
341  
342  DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
343  DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
344  DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
345  
346  DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
347  DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
348  DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
349  
350  DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
351  DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
352  DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
353  
354  static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
355  {
356      int8_t n1 = n, n2 = n >> 8;
357      return m + n1 + n2;
358  }
359  
360  static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
361  {
362      int16_t n1 = n, n2 = n >> 16;
363      return m + n1 + n2;
364  }
365  
366  static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
367  {
368      int32_t n1 = n, n2 = n >> 32;
369      return m + n1 + n2;
370  }
371  
372  DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
373  DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
374  DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
375  
376  static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
377  {
378      uint8_t n1 = n, n2 = n >> 8;
379      return m + n1 + n2;
380  }
381  
382  static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
383  {
384      uint16_t n1 = n, n2 = n >> 16;
385      return m + n1 + n2;
386  }
387  
388  static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
389  {
390      uint32_t n1 = n, n2 = n >> 32;
391      return m + n1 + n2;
392  }
393  
394  DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
395  DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
396  DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
397  
398  #define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
399  #define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
400  #define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
401  #define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)
402  
403  DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
404  DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
405  DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
406  DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
407  
408  #define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
409  #define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
410  #define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
411  #define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)
412  
413  DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
414  DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
415  DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
416  DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
417  
418  /*
419   * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
420   * We pass in a pointer to a dummy saturation field to trigger
421   * the saturating arithmetic but discard the information about
422   * whether it has occurred.
423   */
424  #define do_sqshl_b(n, m) \
425     ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
426  #define do_sqshl_h(n, m) \
427     ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
428  #define do_sqshl_s(n, m) \
429     ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
430  #define do_sqshl_d(n, m) \
431     ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
432  
433  DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
434  DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
435  DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
436  DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
437  
438  #define do_uqshl_b(n, m) \
439     ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
440  #define do_uqshl_h(n, m) \
441     ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
442  #define do_uqshl_s(n, m) \
443     ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
444  #define do_uqshl_d(n, m) \
445     ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
446  
447  DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
448  DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
449  DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
450  DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
451  
452  #define do_sqrshl_b(n, m) \
453     ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
454  #define do_sqrshl_h(n, m) \
455     ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
456  #define do_sqrshl_s(n, m) \
457     ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
458  #define do_sqrshl_d(n, m) \
459     ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
460  
461  DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
462  DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
463  DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
464  DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
465  
466  #undef do_sqrshl_d
467  
468  #define do_uqrshl_b(n, m) \
469     ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
470  #define do_uqrshl_h(n, m) \
471     ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
472  #define do_uqrshl_s(n, m) \
473     ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
474  #define do_uqrshl_d(n, m) \
475     ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
476  
477  DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
478  DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
479  DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
480  DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
481  
482  #undef do_uqrshl_d
483  
484  #define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
485  #define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
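/*
 * For B/H/S the sum is formed in 64 bits, so the halving cannot lose
 * a carry.  For D there is no wider type: (n >> 1) + (m >> 1) drops
 * the two low bits, and "+ (n & m & 1)" restores the carry that a
 * full-width add of those bits would have produced, e.g. 3 and 5 give
 * 1 + 2 + 1 = 4 = (3 + 5) >> 1.
 */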
486  
487  DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
488  DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
489  DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
490  DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
491  
492  DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
493  DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
494  DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
495  DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
496  
497  #define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
498  #define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))
499  
500  DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
501  DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
502  DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
503  DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
504  
505  DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
506  DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
507  DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
508  DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
509  
510  #define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
511  #define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
512  
513  DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
514  DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
515  DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
516  DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
517  
518  DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
519  DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
520  DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
521  DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
522  
523  static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
524  {
525      return val >= max ? max : val <= min ? min : val;
526  }
527  
528  #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
529  #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
530  #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
531  
532  static inline int64_t do_sqadd_d(int64_t n, int64_t m)
533  {
534      int64_t r = n + m;
535      if (((r ^ n) & ~(n ^ m)) < 0) {
536          /* Signed overflow.  */
537          return r < 0 ? INT64_MAX : INT64_MIN;
538      }
539      return r;
540  }
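/*
 * The overflow test relies on the sign bits: a signed addition can
 * only overflow when both operands have the same sign (~(n ^ m)
 * negative) and the result's sign differs from theirs ((r ^ n)
 * negative); the saturated result is then chosen opposite to the
 * wrapped sign of r.
 */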
541  
542  DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
543  DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
544  DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
545  DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
546  
547  #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
548  #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
549  #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
550  
551  static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
552  {
553      uint64_t r = n + m;
554      return r < n ? UINT64_MAX : r;
555  }
556  
557  DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
558  DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
559  DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
560  DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
561  
562  #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
563  #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
564  #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
565  
566  static inline int64_t do_sqsub_d(int64_t n, int64_t m)
567  {
568      int64_t r = n - m;
569      if (((r ^ n) & (n ^ m)) < 0) {
570          /* Signed overflow.  */
571          return r < 0 ? INT64_MAX : INT64_MIN;
572      }
573      return r;
574  }
575  
576  DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
577  DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
578  DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
579  DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
580  
581  #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
582  #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
583  #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
584  
585  static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
586  {
587      return n > m ? n - m : 0;
588  }
589  
590  DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
591  DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
592  DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
593  DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
594  
595  #define DO_SUQADD_B(n, m) \
596      do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
597  #define DO_SUQADD_H(n, m) \
598      do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
599  #define DO_SUQADD_S(n, m) \
600      do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
601  
602  static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
603  {
604      uint64_t r = n + m;
605  
606      if (n < 0) {
607          /* Note that m - abs(n) cannot underflow. */
608          if (r > INT64_MAX) {
609              /* Result is either very large positive or negative. */
610              if (m > -n) {
611                  /* m > abs(n), so r is a very large positive. */
612                  return INT64_MAX;
613              }
614              /* Result is negative. */
615          }
616      } else {
617          /* Both inputs are positive: check for overflow.  */
618          if (r < m || r > INT64_MAX) {
619              return INT64_MAX;
620          }
621      }
622      return r;
623  }
624  
625  DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
626  DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
627  DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
628  DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
629  
630  #define DO_USQADD_B(n, m) \
631      do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
632  #define DO_USQADD_H(n, m) \
633      do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
634  #define DO_USQADD_S(n, m) \
635      do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
636  
637  static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
638  {
639      uint64_t r = n + m;
640  
641      if (m < 0) {
642          return n < -m ? 0 : r;
643      }
644      return r < n ? UINT64_MAX : r;
645  }
646  
647  DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
648  DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
649  DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
650  DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
651  
652  #undef DO_ZPZZ
653  #undef DO_ZPZZ_D
654  
655  /*
656   * Three operand expander, operating on element pairs.
657   * If the slot I is even, the elements are from VN {I, I+1}.
658   * If the slot I is odd, the elements are from VM {I-1, I}.
659   * Load all of the input elements in each pair before overwriting output.
660   */
661  #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
662  void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
663  {                                                               \
664      intptr_t i, opr_sz = simd_oprsz(desc);                      \
665      for (i = 0; i < opr_sz; ) {                                 \
666          uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
667          do {                                                    \
668              TYPE n0 = *(TYPE *)(vn + H(i));                     \
669              TYPE m0 = *(TYPE *)(vm + H(i));                     \
670              TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
671              TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
672              if (pg & 1) {                                       \
673                  *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
674              }                                                   \
675              i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
676              if (pg & 1) {                                       \
677                  *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
678              }                                                   \
679              i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
680          } while (i & 15);                                       \
681      }                                                           \
682  }
683  
684  /* Similarly, specialized for 64-bit operands.  */
685  #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
686  void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
687  {                                                               \
688      intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
689      TYPE *d = vd, *n = vn, *m = vm;                             \
690      uint8_t *pg = vg;                                           \
691      for (i = 0; i < opr_sz; i += 2) {                           \
692          TYPE n0 = n[i], n1 = n[i + 1];                          \
693          TYPE m0 = m[i], m1 = m[i + 1];                          \
694          if (pg[H1(i)] & 1) {                                    \
695              d[i] = OP(n0, n1);                                  \
696          }                                                       \
697          if (pg[H1(i + 1)] & 1) {                                \
698              d[i + 1] = OP(m0, m1);                              \
699          }                                                       \
700      }                                                           \
701  }
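/*
 * E.g. for ADDP with byte elements, result slot 0 receives
 * Zn[0] + Zn[1], slot 1 receives Zm[0] + Zm[1], slot 2 receives
 * Zn[2] + Zn[3], and so on: even result slots take pairs from Zn and
 * odd result slots the corresponding pairs from Zm, each written only
 * if its own predicate element is active.
 */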
702  
703  DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
704  DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
705  DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
706  DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
707  
708  DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
709  DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
710  DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
711  DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
712  
713  DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
714  DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
715  DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
716  DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
717  
718  DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
719  DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
720  DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
721  DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
722  
723  DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
724  DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
725  DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
726  DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
727  
728  #undef DO_ZPZZ_PAIR
729  #undef DO_ZPZZ_PAIR_D
730  
731  #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
732  void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
733                    void *status, uint32_t desc)                          \
734  {                                                                       \
735      intptr_t i, opr_sz = simd_oprsz(desc);                              \
736      for (i = 0; i < opr_sz; ) {                                         \
737          uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
738          do {                                                            \
739              TYPE n0 = *(TYPE *)(vn + H(i));                             \
740              TYPE m0 = *(TYPE *)(vm + H(i));                             \
741              TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
742              TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
743              if (pg & 1) {                                               \
744                  *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
745              }                                                           \
746              i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
747              if (pg & 1) {                                               \
748                  *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
749              }                                                           \
750              i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
751          } while (i & 15);                                               \
752      }                                                                   \
753  }
754  
755  DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
756  DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
757  DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
758  
759  DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
760  DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
761  DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
762  
763  DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
764  DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
765  DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
766  
767  DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
768  DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
769  DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
770  
771  DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
772  DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
773  DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
774  
775  #undef DO_ZPZZ_PAIR_FP
776  
777  /* Three-operand expander, controlled by a predicate, in which the
778   * third operand is "wide".  That is, for D = N op M, the same 64-bit
779   * value of M is used with all of the narrower values of N.
780   */
781  #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
782  void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
783  {                                                                       \
784      intptr_t i, opr_sz = simd_oprsz(desc);                              \
785      for (i = 0; i < opr_sz; ) {                                         \
786          uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
787          TYPEW mm = *(TYPEW *)(vm + i);                                  \
788          do {                                                            \
789              if (pg & 1) {                                               \
790                  TYPE nn = *(TYPE *)(vn + H(i));                         \
791                  *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
792              }                                                           \
793              i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
794          } while (i & 7);                                                \
795      }                                                                   \
796  }
797  
798  DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
799  DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
800  DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
801  
802  DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
803  DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
804  DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
805  
806  DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
807  DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
808  DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
809  
810  #undef DO_ZPZW
811  
812  /* Fully general two-operand expander, controlled by a predicate.
813   */
814  #define DO_ZPZ(NAME, TYPE, H, OP)                               \
815  void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
816  {                                                               \
817      intptr_t i, opr_sz = simd_oprsz(desc);                      \
818      for (i = 0; i < opr_sz; ) {                                 \
819          uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
820          do {                                                    \
821              if (pg & 1) {                                       \
822                  TYPE nn = *(TYPE *)(vn + H(i));                 \
823                  *(TYPE *)(vd + H(i)) = OP(nn);                  \
824              }                                                   \
825              i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
826          } while (i & 15);                                       \
827      }                                                           \
828  }
829  
830  /* Similarly, specialized for 64-bit operands.  */
831  #define DO_ZPZ_D(NAME, TYPE, OP)                                \
832  void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
833  {                                                               \
834      intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
835      TYPE *d = vd, *n = vn;                                      \
836      uint8_t *pg = vg;                                           \
837      for (i = 0; i < opr_sz; i += 1) {                           \
838          if (pg[H1(i)] & 1) {                                    \
839              TYPE nn = n[i];                                     \
840              d[i] = OP(nn);                                      \
841          }                                                       \
842      }                                                           \
843  }
844  
845  #define DO_CLS_B(N)   (clrsb32(N) - 24)
846  #define DO_CLS_H(N)   (clrsb32(N) - 16)
847  
848  DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
849  DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
850  DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
851  DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
852  
853  #define DO_CLZ_B(N)   (clz32(N) - 24)
854  #define DO_CLZ_H(N)   (clz32(N) - 16)
855  
856  DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
857  DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
858  DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
859  DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
860  
861  DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
862  DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
863  DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
864  DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
865  
866  #define DO_CNOT(N)    (N == 0)
867  
868  DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
869  DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
870  DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
871  DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
872  
873  #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
874  
875  DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
876  DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
877  DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
878  
879  #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
880  
881  DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
882  DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
883  DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
884  
885  #define DO_NOT(N)    (~N)
886  
887  DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
888  DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
889  DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
890  DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
891  
892  #define DO_SXTB(N)    ((int8_t)N)
893  #define DO_SXTH(N)    ((int16_t)N)
894  #define DO_SXTS(N)    ((int32_t)N)
895  #define DO_UXTB(N)    ((uint8_t)N)
896  #define DO_UXTH(N)    ((uint16_t)N)
897  #define DO_UXTS(N)    ((uint32_t)N)
898  
899  DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
900  DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
901  DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
902  DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
903  DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
904  DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
905  
906  DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
907  DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
908  DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
909  DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
910  DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
911  DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
912  
913  #define DO_ABS(N)    (N < 0 ? -N : N)
914  
915  DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
916  DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
917  DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
918  DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
919  
920  #define DO_NEG(N)    (-N)
921  
922  DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
923  DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
924  DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
925  DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
926  
927  DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
928  DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
929  DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
930  
931  DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
932  DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
933  
934  DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
935  
936  void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
937  {
938      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
939      uint64_t *d = vd, *n = vn;
940      uint8_t *pg = vg;
941  
942      for (i = 0; i < opr_sz; i += 2) {
943          if (pg[H1(i)] & 1) {
944              uint64_t n0 = n[i + 0];
945              uint64_t n1 = n[i + 1];
946              d[i + 0] = n1;
947              d[i + 1] = n0;
948          }
949      }
950  }
951  
952  DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
953  DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
954  DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
955  DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
956  
957  #define DO_SQABS(X) \
958      ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
959         x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
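/*
 * min_ is the most negative value of the type (1ull << (bits - 1)
 * converted back to the signed type), whose true absolute value does
 * not fit; -min_ - 1 comes out as the type's maximum (via integer
 * promotion for the narrow types, and via -fwrapv wrapping for the
 * wider ones), so e.g. the saturating abs of an int8_t -128 is 127.
 */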
960  
961  DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
962  DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
963  DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
964  DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
965  
966  #define DO_SQNEG(X) \
967      ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
968         x_ == min_ ? -min_ - 1 : -x_; })
969  
970  DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
971  DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
972  DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
973  DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
974  
975  DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
976  DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
977  
978  /* Three-operand expander, unpredicated, in which the third operand is "wide".
979   */
980  #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
981  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
982  {                                                              \
983      intptr_t i, opr_sz = simd_oprsz(desc);                     \
984      for (i = 0; i < opr_sz; ) {                                \
985          TYPEW mm = *(TYPEW *)(vm + i);                         \
986          do {                                                   \
987              TYPE nn = *(TYPE *)(vn + H(i));                    \
988              *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
989              i += sizeof(TYPE);                                 \
990          } while (i & 7);                                       \
991      }                                                          \
992  }
993  
994  DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
995  DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
996  DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
997  
998  DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
999  DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1000  DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1001  
1002  DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1003  DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1004  DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1005  
1006  #undef DO_ZZW
1007  
1008  #undef DO_CLS_B
1009  #undef DO_CLS_H
1010  #undef DO_CLZ_B
1011  #undef DO_CLZ_H
1012  #undef DO_CNOT
1013  #undef DO_FABS
1014  #undef DO_FNEG
1015  #undef DO_ABS
1016  #undef DO_NEG
1017  #undef DO_ZPZ
1018  #undef DO_ZPZ_D
1019  
1020  /*
1021   * Three-operand expander, unpredicated, in which the two inputs are
1022   * selected from the top or bottom half of the wide column.
1023   */
1024  #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1025  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1026  {                                                                       \
1027      intptr_t i, opr_sz = simd_oprsz(desc);                              \
1028      int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1029      int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1030      for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1031          TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1032          TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1033          *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
1034      }                                                                   \
1035  }
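/*
 * sel1/sel2 are either 0 or sizeof(TYPEN), so each wide result lane
 * reads its narrow inputs from either the even ("bottom") or the odd
 * ("top") half of the corresponding wide column of Zn and Zm, e.g.
 * SADDLB uses the even elements of both sources while SADDLT uses the
 * odd ones.
 */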
1036  
1037  DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1038  DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1039  DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1040  
1041  DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1042  DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1043  DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1044  
1045  DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1046  DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1047  DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1048  
1049  DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1050  DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1051  DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1052  
1053  DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1054  DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1055  DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1056  
1057  DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1058  DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1059  DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1060  
1061  DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1062  DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1063  DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1064  
1065  DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1066  DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1067  DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1068  
1069  /* Note that the multiply cannot overflow, but the doubling can. */
1070  static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1071  {
1072      int16_t val = n * m;
1073      return DO_SQADD_H(val, val);
1074  }
1075  
1076  static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1077  {
1078      int32_t val = n * m;
1079      return DO_SQADD_S(val, val);
1080  }
1081  
1082  static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1083  {
1084      int64_t val = n * m;
1085      return do_sqadd_d(val, val);
1086  }
1087  
1088  DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1089  DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1090  DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1091  
1092  #undef DO_ZZZ_TB
1093  
1094  #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1095  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1096  {                                                              \
1097      intptr_t i, opr_sz = simd_oprsz(desc);                     \
1098      int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1099      for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1100          TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
1101          TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
1102          *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
1103      }                                                          \
1104  }
1105  
1106  DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1107  DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1108  DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1109  
1110  DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1111  DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1112  DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1113  
1114  DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1115  DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1116  DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1117  
1118  DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1119  DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1120  DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1121  
1122  #undef DO_ZZZ_WTB
1123  
1124  #define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
1125  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1126  {                                                                       \
1127      intptr_t i, opr_sz = simd_oprsz(desc);                              \
1128      intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1129      intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1130      for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
1131          TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
1132          TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
1133          *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
1134      }                                                                   \
1135  }
1136  
1137  DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1138  DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1139  DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1140  DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1141  
1142  #undef DO_ZZZ_NTB
1143  
1144  #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1145  void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1146  {                                                               \
1147      intptr_t i, opr_sz = simd_oprsz(desc);                      \
1148      intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
1149      for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
1150          TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
1151          TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
1152          TYPEW aa = *(TYPEW *)(va + HW(i));                      \
1153          *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
1154      }                                                           \
1155  }
1156  
1157  DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1158  DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1159  DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1160  
1161  DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1162  DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1163  DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1164  
1165  DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1166  DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1167  DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1168  
1169  DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1170  DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1171  DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1172  
1173  #define DO_NMUL(N, M)  -(N * M)
1174  
1175  DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1176  DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1177  DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1178  
1179  DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1180  DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1181  DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1182  
1183  #undef DO_ZZZW_ACC
1184  
1185  #define DO_XTNB(NAME, TYPE, OP) \
1186  void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
1187  {                                                            \
1188      intptr_t i, opr_sz = simd_oprsz(desc);                   \
1189      for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
1190          TYPE nn = *(TYPE *)(vn + i);                         \
1191          nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
1192          *(TYPE *)(vd + i) = nn;                              \
1193      }                                                        \
1194  }
1195  
1196  #define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
1197  void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
1198  {                                                                       \
1199      intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
1200      for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
1201          TYPE nn = *(TYPE *)(vn + i);                                    \
1202          *(TYPEN *)(vd + i + odd) = OP(nn);                              \
1203      }                                                                   \
1204  }
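/*
 * The "B" (XTNB) form narrows each wide element into the low (even)
 * narrow slot and clears the high half, while the "T" (XTNT) form
 * writes only the high (odd) narrow slot and leaves the rest of the
 * destination untouched.
 */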
1205  
1206  #define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
1207  #define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
1208  #define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)
1209  
1210  DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1211  DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1212  DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1213  
1214  DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1215  DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1216  DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1217  
1218  #define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
1219  #define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
1220  #define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)
1221  
1222  DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1223  DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1224  DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1225  
1226  DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1227  DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1228  DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1229  
1230  DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1231  DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1232  DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1233  
1234  DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1235  DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1236  DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1237  
1238  #undef DO_XTNB
1239  #undef DO_XTNT
1240  
1241  void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1242  {
1243      intptr_t i, opr_sz = simd_oprsz(desc);
1244      int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1245      uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1246      uint32_t *a = va, *n = vn;
1247      uint64_t *d = vd, *m = vm;
1248  
1249      for (i = 0; i < opr_sz / 8; ++i) {
1250          uint32_t e1 = a[2 * i + H4(0)];
1251          uint32_t e2 = n[2 * i + sel] ^ inv;
1252          uint64_t c = extract64(m[i], 32, 1);
1253          /* Compute and store the entire 33-bit result at once. */
1254          d[i] = c + e1 + e2;
1255      }
1256  }
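/*
 * Note: e1 + e2 + c fits in 33 bits, so the 64-bit store above leaves
 * the 32-bit sum in bits [31:0] and the carry-out in bit 32, which is
 * the same bit that extract64(m[i], 32, 1) reads as the incoming carry
 * when these helpers are chained.
 */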
1257  
1258  void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1259  {
1260      intptr_t i, opr_sz = simd_oprsz(desc);
1261      int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1262      uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1263      uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1264  
1265      for (i = 0; i < opr_sz / 8; i += 2) {
1266          Int128 e1 = int128_make64(a[i]);
1267          Int128 e2 = int128_make64(n[i + sel] ^ inv);
1268          Int128 c = int128_make64(m[i + 1] & 1);
1269          Int128 r = int128_add(int128_add(e1, e2), c);
1270          d[i + 0] = int128_getlo(r);
1271          d[i + 1] = int128_gethi(r);
1272      }
1273  }
1274  
1275  #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1276  void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1277  {                                                                       \
1278      intptr_t i, opr_sz = simd_oprsz(desc);                              \
1279      int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1280      int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1281      for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1282          TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1283          TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1284          TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1285          *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1286      }                                                                   \
1287  }
1288  
1289  DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1290             do_sqdmull_h, DO_SQADD_H)
1291  DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1292             do_sqdmull_s, DO_SQADD_S)
1293  DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1294             do_sqdmull_d, do_sqadd_d)
1295  
1296  DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1297             do_sqdmull_h, DO_SQSUB_H)
1298  DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1299             do_sqdmull_s, DO_SQSUB_S)
1300  DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1301             do_sqdmull_d, do_sqsub_d)
1302  
1303  #undef DO_SQDMLAL
1304  
1305  #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1306  void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1307  {                                                               \
1308      intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
1309      int rot = simd_data(desc);                                  \
1310      int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
1311      bool sub_r = rot == 1 || rot == 2;                          \
1312      bool sub_i = rot >= 2;                                      \
1313      TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
1314      for (i = 0; i < opr_sz; i += 2) {                           \
1315          TYPE elt1_a = n[H(i + sel_a)];                          \
1316          TYPE elt2_a = m[H(i + sel_a)];                          \
1317          TYPE elt2_b = m[H(i + sel_b)];                          \
1318          d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
1319          d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
1320      }                                                           \
1321  }
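/*
 * With DO_CMLA below, rot == 0 accumulates the contribution of the real
 * part of N (d.r += n.r * m.r, d.i += n.r * m.i) and rot == 1 that of the
 * imaginary part (d.r -= n.i * m.i, d.i += n.i * m.r); rot == 2 and
 * rot == 3 are the negations of those.  Issuing the rot == 0 and rot == 1
 * forms back to back yields a full complex multiply-accumulate.
 */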
1322  
1323  #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1324  
1325  DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1326  DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1327  DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1328  DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1329  
1330  #define DO_SQRDMLAH_B(N, M, A, S) \
1331      do_sqrdmlah_b(N, M, A, S, true)
1332  #define DO_SQRDMLAH_H(N, M, A, S) \
1333      ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1334  #define DO_SQRDMLAH_S(N, M, A, S) \
1335      ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1336  #define DO_SQRDMLAH_D(N, M, A, S) \
1337      do_sqrdmlah_d(N, M, A, S, true)
1338  
1339  DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1340  DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1341  DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1342  DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1343  
1344  #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1345  void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1346  {                                                                           \
1347      intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1348      int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1349      int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1350      int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1351      bool sub_r = rot == 1 || rot == 2;                                      \
1352      bool sub_i = rot >= 2;                                                  \
1353      TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1354      for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1355          TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1356          TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1357          for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1358              TYPE elt1_a = n[H(i + j + sel_a)];                              \
1359              d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);          \
1360              d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);  \
1361          }                                                                   \
1362      }                                                                       \
1363  }
1364  
1365  DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1366  DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1367  
1368  DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1369  DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1370  
1371  #undef DO_CMLA
1372  #undef DO_CMLA_FUNC
1373  #undef DO_CMLA_IDX_FUNC
1374  #undef DO_SQRDMLAH_B
1375  #undef DO_SQRDMLAH_H
1376  #undef DO_SQRDMLAH_S
1377  #undef DO_SQRDMLAH_D
1378  
1379  /* Note N and M are 4 elements bundled into one unit. */
1380  static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1381                           int sel_a, int sel_b, int sub_i)
1382  {
1383      for (int i = 0; i <= 1; i++) {
1384          int32_t elt1_r = (int8_t)(n >> (16 * i));
1385          int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1386          int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1387          int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1388  
1389          a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1390      }
1391      return a;
1392  }
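/*
 * Each 32-bit unit holds two complex numbers with 8-bit real and imaginary
 * parts (real in the even byte).  For rot == 0 the callers pass sel_a = 0,
 * sel_b = 1, sub_i = -1, so each step accumulates n.r * m.r - n.i * m.i,
 * the real part of the product; rot == 1 gives sel_a = 1, sel_b = 0,
 * sub_i = +1, accumulating n.r * m.i + n.i * m.r, the imaginary part.
 */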
1393  
1394  static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1395                           int sel_a, int sel_b, int sub_i)
1396  {
1397      for (int i = 0; i <= 1; i++) {
1398          int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1399          int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1400          int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1401          int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1402  
1403          a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1404      }
1405      return a;
1406  }
1407  
1408  void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1409                                void *va, uint32_t desc)
1410  {
1411      int opr_sz = simd_oprsz(desc);
1412      int rot = simd_data(desc);
1413      int sel_a = rot & 1;
1414      int sel_b = sel_a ^ 1;
1415      int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1416      uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1417  
1418      for (int e = 0; e < opr_sz / 4; e++) {
1419          d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1420      }
1421  }
1422  
1423  void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1424                                void *va, uint32_t desc)
1425  {
1426      int opr_sz = simd_oprsz(desc);
1427      int rot = simd_data(desc);
1428      int sel_a = rot & 1;
1429      int sel_b = sel_a ^ 1;
1430      int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1431      uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1432  
1433      for (int e = 0; e < opr_sz / 8; e++) {
1434          d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1435      }
1436  }
1437  
1438  void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1439                               void *va, uint32_t desc)
1440  {
1441      int opr_sz = simd_oprsz(desc);
1442      int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1443      int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1444      int sel_a = rot & 1;
1445      int sel_b = sel_a ^ 1;
1446      int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1447      uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1448  
1449      for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1450          uint32_t seg_m = m[seg + idx];
1451          for (int e = 0; e < 4; e++) {
1452              d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1453                                     sel_a, sel_b, sub_i);
1454          }
1455      }
1456  }
1457  
1458  void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1459                               void *va, uint32_t desc)
1460  {
1461      int seg, opr_sz = simd_oprsz(desc);
1462      int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1463      int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1464      int sel_a = rot & 1;
1465      int sel_b = sel_a ^ 1;
1466      int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1467      uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1468  
1469      for (seg = 0; seg < opr_sz / 8; seg += 2) {
1470          uint64_t seg_m = m[seg + idx];
1471          for (int e = 0; e < 2; e++) {
1472              d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1473                                     sel_a, sel_b, sub_i);
1474          }
1475      }
1476  }
1477  
1478  #define DO_ZZXZ(NAME, TYPE, H, OP) \
1479  void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1480  {                                                                       \
1481      intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1482      intptr_t i, j, idx = simd_data(desc);                               \
1483      TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1484      for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1485          TYPE mm = m[i];                                                 \
1486          for (j = 0; j < segment; j++) {                                 \
1487              d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1488          }                                                               \
1489      }                                                                   \
1490  }
1491  
1492  #define DO_SQRDMLAH_H(N, M, A) \
1493      ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1494  #define DO_SQRDMLAH_S(N, M, A) \
1495      ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1496  #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1497  
1498  DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1499  DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1500  DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1501  
1502  #define DO_SQRDMLSH_H(N, M, A) \
1503      ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1504  #define DO_SQRDMLSH_S(N, M, A) \
1505      ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1506  #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1507  
1508  DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1509  DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1510  DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1511  
1512  #undef DO_ZZXZ
1513  
1514  #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1515  void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1516  {                                                                         \
1517      intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1518      intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1519      intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1520      for (i = 0; i < oprsz; i += 16) {                                     \
1521          TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1522          for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1523              TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1524              TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1525              *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1526          }                                                                 \
1527      }                                                                     \
1528  }
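/*
 * In these indexed forms, vm is read once per 128-bit segment, at narrow
 * element 'idx' within that segment, while 'sel' picks the bottom or top
 * narrow half of each wide element of vn.
 */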
1529  
1530  #define DO_MLA(N, M, A)  (A + N * M)
1531  
1532  DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1533  DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1534  DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1535  DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1536  
1537  #define DO_MLS(N, M, A)  (A - N * M)
1538  
1539  DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1540  DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1541  DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1542  DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1543  
1544  #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1545  #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1546  
1547  DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1548  DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1549  
1550  #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1551  #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1552  
1553  DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1554  DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1555  
1556  #undef DO_MLA
1557  #undef DO_MLS
1558  #undef DO_ZZXW
1559  
1560  #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1561  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1562  {                                                                         \
1563      intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1564      intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1565      intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1566      for (i = 0; i < oprsz; i += 16) {                                     \
1567          TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1568          for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1569              TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1570              *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1571          }                                                                 \
1572      }                                                                     \
1573  }
1574  
1575  DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1576  DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1577  
1578  DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1579  DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1580  
1581  DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1582  DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1583  
1584  #undef DO_ZZX
1585  
1586  #define DO_BITPERM(NAME, TYPE, OP) \
1587  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1588  {                                                              \
1589      intptr_t i, opr_sz = simd_oprsz(desc);                     \
1590      for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1591          TYPE nn = *(TYPE *)(vn + i);                           \
1592          TYPE mm = *(TYPE *)(vm + i);                           \
1593          *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1594      }                                                          \
1595  }
1596  
1597  static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1598  {
1599      uint64_t res = 0;
1600      int db, rb = 0;
1601  
1602      for (db = 0; db < n; ++db) {
1603          if ((mask >> db) & 1) {
1604              res |= ((data >> db) & 1) << rb;
1605              ++rb;
1606          }
1607      }
1608      return res;
1609  }
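/*
 * bitextract gathers the data bits selected by mask into contiguous low
 * bits of the result (cf. x86 PEXT).  E.g. with n = 8,
 * bitextract(0b11010110, 0b00111100, 8) picks data bits 2..5 (1, 0, 1, 0)
 * and packs them into 0b0101.
 */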
1610  
1611  DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1612  DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1613  DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1614  DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1615  
1616  static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1617  {
1618      uint64_t res = 0;
1619      int rb, db = 0;
1620  
1621      for (rb = 0; rb < n; ++rb) {
1622          if ((mask >> rb) & 1) {
1623              res |= ((data >> db) & 1) << rb;
1624              ++db;
1625          }
1626      }
1627      return res;
1628  }
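/*
 * bitdeposit scatters the low bits of data into the bit positions selected
 * by mask (cf. x86 PDEP).  E.g. with n = 8,
 * bitdeposit(0b0101, 0b00111100, 8) places those four bits at positions
 * 2..5, giving 0b00010100.
 */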
1629  
1630  DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1631  DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1632  DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1633  DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1634  
1635  static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1636  {
1637      uint64_t resm = 0, resu = 0;
1638      int db, rbm = 0, rbu = 0;
1639  
1640      for (db = 0; db < n; ++db) {
1641          uint64_t val = (data >> db) & 1;
1642          if ((mask >> db) & 1) {
1643              resm |= val << rbm++;
1644          } else {
1645              resu |= val << rbu++;
1646          }
1647      }
1648  
1649      return resm | (resu << rbm);
1650  }
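/*
 * bitgroup packs the data bits selected by mask into the low end of the
 * result and the remaining bits above them.  E.g. with n = 8,
 * bitgroup(0b11010110, 0b00111100, 8) collects the masked bits 0b0101 in
 * the low half and the unmasked bits 0b1110 above them: 0b11100101.
 */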
1651  
1652  DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1653  DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1654  DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1655  DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1656  
1657  #undef DO_BITPERM
1658  
1659  #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1660  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1661  {                                                               \
1662      intptr_t i, opr_sz = simd_oprsz(desc);                      \
1663      int sub_r = simd_data(desc);                                \
1664      if (sub_r) {                                                \
1665          for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1666              TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1667              TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1668              TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1669              TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1670              acc_r = ADD_OP(acc_r, el2_i);                       \
1671              acc_i = SUB_OP(acc_i, el2_r);                       \
1672              *(TYPE *)(vd + H(i)) = acc_r;                       \
1673              *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1674          }                                                       \
1675      } else {                                                    \
1676          for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1677              TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1678              TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1679              TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1680              TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1681              acc_r = SUB_OP(acc_r, el2_i);                       \
1682              acc_i = ADD_OP(acc_i, el2_r);                       \
1683              *(TYPE *)(vd + H(i)) = acc_r;                       \
1684              *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1685          }                                                       \
1686      }                                                           \
1687  }
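/*
 * Complex add with rotate: elements are processed as (real, imaginary)
 * pairs.  With sub_r clear the second operand is rotated by 90 degrees
 * (real -= m.i, imag += m.r); with sub_r set it is rotated by 270 degrees
 * (real += m.i, imag -= m.r).
 */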
1688  
1689  DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1690  DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1691  DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1692  DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1693  
1694  DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1695  DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1696  DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1697  DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1698  
1699  #undef DO_CADD
1700  
1701  #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1702  void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1703  {                                                              \
1704      intptr_t i, opr_sz = simd_oprsz(desc);                     \
1705      intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1706      int shift = simd_data(desc) >> 1;                          \
1707      for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1708          TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1709          *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1710      }                                                          \
1711  }
1712  
1713  DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1714  DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1715  DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1716  
1717  DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1718  DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1719  DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1720  
1721  #undef DO_ZZI_SHLL
1722  
1723  /* Two-operand reduction expander, controlled by a predicate.
1724   * The difference between TYPERED and TYPERET has to do with
1725   * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1726   * but TYPERET must be unsigned so that e.g. a 32-bit value
1727   * is not sign-extended to the ABI uint64_t return type.
1728   */
1729  /* ??? If we were to vectorize this by hand the reduction ordering
1730   * would change.  For integer operands, this is perfectly fine.
1731   */
1732  #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1733  uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1734  {                                                          \
1735      intptr_t i, opr_sz = simd_oprsz(desc);                 \
1736      TYPERED ret = INIT;                                    \
1737      for (i = 0; i < opr_sz; ) {                            \
1738          uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1739          do {                                               \
1740              if (pg & 1) {                                  \
1741                  TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1742                  ret = OP(ret, nn);                         \
1743              }                                              \
1744              i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1745          } while (i & 15);                                  \
1746      }                                                      \
1747      return (TYPERET)ret;                                   \
1748  }
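/*
 * The predicate is consumed 16 bits at a time: each bit governs one byte
 * of the vector, the governing bit of an element is the lowest of its
 * sizeof(TYPEELT) bits, and pg >>= sizeof(TYPEELT) steps to the bit for
 * the next element.
 */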
1749  
1750  #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1751  uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1752  {                                                          \
1753      intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1754      TYPEE *n = vn;                                         \
1755      uint8_t *pg = vg;                                      \
1756      TYPER ret = INIT;                                      \
1757      for (i = 0; i < opr_sz; i += 1) {                      \
1758          if (pg[H1(i)] & 1) {                               \
1759              TYPEE nn = n[i];                               \
1760              ret = OP(ret, nn);                             \
1761          }                                                  \
1762      }                                                      \
1763      return ret;                                            \
1764  }
1765  
1766  DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1767  DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1768  DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1769  DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1770  
1771  DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1772  DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1773  DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1774  DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1775  
1776  DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1777  DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1778  DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1779  DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1780  
1781  DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1782  DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1783  DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1784  
1785  DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1786  DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1787  DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1788  DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1789  
1790  DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1791  DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1792  DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1793  DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1794  
1795  DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1796  DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1797  DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1798  DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1799  
1800  DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1801  DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1802  DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1803  DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1804  
1805  DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1806  DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1807  DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1808  DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1809  
1810  #undef DO_VPZ
1811  #undef DO_VPZ_D
1812  
1813  /* Two vector operand, one scalar operand, unpredicated.  */
1814  #define DO_ZZI(NAME, TYPE, OP)                                       \
1815  void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1816  {                                                                    \
1817      intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1818      TYPE s = s64, *d = vd, *n = vn;                                  \
1819      for (i = 0; i < opr_sz; ++i) {                                   \
1820          d[i] = OP(n[i], s);                                          \
1821      }                                                                \
1822  }
1823  
1824  #define DO_SUBR(X, Y)   (Y - X)
1825  
1826  DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1827  DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1828  DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1829  DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1830  
1831  DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1832  DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1833  DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1834  DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1835  
1836  DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1837  DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1838  DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1839  DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1840  
1841  DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1842  DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1843  DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1844  DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1845  
1846  DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1847  DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1848  DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1849  DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1850  
1851  #undef DO_ZZI
1852  
1853  #undef DO_AND
1854  #undef DO_ORR
1855  #undef DO_EOR
1856  #undef DO_BIC
1857  #undef DO_ADD
1858  #undef DO_SUB
1859  #undef DO_MAX
1860  #undef DO_MIN
1861  #undef DO_ABD
1862  #undef DO_MUL
1863  #undef DO_DIV
1864  #undef DO_ASR
1865  #undef DO_LSR
1866  #undef DO_LSL
1867  #undef DO_SUBR
1868  
1869  /* Similar to the ARM LastActiveElement pseudocode function, except the
1870     result is multiplied by the element size.  This includes the not found
1871     indication; e.g. not found for esz=3 is -8.  */
1872  static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1873  {
1874      uint64_t mask = pred_esz_masks[esz];
1875      intptr_t i = words;
1876  
1877      do {
1878          uint64_t this_g = g[--i] & mask;
1879          if (this_g) {
1880              return i * 64 + (63 - clz64(this_g));
1881          }
1882      } while (i > 0);
1883      return (intptr_t)-1 << esz;
1884  }
1885  
1886  uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1887  {
1888      intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1889      uint32_t flags = PREDTEST_INIT;
1890      uint64_t *d = vd, *g = vg;
1891      intptr_t i = 0;
1892  
1893      do {
1894          uint64_t this_d = d[i];
1895          uint64_t this_g = g[i];
1896  
1897          if (this_g) {
1898              if (!(flags & 4)) {
1899                  /* Set in D the first bit of G.  */
1900                  this_d |= this_g & -this_g;
1901                  d[i] = this_d;
1902              }
1903              flags = iter_predtest_fwd(this_d, this_g, flags);
1904          }
1905      } while (++i < words);
1906  
1907      return flags;
1908  }
1909  
1910  uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1911  {
1912      intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1913      intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1914      uint32_t flags = PREDTEST_INIT;
1915      uint64_t *d = vd, *g = vg, esz_mask;
1916      intptr_t i, next;
1917  
1918      next = last_active_element(vd, words, esz) + (1 << esz);
1919      esz_mask = pred_esz_masks[esz];
1920  
1921      /* Similar to the pseudocode for pnext, but scaled by ESZ
1922         so that we find the correct bit.  */
1923      if (next < words * 64) {
1924          uint64_t mask = -1;
1925  
1926          if (next & 63) {
1927              mask = ~((1ull << (next & 63)) - 1);
1928              next &= -64;
1929          }
1930          do {
1931              uint64_t this_g = g[next / 64] & esz_mask & mask;
1932              if (this_g != 0) {
1933                  next = (next & -64) + ctz64(this_g);
1934                  break;
1935              }
1936              next += 64;
1937              mask = -1;
1938          } while (next < words * 64);
1939      }
1940  
1941      i = 0;
1942      do {
1943          uint64_t this_d = 0;
1944          if (i == next / 64) {
1945              this_d = 1ull << (next & 63);
1946          }
1947          d[i] = this_d;
1948          flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1949      } while (++i < words);
1950  
1951      return flags;
1952  }
1953  
1954  /*
1955   * Copy Zn into Zd, and store zero into inactive elements.
1956   * If inv, store zeros into the active elements.
1957   */
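/*
 * The expand_pred_{b,h,s} helpers turn each governing predicate bit into
 * a full element-wide mask, so the AND keeps the selected elements and
 * XORing the mask with inv (all ones when the inv flag is set) flips
 * which elements are kept.
 */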
1958  void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1959  {
1960      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1961      uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1962      uint64_t *d = vd, *n = vn;
1963      uint8_t *pg = vg;
1964  
1965      for (i = 0; i < opr_sz; i += 1) {
1966          d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1967      }
1968  }
1969  
1970  void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1971  {
1972      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1973      uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1974      uint64_t *d = vd, *n = vn;
1975      uint8_t *pg = vg;
1976  
1977      for (i = 0; i < opr_sz; i += 1) {
1978          d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1979      }
1980  }
1981  
1982  void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1983  {
1984      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1985      uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1986      uint64_t *d = vd, *n = vn;
1987      uint8_t *pg = vg;
1988  
1989      for (i = 0; i < opr_sz; i += 1) {
1990          d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1991      }
1992  }
1993  
1994  void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1995  {
1996      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1997      uint64_t *d = vd, *n = vn;
1998      uint8_t *pg = vg;
1999      uint8_t inv = simd_data(desc);
2000  
2001      for (i = 0; i < opr_sz; i += 1) {
2002          d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2003      }
2004  }
2005  
2006  /* Three-operand expander, immediate operand, controlled by a predicate.
2007   */
2008  #define DO_ZPZI(NAME, TYPE, H, OP)                              \
2009  void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2010  {                                                               \
2011      intptr_t i, opr_sz = simd_oprsz(desc);                      \
2012      TYPE imm = simd_data(desc);                                 \
2013      for (i = 0; i < opr_sz; ) {                                 \
2014          uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2015          do {                                                    \
2016              if (pg & 1) {                                       \
2017                  TYPE nn = *(TYPE *)(vn + H(i));                 \
2018                  *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2019              }                                                   \
2020              i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2021          } while (i & 15);                                       \
2022      }                                                           \
2023  }
2024  
2025  /* Similarly, specialized for 64-bit operands.  */
2026  #define DO_ZPZI_D(NAME, TYPE, OP)                               \
2027  void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2028  {                                                               \
2029      intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2030      TYPE *d = vd, *n = vn;                                      \
2031      TYPE imm = simd_data(desc);                                 \
2032      uint8_t *pg = vg;                                           \
2033      for (i = 0; i < opr_sz; i += 1) {                           \
2034          if (pg[H1(i)] & 1) {                                    \
2035              TYPE nn = n[i];                                     \
2036              d[i] = OP(nn, imm);                                 \
2037          }                                                       \
2038      }                                                           \
2039  }
2040  
2041  #define DO_SHR(N, M)  (N >> M)
2042  #define DO_SHL(N, M)  (N << M)
2043  
2044  /* Arithmetic shift right for division.  This rounds negative numbers
2045     toward zero as per signed division.  Therefore before shifting,
2046     when N is negative, add 2**M-1.  */
2047  #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
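/* E.g. DO_ASRD(-5, 2) computes (-5 + 3) >> 2 = -1, matching -5 / 4
   truncated toward zero, whereas a plain arithmetic shift would give -2. */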
2048  
2049  static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2050  {
2051      if (likely(sh < 64)) {
2052          return (x >> sh) + ((x >> (sh - 1)) & 1);
2053      } else if (sh == 64) {
2054          return x >> 63;
2055      } else {
2056          return 0;
2057      }
2058  }
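/* Unsigned rounding shift right: the bit just below the cut is added back,
   so do_urshr(7, 2) = 1 + 1 = 2 while do_urshr(5, 2) = 1 + 0 = 1. */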
2059  
2060  static inline int64_t do_srshr(int64_t x, unsigned sh)
2061  {
2062      if (likely(sh < 64)) {
2063          return (x >> sh) + ((x >> (sh - 1)) & 1);
2064      } else {
2065          /* Rounding the sign bit always produces 0. */
2066          return 0;
2067      }
2068  }
2069  
2070  DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2071  DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2072  DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2073  DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2074  
2075  DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2076  DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2077  DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2078  DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2079  
2080  DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2081  DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2082  DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2083  DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2084  
2085  DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2086  DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2087  DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2088  DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2089  
2090  /* SVE2 bitwise shift by immediate */
2091  DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2092  DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2093  DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2094  DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2095  
2096  DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2097  DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2098  DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2099  DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2100  
2101  DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2102  DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2103  DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2104  DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2105  
2106  DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2107  DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2108  DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2109  DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2110  
2111  #define do_suqrshl_b(n, m) \
2112     ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2113  #define do_suqrshl_h(n, m) \
2114     ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2115  #define do_suqrshl_s(n, m) \
2116     ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2117  #define do_suqrshl_d(n, m) \
2118     ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2119  
2120  DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2121  DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2122  DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2123  DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2124  
2125  #undef DO_ASRD
2126  #undef DO_ZPZI
2127  #undef DO_ZPZI_D
2128  
2129  #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2130  void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2131  {                                                            \
2132      intptr_t i, opr_sz = simd_oprsz(desc);                   \
2133      int shift = simd_data(desc);                             \
2134      for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2135          TYPEW nn = *(TYPEW *)(vn + i);                       \
2136          *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2137      }                                                        \
2138  }
2139  
2140  #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2141  void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2142  {                                                                 \
2143      intptr_t i, opr_sz = simd_oprsz(desc);                        \
2144      int shift = simd_data(desc);                                  \
2145      for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2146          TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2147          *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2148      }                                                             \
2149  }
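/*
 * The ...NB forms store the narrowed result zero-extended across the whole
 * wide element, clearing the odd-numbered narrow elements; the ...NT forms
 * store only into the odd-numbered narrow elements, preserving the rest.
 */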
2150  
2151  DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2152  DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2153  DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2154  
2155  DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2156  DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2157  DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2158  
2159  DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2160  DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2161  DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2162  
2163  DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2164  DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2165  DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2166  
2167  #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2168  #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2169  #define DO_SQSHRUN_D(x, sh) \
2170      do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
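/* The clamp to 63 presumably guards against an out-of-range shift count:
   shifting an int64_t by 64 or more is undefined in C, and an arithmetic
   shift by 63 already yields the 0-or-sign value that saturation needs. */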
2171  
2172  DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2173  DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2174  DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2175  
2176  DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2177  DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2178  DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2179  
2180  #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2181  #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2182  #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2183  
2184  DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2185  DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2186  DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2187  
2188  DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2189  DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2190  DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2191  
2192  #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2193  #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2194  #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2195  
2196  DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2197  DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2198  DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2199  
2200  DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2201  DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2202  DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2203  
2204  #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2205  #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2206  #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2207  
2208  DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2209  DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2210  DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2211  
2212  DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2213  DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2214  DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2215  
2216  #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2217  #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2218  #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2219  
2220  DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2221  DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2222  DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2223  
2224  DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2225  DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2226  DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2227  
2228  #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2229  #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2230  #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2231  
2232  DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2233  DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2234  DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2235  
2236  DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2237  DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2238  DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2239  
2240  #undef DO_SHRNB
2241  #undef DO_SHRNT
2242  
2243  #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2244  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2245  {                                                                           \
2246      intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2247      for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2248          TYPEW nn = *(TYPEW *)(vn + i);                                      \
2249          TYPEW mm = *(TYPEW *)(vm + i);                                      \
2250          *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2251      }                                                                       \
2252  }
2253  
2254  #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2255  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2256  {                                                                           \
2257      intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2258      for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2259          TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2260          TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2261          *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2262      }                                                                       \
2263  }
2264  
2265  #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2266  #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2267  #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2268  #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2269  
2270  DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2271  DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2272  DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2273  
2274  DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2275  DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2276  DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2277  
2278  DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2279  DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2280  DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2281  
2282  DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2283  DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2284  DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2285  
2286  DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2287  DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2288  DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2289  
2290  DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2291  DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2292  DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2293  
2294  DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2295  DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2296  DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2297  
2298  DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2299  DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2300  DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2301  
2302  #undef DO_RSUBHN
2303  #undef DO_SUBHN
2304  #undef DO_RADDHN
2305  #undef DO_ADDHN
2306  
2307  #undef DO_BINOPNB
2308  
2309  /* Fully general four-operand expander, controlled by a predicate.
2310   */
2311  #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2312  void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2313                    void *vg, uint32_t desc)                    \
2314  {                                                             \
2315      intptr_t i, opr_sz = simd_oprsz(desc);                    \
2316      for (i = 0; i < opr_sz; ) {                               \
2317          uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2318          do {                                                  \
2319              if (pg & 1) {                                     \
2320                  TYPE nn = *(TYPE *)(vn + H(i));               \
2321                  TYPE mm = *(TYPE *)(vm + H(i));               \
2322                  TYPE aa = *(TYPE *)(va + H(i));               \
2323                  *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2324              }                                                 \
2325              i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2326          } while (i & 15);                                     \
2327      }                                                         \
2328  }
2329  
2330  /* Similarly, specialized for 64-bit operands.  */
2331  #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2332  void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2333                    void *vg, uint32_t desc)                    \
2334  {                                                             \
2335      intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2336      TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2337      uint8_t *pg = vg;                                         \
2338      for (i = 0; i < opr_sz; i += 1) {                         \
2339          if (pg[H1(i)] & 1) {                                  \
2340              TYPE aa = a[i], nn = n[i], mm = m[i];             \
2341              d[i] = OP(aa, nn, mm);                            \
2342          }                                                     \
2343      }                                                         \
2344  }
2345  
2346  #define DO_MLA(A, N, M)  (A + N * M)
2347  #define DO_MLS(A, N, M)  (A - N * M)
2348  
2349  DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2350  DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2351  
2352  DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2353  DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2354  
2355  DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2356  DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2357  
2358  DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2359  DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2360  
2361  #undef DO_MLA
2362  #undef DO_MLS
2363  #undef DO_ZPZZZ
2364  #undef DO_ZPZZZ_D
2365  
2366  void HELPER(sve_index_b)(void *vd, uint32_t start,
2367                           uint32_t incr, uint32_t desc)
2368  {
2369      intptr_t i, opr_sz = simd_oprsz(desc);
2370      uint8_t *d = vd;
2371      for (i = 0; i < opr_sz; i += 1) {
2372          d[H1(i)] = start + i * incr;
2373      }
2374  }
2375  
2376  void HELPER(sve_index_h)(void *vd, uint32_t start,
2377                           uint32_t incr, uint32_t desc)
2378  {
2379      intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2380      uint16_t *d = vd;
2381      for (i = 0; i < opr_sz; i += 1) {
2382          d[H2(i)] = start + i * incr;
2383      }
2384  }
2385  
2386  void HELPER(sve_index_s)(void *vd, uint32_t start,
2387                           uint32_t incr, uint32_t desc)
2388  {
2389      intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2390      uint32_t *d = vd;
2391      for (i = 0; i < opr_sz; i += 1) {
2392          d[H4(i)] = start + i * incr;
2393      }
2394  }
2395  
2396  void HELPER(sve_index_d)(void *vd, uint64_t start,
2397                           uint64_t incr, uint32_t desc)
2398  {
2399      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2400      uint64_t *d = vd;
2401      for (i = 0; i < opr_sz; i += 1) {
2402          d[i] = start + i * incr;
2403      }
2404  }
2405  
2406  void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2407  {
2408      intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2409      uint32_t sh = simd_data(desc);
2410      uint32_t *d = vd, *n = vn, *m = vm;
2411      for (i = 0; i < opr_sz; i += 1) {
2412          d[i] = n[i] + (m[i] << sh);
2413      }
2414  }
2415  
2416  void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2417  {
2418      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2419      uint64_t sh = simd_data(desc);
2420      uint64_t *d = vd, *n = vn, *m = vm;
2421      for (i = 0; i < opr_sz; i += 1) {
2422          d[i] = n[i] + (m[i] << sh);
2423      }
2424  }
2425  
2426  void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2427  {
2428      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2429      uint64_t sh = simd_data(desc);
2430      uint64_t *d = vd, *n = vn, *m = vm;
2431      for (i = 0; i < opr_sz; i += 1) {
2432          d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2433      }
2434  }
2435  
2436  void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2437  {
2438      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2439      uint64_t sh = simd_data(desc);
2440      uint64_t *d = vd, *n = vn, *m = vm;
2441      for (i = 0; i < opr_sz; i += 1) {
2442          d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2443      }
2444  }
2445  
2446  void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2447  {
2448      /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2449      static const uint16_t coeff[] = {
2450          0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2451          0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2452          0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2453          0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2454      };
2455      intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2456      uint16_t *d = vd, *n = vn;
2457  
2458      for (i = 0; i < opr_sz; i++) {
2459          uint16_t nn = n[i];
2460          intptr_t idx = extract32(nn, 0, 5);
2461          uint16_t exp = extract32(nn, 5, 5);
2462          d[i] = coeff[idx] | (exp << 10);
2463      }
2464  }
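/*
 * Worked example (illustrative, not from the pseudocode): with
 * nn = (15 << 5) | 16, i.e. idx = 16 and exp = 15 (the float16 bias),
 * the result is 0x3c00 | coeff[16] = 0x3c00 | 0x01a8 = 0x3da8,
 * which is the float16 value 1.4140625, approximating 2^(16/32) = sqrt(2).
 */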
2465  
2466  void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2467  {
2468      /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2469      static const uint32_t coeff[] = {
2470          0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2471          0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2472          0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2473          0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2474          0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2475          0x1ef532, 0x20b051, 0x227043, 0x243516,
2476          0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2477          0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2478          0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2479          0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2480          0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2481          0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2482          0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2483          0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2484          0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2485          0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2486      };
2487      intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2488      uint32_t *d = vd, *n = vn;
2489  
2490      for (i = 0; i < opr_sz; i++) {
2491          uint32_t nn = n[i];
2492          intptr_t idx = extract32(nn, 0, 6);
2493          uint32_t exp = extract32(nn, 6, 8);
2494          d[i] = coeff[idx] | (exp << 23);
2495      }
2496  }
2497  
2498  void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2499  {
2500      /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2501      static const uint64_t coeff[] = {
2502          0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2503          0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2504          0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2505          0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2506          0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2507          0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2508          0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2509          0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2510          0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2511          0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2512          0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2513          0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2514          0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2515          0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2516          0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2517          0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2518          0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2519          0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2520          0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2521          0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2522          0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2523          0xFA7C1819E90D8ull,
2524      };
2525      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2526      uint64_t *d = vd, *n = vn;
2527  
2528      for (i = 0; i < opr_sz; i++) {
2529          uint64_t nn = n[i];
2530          intptr_t idx = extract32(nn, 0, 6);
2531          uint64_t exp = extract32(nn, 6, 11);
2532          d[i] = coeff[idx] | (exp << 52);
2533      }
2534  }
2535  
2536  void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2537  {
2538      intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2539      uint16_t *d = vd, *n = vn, *m = vm;
2540      for (i = 0; i < opr_sz; i += 1) {
2541          uint16_t nn = n[i];
2542          uint16_t mm = m[i];
2543          if (mm & 1) {
2544              nn = float16_one;
2545          }
2546          d[i] = nn ^ (mm & 2) << 14;
2547      }
2548  }
2549  
2550  void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2551  {
2552      intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2553      uint32_t *d = vd, *n = vn, *m = vm;
2554      for (i = 0; i < opr_sz; i += 1) {
2555          uint32_t nn = n[i];
2556          uint32_t mm = m[i];
2557          if (mm & 1) {
2558              nn = float32_one;
2559          }
2560          d[i] = nn ^ (mm & 2) << 30;
2561      }
2562  }
2563  
2564  void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2565  {
2566      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2567      uint64_t *d = vd, *n = vn, *m = vm;
2568      for (i = 0; i < opr_sz; i += 1) {
2569          uint64_t nn = n[i];
2570          uint64_t mm = m[i];
2571          if (mm & 1) {
2572              nn = float64_one;
2573          }
2574          d[i] = nn ^ (mm & 2) << 62;
2575      }
2576  }
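/*
 * FTSSEL behaviour, as summarised from the three helpers above: bit 0 of
 * the control element selects between the original value and 1.0, and
 * bit 1 flips the sign bit of the result.  For example, in the halfword
 * case mm = 3 turns any nn into float16 -1.0 (0xbc00).
 */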
2577  
2578  /*
2579   * Signed saturating addition with scalar operand.
2580   */
2581  
2582  void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2583  {
2584      intptr_t i, oprsz = simd_oprsz(desc);
2585  
2586      for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2587          *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2588      }
2589  }
2590  
2591  void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2592  {
2593      intptr_t i, oprsz = simd_oprsz(desc);
2594  
2595      for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2596          *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2597      }
2598  }
2599  
2600  void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2601  {
2602      intptr_t i, oprsz = simd_oprsz(desc);
2603  
2604      for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2605          *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2606      }
2607  }
2608  
2609  void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2610  {
2611      intptr_t i, oprsz = simd_oprsz(desc);
2612  
2613      for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2614          *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2615      }
2616  }
2617  
2618  /*
2619   * Unsigned saturating addition with scalar operand.
2620   */
2621  
2622  void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2623  {
2624      intptr_t i, oprsz = simd_oprsz(desc);
2625  
2626      for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2627          *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2628      }
2629  }
2630  
2631  void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2632  {
2633      intptr_t i, oprsz = simd_oprsz(desc);
2634  
2635      for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2636          *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2637      }
2638  }
2639  
2640  void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2641  {
2642      intptr_t i, oprsz = simd_oprsz(desc);
2643  
2644      for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2645          *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2646      }
2647  }
2648  
2649  void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2650  {
2651      intptr_t i, oprsz = simd_oprsz(desc);
2652  
2653      for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2654          *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2655      }
2656  }
2657  
2658  void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2659  {
2660      intptr_t i, oprsz = simd_oprsz(desc);
2661  
2662      for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2663          *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2664      }
2665  }
2666  
2667  /* Two operand predicated copy immediate with merge.  All valid immediates
2668   * can fit within 17 signed bits in the simd_data field.
2669   */
2670  void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2671                           uint64_t mm, uint32_t desc)
2672  {
2673      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2674      uint64_t *d = vd, *n = vn;
2675      uint8_t *pg = vg;
2676  
2677      mm = dup_const(MO_8, mm);
2678      for (i = 0; i < opr_sz; i += 1) {
2679          uint64_t nn = n[i];
2680          uint64_t pp = expand_pred_b(pg[H1(i)]);
2681          d[i] = (mm & pp) | (nn & ~pp);
2682      }
2683  }
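/*
 * Each byte of VG covers 8 bytes of data, one predicate bit per byte.
 * expand_pred_b() (defined elsewhere) turns those 8 bits into a 64-bit
 * byte mask; e.g. a predicate byte of 0x0f is assumed to expand to
 * 0x00000000ffffffff, so only the low four data bytes take the immediate.
 */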
2684  
2685  void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2686                           uint64_t mm, uint32_t desc)
2687  {
2688      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2689      uint64_t *d = vd, *n = vn;
2690      uint8_t *pg = vg;
2691  
2692      mm = dup_const(MO_16, mm);
2693      for (i = 0; i < opr_sz; i += 1) {
2694          uint64_t nn = n[i];
2695          uint64_t pp = expand_pred_h(pg[H1(i)]);
2696          d[i] = (mm & pp) | (nn & ~pp);
2697      }
2698  }
2699  
2700  void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2701                           uint64_t mm, uint32_t desc)
2702  {
2703      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2704      uint64_t *d = vd, *n = vn;
2705      uint8_t *pg = vg;
2706  
2707      mm = dup_const(MO_32, mm);
2708      for (i = 0; i < opr_sz; i += 1) {
2709          uint64_t nn = n[i];
2710          uint64_t pp = expand_pred_s(pg[H1(i)]);
2711          d[i] = (mm & pp) | (nn & ~pp);
2712      }
2713  }
2714  
2715  void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2716                           uint64_t mm, uint32_t desc)
2717  {
2718      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2719      uint64_t *d = vd, *n = vn;
2720      uint8_t *pg = vg;
2721  
2722      for (i = 0; i < opr_sz; i += 1) {
2723          uint64_t nn = n[i];
2724          d[i] = (pg[H1(i)] & 1 ? mm : nn);
2725      }
2726  }
2727  
2728  void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2729  {
2730      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2731      uint64_t *d = vd;
2732      uint8_t *pg = vg;
2733  
2734      val = dup_const(MO_8, val);
2735      for (i = 0; i < opr_sz; i += 1) {
2736          d[i] = val & expand_pred_b(pg[H1(i)]);
2737      }
2738  }
2739  
2740  void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2741  {
2742      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2743      uint64_t *d = vd;
2744      uint8_t *pg = vg;
2745  
2746      val = dup_const(MO_16, val);
2747      for (i = 0; i < opr_sz; i += 1) {
2748          d[i] = val & expand_pred_h(pg[H1(i)]);
2749      }
2750  }
2751  
2752  void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2753  {
2754      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2755      uint64_t *d = vd;
2756      uint8_t *pg = vg;
2757  
2758      val = dup_const(MO_32, val);
2759      for (i = 0; i < opr_sz; i += 1) {
2760          d[i] = val & expand_pred_s(pg[H1(i)]);
2761      }
2762  }
2763  
2764  void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2765  {
2766      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2767      uint64_t *d = vd;
2768      uint8_t *pg = vg;
2769  
2770      for (i = 0; i < opr_sz; i += 1) {
2771          d[i] = (pg[H1(i)] & 1 ? val : 0);
2772      }
2773  }
2774  
2775  /* Big-endian hosts need to frob the byte indices.  If the copy
2776   * happens to be 8-byte aligned, then no frobbing is necessary.
2777   */
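/* For example, with the byte-copy fallback below, a big-endian host must
 * use the adjusted index H1(i) (assumed here to be i ^ 7 on such hosts) so
 * that architectural byte i of the source lands in architectural byte i of
 * the destination, even though the two live at different host addresses
 * within their respective 8-byte chunks.
 */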
2778  static void swap_memmove(void *vd, void *vs, size_t n)
2779  {
2780      uintptr_t d = (uintptr_t)vd;
2781      uintptr_t s = (uintptr_t)vs;
2782      uintptr_t o = (d | s | n) & 7;
2783      size_t i;
2784  
2785  #if !HOST_BIG_ENDIAN
2786      o = 0;
2787  #endif
2788      switch (o) {
2789      case 0:
2790          memmove(vd, vs, n);
2791          break;
2792  
2793      case 4:
2794          if (d < s || d >= s + n) {
2795              for (i = 0; i < n; i += 4) {
2796                  *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2797              }
2798          } else {
2799              for (i = n; i > 0; ) {
2800                  i -= 4;
2801                  *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2802              }
2803          }
2804          break;
2805  
2806      case 2:
2807      case 6:
2808          if (d < s || d >= s + n) {
2809              for (i = 0; i < n; i += 2) {
2810                  *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2811              }
2812          } else {
2813              for (i = n; i > 0; ) {
2814                  i -= 2;
2815                  *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2816              }
2817          }
2818          break;
2819  
2820      default:
2821          if (d < s || d >= s + n) {
2822              for (i = 0; i < n; i++) {
2823                  *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2824              }
2825          } else {
2826              for (i = n; i > 0; ) {
2827                  i -= 1;
2828                  *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2829              }
2830          }
2831          break;
2832      }
2833  }
2834  
2835  /* Similarly for memset of 0.  */
2836  static void swap_memzero(void *vd, size_t n)
2837  {
2838      uintptr_t d = (uintptr_t)vd;
2839      uintptr_t o = (d | n) & 7;
2840      size_t i;
2841  
2842      /* Usually, the first bit of a predicate is set, so N is 0.  */
2843      if (likely(n == 0)) {
2844          return;
2845      }
2846  
2847  #if !HOST_BIG_ENDIAN
2848      o = 0;
2849  #endif
2850      switch (o) {
2851      case 0:
2852          memset(vd, 0, n);
2853          break;
2854  
2855      case 4:
2856          for (i = 0; i < n; i += 4) {
2857              *(uint32_t *)H1_4(d + i) = 0;
2858          }
2859          break;
2860  
2861      case 2:
2862      case 6:
2863          for (i = 0; i < n; i += 2) {
2864              *(uint16_t *)H1_2(d + i) = 0;
2865          }
2866          break;
2867  
2868      default:
2869          for (i = 0; i < n; i++) {
2870              *(uint8_t *)H1(d + i) = 0;
2871          }
2872          break;
2873      }
2874  }
2875  
2876  void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2877  {
2878      intptr_t opr_sz = simd_oprsz(desc);
2879      size_t n_ofs = simd_data(desc);
2880      size_t n_siz = opr_sz - n_ofs;
2881  
2882      if (vd != vm) {
2883          swap_memmove(vd, vn + n_ofs, n_siz);
2884          swap_memmove(vd + n_siz, vm, n_ofs);
2885      } else if (vd != vn) {
2886          swap_memmove(vd + n_siz, vd, n_ofs);
2887          swap_memmove(vd, vn + n_ofs, n_siz);
2888      } else {
2889          /* vd == vn == vm.  Need temp space.  */
2890          ARMVectorReg tmp;
2891          swap_memmove(&tmp, vm, n_ofs);
2892          swap_memmove(vd, vd + n_ofs, n_siz);
2893          memcpy(vd + n_siz, &tmp, n_ofs);
2894      }
2895  }
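/*
 * EXT behaviour, in terms of the moves above: the result is the last
 * (opr_sz - n_ofs) bytes of ZN followed by the first n_ofs bytes of ZM,
 * i.e. a sliding window over the concatenation ZN:ZM starting at byte
 * n_ofs.
 */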
2896  
2897  #define DO_INSR(NAME, TYPE, H) \
2898  void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2899  {                                                                  \
2900      intptr_t opr_sz = simd_oprsz(desc);                            \
2901      swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2902      *(TYPE *)(vd + H(0)) = val;                                    \
2903  }
2904  
2905  DO_INSR(sve_insr_b, uint8_t, H1)
2906  DO_INSR(sve_insr_h, uint16_t, H1_2)
2907  DO_INSR(sve_insr_s, uint32_t, H1_4)
2908  DO_INSR(sve_insr_d, uint64_t, H1_8)
2909  
2910  #undef DO_INSR
2911  
2912  void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2913  {
2914      intptr_t i, j, opr_sz = simd_oprsz(desc);
2915      for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2916          uint64_t f = *(uint64_t *)(vn + i);
2917          uint64_t b = *(uint64_t *)(vn + j);
2918          *(uint64_t *)(vd + i) = bswap64(b);
2919          *(uint64_t *)(vd + j) = bswap64(f);
2920      }
2921  }
2922  
2923  void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2924  {
2925      intptr_t i, j, opr_sz = simd_oprsz(desc);
2926      for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2927          uint64_t f = *(uint64_t *)(vn + i);
2928          uint64_t b = *(uint64_t *)(vn + j);
2929          *(uint64_t *)(vd + i) = hswap64(b);
2930          *(uint64_t *)(vd + j) = hswap64(f);
2931      }
2932  }
2933  
2934  void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2935  {
2936      intptr_t i, j, opr_sz = simd_oprsz(desc);
2937      for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2938          uint64_t f = *(uint64_t *)(vn + i);
2939          uint64_t b = *(uint64_t *)(vn + j);
2940          *(uint64_t *)(vd + i) = rol64(b, 32);
2941          *(uint64_t *)(vd + j) = rol64(f, 32);
2942      }
2943  }
2944  
2945  void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2946  {
2947      intptr_t i, j, opr_sz = simd_oprsz(desc);
2948      for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2949          uint64_t f = *(uint64_t *)(vn + i);
2950          uint64_t b = *(uint64_t *)(vn + j);
2951          *(uint64_t *)(vd + i) = b;
2952          *(uint64_t *)(vd + j) = f;
2953      }
2954  }
2955  
2956  typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2957  
2958  static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2959                             bool is_tbx, tb_impl_fn *fn)
2960  {
2961      ARMVectorReg scratch;
2962      uintptr_t oprsz = simd_oprsz(desc);
2963  
2964      if (unlikely(vd == vn)) {
2965          vn = memcpy(&scratch, vn, oprsz);
2966      }
2967  
2968      fn(vd, vn, NULL, vm, oprsz, is_tbx);
2969  }
2970  
2971  static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2972                             uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2973  {
2974      ARMVectorReg scratch;
2975      uintptr_t oprsz = simd_oprsz(desc);
2976  
2977      if (unlikely(vd == vn0)) {
2978          vn0 = memcpy(&scratch, vn0, oprsz);
2979          if (vd == vn1) {
2980              vn1 = vn0;
2981          }
2982      } else if (unlikely(vd == vn1)) {
2983          vn1 = memcpy(&scratch, vn1, oprsz);
2984      }
2985  
2986      fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2987  }
2988  
2989  #define DO_TB(SUFF, TYPE, H)                                            \
2990  static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
2991                                  void *vm, uintptr_t oprsz, bool is_tbx) \
2992  {                                                                       \
2993      TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
2994      uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
2995      for (i = 0; i < nelem; ++i) {                                       \
2996          TYPE index = indexes[H1(i)], val = 0;                           \
2997          if (index < nelem) {                                            \
2998              val = tbl0[H(index)];                                       \
2999          } else {                                                        \
3000              index -= nelem;                                             \
3001              if (tbl1 && index < nelem) {                                \
3002                  val = tbl1[H(index)];                                   \
3003              } else if (is_tbx) {                                        \
3004                  continue;                                               \
3005              }                                                           \
3006          }                                                               \
3007          d[H(i)] = val;                                                  \
3008      }                                                                   \
3009  }                                                                       \
3010  void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3011  {                                                                       \
3012      do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3013  }                                                                       \
3014  void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3015                               void *vm, uint32_t desc)                   \
3016  {                                                                       \
3017      do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3018  }                                                                       \
3019  void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3020  {                                                                       \
3021      do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3022  }
3023  
3024  DO_TB(b, uint8_t, H1)
3025  DO_TB(h, uint16_t, H2)
3026  DO_TB(s, uint32_t, H4)
3027  DO_TB(d, uint64_t, H8)
3028  
3029  #undef DO_TB
3030  
3031  #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3032  void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3033  {                                                              \
3034      intptr_t i, opr_sz = simd_oprsz(desc);                     \
3035      TYPED *d = vd;                                             \
3036      TYPES *n = vn;                                             \
3037      ARMVectorReg tmp;                                          \
3038      if (unlikely(vn - vd < opr_sz)) {                          \
3039          n = memcpy(&tmp, n, opr_sz / 2);                       \
3040      }                                                          \
3041      for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3042          d[HD(i)] = n[HS(i)];                                   \
3043      }                                                          \
3044  }
3045  
3046  DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3047  DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3048  DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3049  
3050  DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3051  DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3052  DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3053  
3054  #undef DO_UNPK
3055  
3056  /* Mask of bits included in the even numbered predicates of width esz.
3057   * We also use this for expand_bits/compress_bits, and so extend the
3058   * same pattern out to 16-bit units.
3059   */
3060  static const uint64_t even_bit_esz_masks[5] = {
3061      0x5555555555555555ull,
3062      0x3333333333333333ull,
3063      0x0f0f0f0f0f0f0f0full,
3064      0x00ff00ff00ff00ffull,
3065      0x0000ffff0000ffffull,
3066  };
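/*
 * For example, esz=1 (halfwords) means each element owns 2 predicate bits;
 * the even-numbered elements occupy bit pairs {0,1}, {4,5}, ..., which is
 * exactly the 0x3333... pattern above.
 */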
3067  
3068  /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3069   * For N==0, this corresponds to the operation that in qemu/bitops.h
3070   * we call half_shuffle64; this algorithm is from Hacker's Delight,
3071   * section 7-2 Shuffling Bits.
3072   */
3073  static uint64_t expand_bits(uint64_t x, int n)
3074  {
3075      int i;
3076  
3077      x &= 0xffffffffu;
3078      for (i = 4; i >= n; i--) {
3079          int sh = 1 << i;
3080          x = ((x << sh) | x) & even_bit_esz_masks[i];
3081      }
3082      return x;
3083  }
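/*
 * Worked example (illustrative): expand_bits(0xf, 0) spreads the four
 * input bits to the even bit positions, giving 0x55 (0b01010101).  With
 * n=1 the same input would instead be spread in units of two bits,
 * giving 0x33.
 */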
3084  
3085  /* Compress units of 2**(N+1) bits to units of 2**N bits.
3086   * For N==0, this corresponds to the operation that in qemu/bitops.h
3087   * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3088   * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3089   */
3090  static uint64_t compress_bits(uint64_t x, int n)
3091  {
3092      int i;
3093  
3094      for (i = n; i <= 4; i++) {
3095          int sh = 1 << i;
3096          x &= even_bit_esz_masks[i];
3097          x = (x >> sh) | x;
3098      }
3099      return x & 0xffffffffu;
3100  }
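/*
 * Worked example (illustrative): compress_bits(0x55, 0) == 0xf, undoing
 * the expand_bits() example above; the mask applied at each step discards
 * whatever sits in the odd-numbered units.
 */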
3101  
3102  void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3103  {
3104      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3105      int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3106      intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3107      int esize = 1 << esz;
3108      uint64_t *d = vd;
3109      intptr_t i;
3110  
3111      if (oprsz <= 8) {
3112          uint64_t nn = *(uint64_t *)vn;
3113          uint64_t mm = *(uint64_t *)vm;
3114          int half = 4 * oprsz;
3115  
3116          nn = extract64(nn, high * half, half);
3117          mm = extract64(mm, high * half, half);
3118          nn = expand_bits(nn, esz);
3119          mm = expand_bits(mm, esz);
3120          d[0] = nn | (mm << esize);
3121      } else {
3122          ARMPredicateReg tmp;
3123  
3124          /* We produce output faster than we consume input.
3125             Therefore we must be mindful of possible overlap.  */
3126          if (vd == vn) {
3127              vn = memcpy(&tmp, vn, oprsz);
3128              if (vd == vm) {
3129                  vm = vn;
3130              }
3131          } else if (vd == vm) {
3132              vm = memcpy(&tmp, vm, oprsz);
3133          }
3134          if (high) {
3135              high = oprsz >> 1;
3136          }
3137  
3138          if ((oprsz & 7) == 0) {
3139              uint32_t *n = vn, *m = vm;
3140              high >>= 2;
3141  
3142              for (i = 0; i < oprsz / 8; i++) {
3143                  uint64_t nn = n[H4(high + i)];
3144                  uint64_t mm = m[H4(high + i)];
3145  
3146                  nn = expand_bits(nn, esz);
3147                  mm = expand_bits(mm, esz);
3148                  d[i] = nn | (mm << esize);
3149              }
3150          } else {
3151              uint8_t *n = vn, *m = vm;
3152              uint16_t *d16 = vd;
3153  
3154              for (i = 0; i < oprsz / 2; i++) {
3155                  uint16_t nn = n[H1(high + i)];
3156                  uint16_t mm = m[H1(high + i)];
3157  
3158                  nn = expand_bits(nn, esz);
3159                  mm = expand_bits(mm, esz);
3160                  d16[H2(i)] = nn | (mm << esize);
3161              }
3162          }
3163      }
3164  }
3165  
3166  void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3167  {
3168      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3169      int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3170      int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3171      uint64_t *d = vd, *n = vn, *m = vm;
3172      uint64_t l, h;
3173      intptr_t i;
3174  
3175      if (oprsz <= 8) {
3176          l = compress_bits(n[0] >> odd, esz);
3177          h = compress_bits(m[0] >> odd, esz);
3178          d[0] = l | (h << (4 * oprsz));
3179      } else {
3180          ARMPredicateReg tmp_m;
3181          intptr_t oprsz_16 = oprsz / 16;
3182  
3183          if ((vm - vd) < (uintptr_t)oprsz) {
3184              m = memcpy(&tmp_m, vm, oprsz);
3185          }
3186  
3187          for (i = 0; i < oprsz_16; i++) {
3188              l = n[2 * i + 0];
3189              h = n[2 * i + 1];
3190              l = compress_bits(l >> odd, esz);
3191              h = compress_bits(h >> odd, esz);
3192              d[i] = l | (h << 32);
3193          }
3194  
3195          /*
3196   * When VL is not a multiple of 512, the results from M do not
3197           * align nicely with the uint64_t for D.  Put the aligned results
3198           * from M into TMP_M and then copy it into place afterward.
3199           */
3200          if (oprsz & 15) {
3201              int final_shift = (oprsz & 15) * 2;
3202  
3203              l = n[2 * i + 0];
3204              h = n[2 * i + 1];
3205              l = compress_bits(l >> odd, esz);
3206              h = compress_bits(h >> odd, esz);
3207              d[i] = l | (h << final_shift);
3208  
3209              for (i = 0; i < oprsz_16; i++) {
3210                  l = m[2 * i + 0];
3211                  h = m[2 * i + 1];
3212                  l = compress_bits(l >> odd, esz);
3213                  h = compress_bits(h >> odd, esz);
3214                  tmp_m.p[i] = l | (h << 32);
3215              }
3216              l = m[2 * i + 0];
3217              h = m[2 * i + 1];
3218              l = compress_bits(l >> odd, esz);
3219              h = compress_bits(h >> odd, esz);
3220              tmp_m.p[i] = l | (h << final_shift);
3221  
3222              swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3223          } else {
3224              for (i = 0; i < oprsz_16; i++) {
3225                  l = m[2 * i + 0];
3226                  h = m[2 * i + 1];
3227                  l = compress_bits(l >> odd, esz);
3228                  h = compress_bits(h >> odd, esz);
3229                  d[oprsz_16 + i] = l | (h << 32);
3230              }
3231          }
3232      }
3233  }
3234  
3235  void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3236  {
3237      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3238      int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3239      int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3240      uint64_t *d = vd, *n = vn, *m = vm;
3241      uint64_t mask;
3242      int shr, shl;
3243      intptr_t i;
3244  
3245      shl = 1 << esz;
3246      shr = 0;
3247      mask = even_bit_esz_masks[esz];
3248      if (odd) {
3249          mask <<= shl;
3250          shr = shl;
3251          shl = 0;
3252      }
3253  
3254      for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3255          uint64_t nn = (n[i] & mask) >> shr;
3256          uint64_t mm = (m[i] & mask) << shl;
3257          d[i] = nn + mm;
3258      }
3259  }
3260  
3261  /* Reverse units of 2**N bits.  */
3262  static uint64_t reverse_bits_64(uint64_t x, int n)
3263  {
3264      int i, sh;
3265  
3266      x = bswap64(x);
3267      for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3268          uint64_t mask = even_bit_esz_masks[i];
3269          x = ((x & mask) << sh) | ((x >> sh) & mask);
3270      }
3271      return x;
3272  }
3273  
3274  static uint8_t reverse_bits_8(uint8_t x, int n)
3275  {
3276      static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3277      int i, sh;
3278  
3279      for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3280          x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3281      }
3282      return x;
3283  }
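/*
 * Worked example (illustrative): reverse_bits_8(0xb1, 0) == 0x8d,
 * i.e. 10110001 -> 10001101.  With n=1 only the order of the 2-bit units
 * is reversed (0xb1 -> 0x4e), and with n=2 only the nibbles swap.
 */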
3284  
3285  void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3286  {
3287      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3288      int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3289      intptr_t i, oprsz_2 = oprsz / 2;
3290  
3291      if (oprsz <= 8) {
3292          uint64_t l = *(uint64_t *)vn;
3293          l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3294          *(uint64_t *)vd = l;
3295      } else if ((oprsz & 15) == 0) {
3296          for (i = 0; i < oprsz_2; i += 8) {
3297              intptr_t ih = oprsz - 8 - i;
3298              uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3299              uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3300              *(uint64_t *)(vd + i) = h;
3301              *(uint64_t *)(vd + ih) = l;
3302          }
3303      } else {
3304          for (i = 0; i < oprsz_2; i += 1) {
3305              intptr_t il = H1(i);
3306              intptr_t ih = H1(oprsz - 1 - i);
3307              uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3308              uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3309              *(uint8_t *)(vd + il) = h;
3310              *(uint8_t *)(vd + ih) = l;
3311          }
3312      }
3313  }
3314  
3315  void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3316  {
3317      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3318      intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3319      uint64_t *d = vd;
3320      intptr_t i;
3321  
3322      if (oprsz <= 8) {
3323          uint64_t nn = *(uint64_t *)vn;
3324          int half = 4 * oprsz;
3325  
3326          nn = extract64(nn, high * half, half);
3327          nn = expand_bits(nn, 0);
3328          d[0] = nn;
3329      } else {
3330          ARMPredicateReg tmp_n;
3331  
3332          /* We produce output faster than we consume input.
3333             Therefore we must be mindful of possible overlap.  */
3334          if ((vn - vd) < (uintptr_t)oprsz) {
3335              vn = memcpy(&tmp_n, vn, oprsz);
3336          }
3337          if (high) {
3338              high = oprsz >> 1;
3339          }
3340  
3341          if ((oprsz & 7) == 0) {
3342              uint32_t *n = vn;
3343              high >>= 2;
3344  
3345              for (i = 0; i < oprsz / 8; i++) {
3346                  uint64_t nn = n[H4(high + i)];
3347                  d[i] = expand_bits(nn, 0);
3348              }
3349          } else {
3350              uint16_t *d16 = vd;
3351              uint8_t *n = vn;
3352  
3353              for (i = 0; i < oprsz / 2; i++) {
3354                  uint16_t nn = n[H1(high + i)];
3355                  d16[H2(i)] = expand_bits(nn, 0);
3356              }
3357          }
3358      }
3359  }
3360  
3361  #define DO_ZIP(NAME, TYPE, H) \
3362  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3363  {                                                                    \
3364      intptr_t oprsz = simd_oprsz(desc);                               \
3365      intptr_t odd_ofs = simd_data(desc);                              \
3366      intptr_t i, oprsz_2 = oprsz / 2;                                 \
3367      ARMVectorReg tmp_n, tmp_m;                                       \
3368      /* We produce output faster than we consume input.               \
3369         Therefore we must be mindful of possible overlap.  */         \
3370      if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3371          vn = memcpy(&tmp_n, vn, oprsz);                              \
3372      }                                                                \
3373      if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3374          vm = memcpy(&tmp_m, vm, oprsz);                              \
3375      }                                                                \
3376      for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3377          *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3378          *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3379              *(TYPE *)(vm + odd_ofs + H(i));                          \
3380      }                                                                \
3381      if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3382          memset(vd + oprsz - 16, 0, 16);                              \
3383      }                                                                \
3384  }
3385  
3386  DO_ZIP(sve_zip_b, uint8_t, H1)
3387  DO_ZIP(sve_zip_h, uint16_t, H1_2)
3388  DO_ZIP(sve_zip_s, uint32_t, H1_4)
3389  DO_ZIP(sve_zip_d, uint64_t, H1_8)
3390  DO_ZIP(sve2_zip_q, Int128, )
3391  
3392  #define DO_UZP(NAME, TYPE, H) \
3393  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3394  {                                                                      \
3395      intptr_t oprsz = simd_oprsz(desc);                                 \
3396      intptr_t odd_ofs = simd_data(desc);                                \
3397      intptr_t i, p;                                                     \
3398      ARMVectorReg tmp_m;                                                \
3399      if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3400          vm = memcpy(&tmp_m, vm, oprsz);                                \
3401      }                                                                  \
3402      i = 0, p = odd_ofs;                                                \
3403      do {                                                               \
3404          *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3405          i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3406      } while (p < oprsz);                                               \
3407      p -= oprsz;                                                        \
3408      do {                                                               \
3409          *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3410          i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3411      } while (p < oprsz);                                               \
3412      tcg_debug_assert(i == oprsz);                                      \
3413  }
3414  
3415  DO_UZP(sve_uzp_b, uint8_t, H1)
3416  DO_UZP(sve_uzp_h, uint16_t, H1_2)
3417  DO_UZP(sve_uzp_s, uint32_t, H1_4)
3418  DO_UZP(sve_uzp_d, uint64_t, H1_8)
3419  DO_UZP(sve2_uzp_q, Int128, )
3420  
3421  #define DO_TRN(NAME, TYPE, H) \
3422  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3423  {                                                                      \
3424      intptr_t oprsz = simd_oprsz(desc);                                 \
3425      intptr_t odd_ofs = simd_data(desc);                                \
3426      intptr_t i;                                                        \
3427      for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3428          TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3429          TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3430          *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3431          *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3432      }                                                                  \
3433      if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3434          memset(vd + oprsz - 16, 0, 16);                                \
3435      }                                                                  \
3436  }
3437  
3438  DO_TRN(sve_trn_b, uint8_t, H1)
3439  DO_TRN(sve_trn_h, uint16_t, H1_2)
3440  DO_TRN(sve_trn_s, uint32_t, H1_4)
3441  DO_TRN(sve_trn_d, uint64_t, H1_8)
3442  DO_TRN(sve2_trn_q, Int128, )
3443  
3444  #undef DO_ZIP
3445  #undef DO_UZP
3446  #undef DO_TRN
3447  
3448  void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3449  {
3450      intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3451      uint32_t *d = vd, *n = vn;
3452      uint8_t *pg = vg;
3453  
3454      for (i = j = 0; i < opr_sz; i++) {
3455          if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3456              d[H4(j)] = n[H4(i)];
3457              j++;
3458          }
3459      }
3460      for (; j < opr_sz; j++) {
3461          d[H4(j)] = 0;
3462      }
3463  }
3464  
3465  void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3466  {
3467      intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3468      uint64_t *d = vd, *n = vn;
3469      uint8_t *pg = vg;
3470  
3471      for (i = j = 0; i < opr_sz; i++) {
3472          if (pg[H1(i)] & 1) {
3473              d[j] = n[i];
3474              j++;
3475          }
3476      }
3477      for (; j < opr_sz; j++) {
3478          d[j] = 0;
3479      }
3480  }
3481  
3482  /* Similar to the ARM LastActiveElement pseudocode function, except the
3483   * result is multiplied by the element size.  This includes the not found
3484   * indication; e.g. not found for esz=3 is -8.
3485   */
3486  int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3487  {
3488      intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3489      intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3490  
3491      return last_active_element(vg, words, esz);
3492  }
3493  
3494  void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3495  {
3496      intptr_t opr_sz = simd_oprsz(desc) / 8;
3497      int esz = simd_data(desc);
3498      uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3499      intptr_t i, first_i, last_i;
3500      ARMVectorReg tmp;
3501  
3502      first_i = last_i = 0;
3503      first_g = last_g = 0;
3504  
3505      /* Find the extent of the active elements within VG.  */
3506      for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3507          pg = *(uint64_t *)(vg + i) & mask;
3508          if (pg) {
3509              if (last_g == 0) {
3510                  last_g = pg;
3511                  last_i = i;
3512              }
3513              first_g = pg;
3514              first_i = i;
3515          }
3516      }
3517  
3518      len = 0;
3519      if (first_g != 0) {
3520          first_i = first_i * 8 + ctz64(first_g);
3521          last_i = last_i * 8 + 63 - clz64(last_g);
3522          len = last_i - first_i + (1 << esz);
3523          if (vd == vm) {
3524              vm = memcpy(&tmp, vm, opr_sz * 8);
3525          }
3526          swap_memmove(vd, vn + first_i, len);
3527      }
3528      swap_memmove(vd + len, vm, opr_sz * 8 - len);
3529  }
3530  
3531  void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3532                              void *vg, uint32_t desc)
3533  {
3534      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3535      uint64_t *d = vd, *n = vn, *m = vm;
3536      uint8_t *pg = vg;
3537  
3538      for (i = 0; i < opr_sz; i += 1) {
3539          uint64_t nn = n[i], mm = m[i];
3540          uint64_t pp = expand_pred_b(pg[H1(i)]);
3541          d[i] = (nn & pp) | (mm & ~pp);
3542      }
3543  }
3544  
3545  void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3546                              void *vg, uint32_t desc)
3547  {
3548      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3549      uint64_t *d = vd, *n = vn, *m = vm;
3550      uint8_t *pg = vg;
3551  
3552      for (i = 0; i < opr_sz; i += 1) {
3553          uint64_t nn = n[i], mm = m[i];
3554          uint64_t pp = expand_pred_h(pg[H1(i)]);
3555          d[i] = (nn & pp) | (mm & ~pp);
3556      }
3557  }
3558  
3559  void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3560                              void *vg, uint32_t desc)
3561  {
3562      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3563      uint64_t *d = vd, *n = vn, *m = vm;
3564      uint8_t *pg = vg;
3565  
3566      for (i = 0; i < opr_sz; i += 1) {
3567          uint64_t nn = n[i], mm = m[i];
3568          uint64_t pp = expand_pred_s(pg[H1(i)]);
3569          d[i] = (nn & pp) | (mm & ~pp);
3570      }
3571  }
3572  
3573  void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3574                              void *vg, uint32_t desc)
3575  {
3576      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3577      uint64_t *d = vd, *n = vn, *m = vm;
3578      uint8_t *pg = vg;
3579  
3580      for (i = 0; i < opr_sz; i += 1) {
3581          uint64_t nn = n[i], mm = m[i];
3582          d[i] = (pg[H1(i)] & 1 ? nn : mm);
3583      }
3584  }
3585  
3586  void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3587                              void *vg, uint32_t desc)
3588  {
3589      intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3590      Int128 *d = vd, *n = vn, *m = vm;
3591      uint16_t *pg = vg;
3592  
3593      for (i = 0; i < opr_sz; i += 1) {
3594          d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3595      }
3596  }
3597  
3598  /* Two operand comparison controlled by a predicate.
3599   * ??? It is very tempting to want to be able to expand this inline
3600   * with x86 instructions, e.g.
3601   *
3602   *    vcmpeqw    zm, zn, %ymm0
3603   *    vpmovmskb  %ymm0, %eax
3604   *    and        $0x5555, %eax
3605   *    and        pg, %eax
3606   *
3607   * or even aarch64, e.g.
3608   *
3609   *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3610   *    cmeq       v0.8h, zn, zm
3611   *    and        v0.8h, v0.8h, mask
3612   *    addv       h0, v0.8h
3613   *    and        v0.8b, pg
3614   *
3615   * However, coming up with an abstraction that allows vector inputs and
3616   * a scalar output, and also handles the byte-ordering of sub-uint64_t
3617   * scalar outputs, is tricky.
3618   */
3619  #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3620  uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3621  {                                                                            \
3622      intptr_t opr_sz = simd_oprsz(desc);                                      \
3623      uint32_t flags = PREDTEST_INIT;                                          \
3624      intptr_t i = opr_sz;                                                     \
3625      do {                                                                     \
3626          uint64_t out = 0, pg;                                                \
3627          do {                                                                 \
3628              i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3629              TYPE nn = *(TYPE *)(vn + H(i));                                  \
3630              TYPE mm = *(TYPE *)(vm + H(i));                                  \
3631              out |= nn OP mm;                                                 \
3632          } while (i & 63);                                                    \
3633          pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3634          out &= pg;                                                           \
3635          *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3636          flags = iter_predtest_bwd(out, pg, flags);                           \
3637      } while (i > 0);                                                         \
3638      return flags;                                                            \
3639  }
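/*
 * Note on the packing above: because the inner loop walks backward and
 * shifts OUT left by sizeof(TYPE) each step, the compare result for the
 * element at byte offset k within each group of up to 64 bytes ends up in
 * bit k of OUT, which is exactly where that element's predicate bit lives;
 * the per-size MASK values below then keep only those canonical bit
 * positions.
 */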
3640  
3641  #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3642      DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3643  #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3644      DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3645  #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3646      DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3647  #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3648      DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3649  
3650  DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3651  DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3652  DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3653  DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3654  
3655  DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3656  DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3657  DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3658  DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3659  
3660  DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3661  DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3662  DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3663  DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3664  
3665  DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3666  DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3667  DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3668  DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3669  
3670  DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3671  DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3672  DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3673  DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3674  
3675  DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3676  DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3677  DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3678  DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3679  
3680  #undef DO_CMP_PPZZ_B
3681  #undef DO_CMP_PPZZ_H
3682  #undef DO_CMP_PPZZ_S
3683  #undef DO_CMP_PPZZ_D
3684  #undef DO_CMP_PPZZ
3685  
3686  /* Similar, but the second source is "wide".  */
3687  #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3688  uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3689  {                                                                            \
3690      intptr_t opr_sz = simd_oprsz(desc);                                      \
3691      uint32_t flags = PREDTEST_INIT;                                          \
3692      intptr_t i = opr_sz;                                                     \
3693      do {                                                                     \
3694          uint64_t out = 0, pg;                                                \
3695          do {                                                                 \
3696              TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3697              do {                                                             \
3698                  i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3699                  TYPE nn = *(TYPE *)(vn + H(i));                              \
3700                  out |= nn OP mm;                                             \
3701              } while (i & 7);                                                 \
3702          } while (i & 63);                                                    \
3703          pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3704          out &= pg;                                                           \
3705          *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3706          flags = iter_predtest_bwd(out, pg, flags);                           \
3707      } while (i > 0);                                                         \
3708      return flags;                                                            \
3709  }
3710  
3711  #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3712      DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3713  #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3714      DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3715  #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3716      DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3717  
3718  DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3719  DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3720  DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3721  
3722  DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3723  DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3724  DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3725  
3726  DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3727  DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3728  DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3729  
3730  DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3731  DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3732  DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3733  
3734  DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3735  DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3736  DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3737  
3738  DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3739  DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3740  DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3741  
3742  DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3743  DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3744  DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3745  
3746  DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3747  DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3748  DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3749  
3750  DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3751  DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3752  DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3753  
3754  DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3755  DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3756  DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3757  
3758  #undef DO_CMP_PPZW_B
3759  #undef DO_CMP_PPZW_H
3760  #undef DO_CMP_PPZW_S
3761  #undef DO_CMP_PPZW
3762  
3763  /* Similar, but the second source is immediate.  */
3764  #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3765  uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3766  {                                                                    \
3767      intptr_t opr_sz = simd_oprsz(desc);                              \
3768      uint32_t flags = PREDTEST_INIT;                                  \
3769      TYPE mm = simd_data(desc);                                       \
3770      intptr_t i = opr_sz;                                             \
3771      do {                                                             \
3772          uint64_t out = 0, pg;                                        \
3773          do {                                                         \
3774              i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3775              TYPE nn = *(TYPE *)(vn + H(i));                          \
3776              out |= nn OP mm;                                         \
3777          } while (i & 63);                                            \
3778          pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3779          out &= pg;                                                   \
3780          *(uint64_t *)(vd + (i >> 3)) = out;                          \
3781          flags = iter_predtest_bwd(out, pg, flags);                   \
3782      } while (i > 0);                                                 \
3783      return flags;                                                    \
3784  }
3785  
3786  #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3787      DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3788  #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3789      DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3790  #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3791      DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3792  #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3793      DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3794  
3795  DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3796  DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3797  DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3798  DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3799  
3800  DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3801  DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3802  DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3803  DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3804  
3805  DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3806  DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3807  DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3808  DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3809  
3810  DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3811  DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3812  DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3813  DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3814  
3815  DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3816  DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3817  DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3818  DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3819  
3820  DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3821  DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3822  DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3823  DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3824  
3825  DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3826  DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3827  DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3828  DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3829  
3830  DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
3831  DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3832  DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3833  DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3834  
3835  DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
3836  DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3837  DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3838  DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3839  
3840  DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
3841  DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3842  DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3843  DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3844  
3845  #undef DO_CMP_PPZI_B
3846  #undef DO_CMP_PPZI_H
3847  #undef DO_CMP_PPZI_S
3848  #undef DO_CMP_PPZI_D
3849  #undef DO_CMP_PPZI
3850  
3851  /* Similar to the ARM LastActive pseudocode function.  */
3852  static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3853  {
3854      intptr_t i;
3855  
3856      for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3857          uint64_t pg = *(uint64_t *)(vg + i);
3858          if (pg) {
3859              return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3860          }
3861      }
3862      return 0;
3863  }
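/* pow2floor(pg) isolates the most significant guard bit, i.e. the last
 * active element within the word: with pg = 0x12 it yields 0x10, so the
 * result is whether bit 4 of the corresponding D word is also set.
 */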
3864  
3865  /* Compute a mask into RETB that is true for all G, up to and including
3866   * (if after) or excluding (if !after) the first G & N.
3867   * Return true if BRK found.
3868   */
3869  static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3870                          bool brk, bool after)
3871  {
3872      uint64_t b;
3873  
3874      if (brk) {
3875          b = 0;
3876      } else if ((g & n) == 0) {
3877          /* For all G, no N are set; break not found.  */
3878          b = g;
3879      } else {
3880          /* Break somewhere in N.  Locate it.  */
3881          b = g & n;            /* guard true, pred true */
3882          b = b & -b;           /* first such */
3883          if (after) {
3884              b = b | (b - 1);  /* break after same */
3885          } else {
3886              b = b - 1;        /* break before same */
3887          }
3888          brk = true;
3889      }
3890  
3891      *retb = b;
3892      return brk;
3893  }
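/* Worked example: with g = 0xff, n = 0x10 and no break pending,
 * g & n = 0x10 and b & -b isolates that first true element.  For
 * "break after", b | (b - 1) = 0x1f keeps elements 0..4 active;
 * for "break before", b - 1 = 0x0f stops just short of element 4.
 */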
3894  
3895  /* Compute a zeroing BRK.  */
3896  static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3897                            intptr_t oprsz, bool after)
3898  {
3899      bool brk = false;
3900      intptr_t i;
3901  
3902      for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3903          uint64_t this_b, this_g = g[i];
3904  
3905          brk = compute_brk(&this_b, n[i], this_g, brk, after);
3906          d[i] = this_b & this_g;
3907      }
3908  }
3909  
3910  /* Likewise, but also compute flags.  */
3911  static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3912                                 intptr_t oprsz, bool after)
3913  {
3914      uint32_t flags = PREDTEST_INIT;
3915      bool brk = false;
3916      intptr_t i;
3917  
3918      for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3919          uint64_t this_b, this_d, this_g = g[i];
3920  
3921          brk = compute_brk(&this_b, n[i], this_g, brk, after);
3922          d[i] = this_d = this_b & this_g;
3923          flags = iter_predtest_fwd(this_d, this_g, flags);
3924      }
3925      return flags;
3926  }
3927  
3928  /* Compute a merging BRK.  */
3929  static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3930                            intptr_t oprsz, bool after)
3931  {
3932      bool brk = false;
3933      intptr_t i;
3934  
3935      for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3936          uint64_t this_b, this_g = g[i];
3937  
3938          brk = compute_brk(&this_b, n[i], this_g, brk, after);
3939          d[i] = (this_b & this_g) | (d[i] & ~this_g);
3940      }
3941  }
3942  
3943  /* Likewise, but also compute flags.  */
3944  static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3945                                 intptr_t oprsz, bool after)
3946  {
3947      uint32_t flags = PREDTEST_INIT;
3948      bool brk = false;
3949      intptr_t i;
3950  
3951      for (i = 0; i < oprsz / 8; ++i) {
3952          uint64_t this_b, this_d = d[i], this_g = g[i];
3953  
3954          brk = compute_brk(&this_b, n[i], this_g, brk, after);
3955          d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3956          flags = iter_predtest_fwd(this_d, this_g, flags);
3957      }
3958      return flags;
3959  }
3960  
3961  static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3962  {
3963      /* It is quicker to zero the whole predicate than loop on OPRSZ.
3964       * The compiler should turn this into 4 64-bit integer stores.
3965       */
3966      memset(d, 0, sizeof(ARMPredicateReg));
3967      return PREDTEST_INIT;
3968  }
3969  
3970  void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3971                         uint32_t pred_desc)
3972  {
3973      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3974      if (last_active_pred(vn, vg, oprsz)) {
3975          compute_brk_z(vd, vm, vg, oprsz, true);
3976      } else {
3977          do_zero(vd, oprsz);
3978      }
3979  }
3980  
3981  uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3982                              uint32_t pred_desc)
3983  {
3984      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3985      if (last_active_pred(vn, vg, oprsz)) {
3986          return compute_brks_z(vd, vm, vg, oprsz, true);
3987      } else {
3988          return do_zero(vd, oprsz);
3989      }
3990  }
3991  
3992  void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3993                         uint32_t pred_desc)
3994  {
3995      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3996      if (last_active_pred(vn, vg, oprsz)) {
3997          compute_brk_z(vd, vm, vg, oprsz, false);
3998      } else {
3999          do_zero(vd, oprsz);
4000      }
4001  }
4002  
4003  uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4004                              uint32_t pred_desc)
4005  {
4006      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4007      if (last_active_pred(vn, vg, oprsz)) {
4008          return compute_brks_z(vd, vm, vg, oprsz, false);
4009      } else {
4010          return do_zero(vd, oprsz);
4011      }
4012  }
4013  
4014  void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4015  {
4016      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4017      compute_brk_z(vd, vn, vg, oprsz, true);
4018  }
4019  
4020  uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4021  {
4022      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4023      return compute_brks_z(vd, vn, vg, oprsz, true);
4024  }
4025  
4026  void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4027  {
4028      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4029      compute_brk_z(vd, vn, vg, oprsz, false);
4030  }
4031  
4032  uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4033  {
4034      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4035      return compute_brks_z(vd, vn, vg, oprsz, false);
4036  }
4037  
4038  void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4039  {
4040      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4041      compute_brk_m(vd, vn, vg, oprsz, true);
4042  }
4043  
4044  uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4045  {
4046      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4047      return compute_brks_m(vd, vn, vg, oprsz, true);
4048  }
4049  
4050  void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4051  {
4052      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4053      compute_brk_m(vd, vn, vg, oprsz, false);
4054  }
4055  
4056  uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4057  {
4058      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4059      return compute_brks_m(vd, vn, vg, oprsz, false);
4060  }
4061  
4062  void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4063  {
4064      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4065      if (!last_active_pred(vn, vg, oprsz)) {
4066          do_zero(vd, oprsz);
4067      }
4068  }
4069  
4070  /* As if PredTest(Ones(PL), D, esz).  */
4071  static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4072                                uint64_t esz_mask)
4073  {
4074      uint32_t flags = PREDTEST_INIT;
4075      intptr_t i;
4076  
4077      for (i = 0; i < oprsz / 8; i++) {
4078          flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4079      }
4080      if (oprsz & 7) {
4081          uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4082          flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4083      }
4084      return flags;
4085  }
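/* Predicate sizes need not be a multiple of 8 bytes, so any tail word is
 * tested with only the low 8 * (oprsz & 7) bits of the element mask;
 * e.g. oprsz == 6 keeps just the low 48 bits.
 */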
4086  
4087  uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4088  {
4089      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4090      if (last_active_pred(vn, vg, oprsz)) {
4091          return predtest_ones(vd, oprsz, -1);
4092      } else {
4093          return do_zero(vd, oprsz);
4094      }
4095  }
4096  
4097  uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4098  {
4099      intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4100      intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4101      uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4102      intptr_t i;
4103  
4104      for (i = 0; i < words; ++i) {
4105          uint64_t t = n[i] & g[i] & mask;
4106          sum += ctpop64(t);
4107      }
4108      return sum;
4109  }
4110  
4111  uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4112  {
4113      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4114      intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4115      uint64_t esz_mask = pred_esz_masks[esz];
4116      ARMPredicateReg *d = vd;
4117      uint32_t flags;
4118      intptr_t i;
4119  
4120      /* Begin with a zero predicate register.  */
4121      flags = do_zero(d, oprsz);
4122      if (count == 0) {
4123          return flags;
4124      }
4125  
4126      /* Set all of the requested bits.  */
4127      for (i = 0; i < count / 64; ++i) {
4128          d->p[i] = esz_mask;
4129      }
4130      if (count & 63) {
4131          d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4132      }
4133  
4134      return predtest_ones(d, oprsz, esz_mask);
4135  }
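/* Note that COUNT is a count of predicate bits (one per vector byte),
 * i.e. the element count pre-scaled by the element size; esz_mask then
 * thins the low COUNT bits down to one active bit per element.
 */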
4136  
4137  uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4138  {
4139      intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4140      intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4141      uint64_t esz_mask = pred_esz_masks[esz];
4142      ARMPredicateReg *d = vd;
4143      intptr_t i, invcount, oprbits;
4144      uint64_t bits;
4145  
4146      if (count == 0) {
4147          return do_zero(d, oprsz);
4148      }
4149  
4150      oprbits = oprsz * 8;
4151      tcg_debug_assert(count <= oprbits);
4152  
4153      bits = esz_mask;
4154      if (oprbits & 63) {
4155          bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4156      }
4157  
4158      invcount = oprbits - count;
4159      for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4160          d->p[i] = bits;
4161          bits = esz_mask;
4162      }
4163  
4164      d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4165  
4166      while (--i >= 0) {
4167          d->p[i] = 0;
4168      }
4169  
4170      return predtest_ones(d, oprsz, esz_mask);
4171  }
4172  
4173  /* Recursive reduction on a function;
4174   * Cf. the ARM ARM function ReducePredicated.
4175   *
4176   * While it would be possible to write this without the DATA temporary,
4177   * it is much simpler to process the predicate register this way.
4178   * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4179   * little to gain with a more complex non-recursive form.
4180   */
4181  #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
4182  static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4183  {                                                                     \
4184      if (n == 1) {                                                     \
4185          return *data;                                                 \
4186      } else {                                                          \
4187          uintptr_t half = n / 2;                                       \
4188          TYPE lo = NAME##_reduce(data, status, half);                  \
4189          TYPE hi = NAME##_reduce(data + half, status, half);           \
4190          return TYPE##_##FUNC(lo, hi, status);                         \
4191      }                                                                 \
4192  }                                                                     \
4193  uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
4194  {                                                                     \
4195      uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4196      TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4197      for (i = 0; i < oprsz; ) {                                        \
4198          uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4199          do {                                                          \
4200              TYPE nn = *(TYPE *)(vn + H(i));                           \
4201              *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
4202              i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4203          } while (i & 15);                                             \
4204      }                                                                 \
4205      for (; i < maxsz; i += sizeof(TYPE)) {                            \
4206          *(TYPE *)((void *)data + i) = IDENT;                          \
4207      }                                                                 \
4208      return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
4209  }
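/* For the halving recursion above to visit every lane, MAXSZ (taken from
 * simd_data) must be a power-of-two multiple of the element size;
 * inactive and trailing lanes are seeded with IDENT so they cannot
 * disturb the result.
 */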
4210  
4211  DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4212  DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4213  DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4214  
4215  /* Identity is floatN_default_nan, without the function call.  */
4216  DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4217  DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4218  DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4219  
4220  DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4221  DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4222  DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4223  
4224  DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4225  DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4226  DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4227  
4228  DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4229  DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4230  DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
4231  
4232  #undef DO_REDUCE
4233  
4234  uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4235                               void *status, uint32_t desc)
4236  {
4237      intptr_t i = 0, opr_sz = simd_oprsz(desc);
4238      float16 result = nn;
4239  
4240      do {
4241          uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4242          do {
4243              if (pg & 1) {
4244                  float16 mm = *(float16 *)(vm + H1_2(i));
4245                  result = float16_add(result, mm, status);
4246              }
4247              i += sizeof(float16), pg >>= sizeof(float16);
4248          } while (i & 15);
4249      } while (i < opr_sz);
4250  
4251      return result;
4252  }
4253  
4254  uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4255                               void *status, uint32_t desc)
4256  {
4257      intptr_t i = 0, opr_sz = simd_oprsz(desc);
4258      float32 result = nn;
4259  
4260      do {
4261          uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4262          do {
4263              if (pg & 1) {
4264                  float32 mm = *(float32 *)(vm + H1_2(i));
4265                  result = float32_add(result, mm, status);
4266              }
4267              i += sizeof(float32), pg >>= sizeof(float32);
4268          } while (i & 15);
4269      } while (i < opr_sz);
4270  
4271      return result;
4272  }
4273  
4274  uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4275                               void *status, uint32_t desc)
4276  {
4277      intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4278      uint64_t *m = vm;
4279      uint8_t *pg = vg;
4280  
4281      for (i = 0; i < opr_sz; i++) {
4282          if (pg[H1(i)] & 1) {
4283              nn = float64_add(nn, m[i], status);
4284          }
4285      }
4286  
4287      return nn;
4288  }
4289  
4290  /* Fully general three-operand expander, controlled by a predicate,
4291   * with the extra float_status parameter.
4292   */
4293  #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4294  void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4295                    void *status, uint32_t desc)                  \
4296  {                                                               \
4297      intptr_t i = simd_oprsz(desc);                              \
4298      uint64_t *g = vg;                                           \
4299      do {                                                        \
4300          uint64_t pg = g[(i - 1) >> 6];                          \
4301          do {                                                    \
4302              i -= sizeof(TYPE);                                  \
4303              if (likely((pg >> (i & 63)) & 1)) {                 \
4304                  TYPE nn = *(TYPE *)(vn + H(i));                 \
4305                  TYPE mm = *(TYPE *)(vm + H(i));                 \
4306                  *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4307              }                                                   \
4308          } while (i & 63);                                       \
4309      } while (i != 0);                                           \
4310  }
4311  
4312  DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4313  DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4314  DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4315  
4316  DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4317  DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4318  DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4319  
4320  DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4321  DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4322  DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4323  
4324  DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4325  DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4326  DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4327  
4328  DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4329  DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4330  DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4331  
4332  DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4333  DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4334  DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4335  
4336  DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4337  DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4338  DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4339  
4340  DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4341  DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4342  DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4343  
4344  static inline float16 abd_h(float16 a, float16 b, float_status *s)
4345  {
4346      return float16_abs(float16_sub(a, b, s));
4347  }
4348  
4349  static inline float32 abd_s(float32 a, float32 b, float_status *s)
4350  {
4351      return float32_abs(float32_sub(a, b, s));
4352  }
4353  
4354  static inline float64 abd_d(float64 a, float64 b, float_status *s)
4355  {
4356      return float64_abs(float64_sub(a, b, s));
4357  }
4358  
4359  DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4360  DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4361  DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4362  
4363  static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4364  {
4365      int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4366      return float64_scalbn(a, b_int, s);
4367  }
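/* Only the 64-bit variant needs the clamp: an int64_t shift count can
 * exceed the int argument range of float64_scalbn, whereas 16- and
 * 32-bit element values always fit.
 */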
4368  
4369  DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4370  DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4371  DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4372  
4373  DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4374  DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4375  DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4376  
4377  #undef DO_ZPZZ_FP
4378  
4379  /* Three-operand expander, with one scalar operand, controlled by
4380   * a predicate, with the extra float_status parameter.
4381   */
4382  #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4383  void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4384                    void *status, uint32_t desc)                    \
4385  {                                                                 \
4386      intptr_t i = simd_oprsz(desc);                                \
4387      uint64_t *g = vg;                                             \
4388      TYPE mm = scalar;                                             \
4389      do {                                                          \
4390          uint64_t pg = g[(i - 1) >> 6];                            \
4391          do {                                                      \
4392              i -= sizeof(TYPE);                                    \
4393              if (likely((pg >> (i & 63)) & 1)) {                   \
4394                  TYPE nn = *(TYPE *)(vn + H(i));                   \
4395                  *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4396              }                                                     \
4397          } while (i & 63);                                         \
4398      } while (i != 0);                                             \
4399  }
4400  
4401  DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4402  DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4403  DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4404  
4405  DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4406  DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4407  DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4408  
4409  DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4410  DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4411  DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4412  
4413  static inline float16 subr_h(float16 a, float16 b, float_status *s)
4414  {
4415      return float16_sub(b, a, s);
4416  }
4417  
4418  static inline float32 subr_s(float32 a, float32 b, float_status *s)
4419  {
4420      return float32_sub(b, a, s);
4421  }
4422  
4423  static inline float64 subr_d(float64 a, float64 b, float_status *s)
4424  {
4425      return float64_sub(b, a, s);
4426  }
4427  
4428  DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4429  DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4430  DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4431  
4432  DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4433  DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4434  DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4435  
4436  DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4437  DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4438  DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4439  
4440  DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4441  DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4442  DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4443  
4444  DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4445  DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4446  DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4447  
4448  /* Fully general two-operand expander, controlled by a predicate,
4449   * with the extra float_status parameter.
4450   */
4451  #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4452  void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4453  {                                                                     \
4454      intptr_t i = simd_oprsz(desc);                                    \
4455      uint64_t *g = vg;                                                 \
4456      do {                                                              \
4457          uint64_t pg = g[(i - 1) >> 6];                                \
4458          do {                                                          \
4459              i -= sizeof(TYPE);                                        \
4460              if (likely((pg >> (i & 63)) & 1)) {                       \
4461                  TYPE nn = *(TYPE *)(vn + H(i));                       \
4462                  *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4463              }                                                         \
4464          } while (i & 63);                                             \
4465      } while (i != 0);                                                 \
4466  }
4467  
4468  /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4469   * FZ16.  When converting from fp16, this affects flushing input denormals;
4470   * when converting to fp16, this affects flushing output denormals.
4471   */
4472  static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4473  {
4474      bool save = get_flush_inputs_to_zero(fpst);
4475      float32 ret;
4476  
4477      set_flush_inputs_to_zero(false, fpst);
4478      ret = float16_to_float32(f, true, fpst);
4479      set_flush_inputs_to_zero(save, fpst);
4480      return ret;
4481  }
4482  
4483  static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4484  {
4485      bool save = get_flush_inputs_to_zero(fpst);
4486      float64 ret;
4487  
4488      set_flush_inputs_to_zero(false, fpst);
4489      ret = float16_to_float64(f, true, fpst);
4490      set_flush_inputs_to_zero(save, fpst);
4491      return ret;
4492  }
4493  
4494  static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4495  {
4496      bool save = get_flush_to_zero(fpst);
4497      float16 ret;
4498  
4499      set_flush_to_zero(false, fpst);
4500      ret = float32_to_float16(f, true, fpst);
4501      set_flush_to_zero(save, fpst);
4502      return ret;
4503  }
4504  
4505  static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4506  {
4507      bool save = get_flush_to_zero(fpst);
4508      float16 ret;
4509  
4510      set_flush_to_zero(false, fpst);
4511      ret = float64_to_float16(f, true, fpst);
4512      set_flush_to_zero(save, fpst);
4513      return ret;
4514  }
4515  
4516  static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4517  {
4518      if (float16_is_any_nan(f)) {
4519          float_raise(float_flag_invalid, s);
4520          return 0;
4521      }
4522      return float16_to_int16_round_to_zero(f, s);
4523  }
4524  
4525  static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4526  {
4527      if (float16_is_any_nan(f)) {
4528          float_raise(float_flag_invalid, s);
4529          return 0;
4530      }
4531      return float16_to_int64_round_to_zero(f, s);
4532  }
4533  
4534  static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4535  {
4536      if (float32_is_any_nan(f)) {
4537          float_raise(float_flag_invalid, s);
4538          return 0;
4539      }
4540      return float32_to_int64_round_to_zero(f, s);
4541  }
4542  
4543  static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4544  {
4545      if (float64_is_any_nan(f)) {
4546          float_raise(float_flag_invalid, s);
4547          return 0;
4548      }
4549      return float64_to_int64_round_to_zero(f, s);
4550  }
4551  
4552  static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4553  {
4554      if (float16_is_any_nan(f)) {
4555          float_raise(float_flag_invalid, s);
4556          return 0;
4557      }
4558      return float16_to_uint16_round_to_zero(f, s);
4559  }
4560  
4561  static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4562  {
4563      if (float16_is_any_nan(f)) {
4564          float_raise(float_flag_invalid, s);
4565          return 0;
4566      }
4567      return float16_to_uint64_round_to_zero(f, s);
4568  }
4569  
4570  static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4571  {
4572      if (float32_is_any_nan(f)) {
4573          float_raise(float_flag_invalid, s);
4574          return 0;
4575      }
4576      return float32_to_uint64_round_to_zero(f, s);
4577  }
4578  
4579  static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4580  {
4581      if (float64_is_any_nan(f)) {
4582          float_raise(float_flag_invalid, s);
4583          return 0;
4584      }
4585      return float64_to_uint64_round_to_zero(f, s);
4586  }
4587  
4588  DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4589  DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4590  DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4591  DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4592  DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4593  DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4594  DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4595  
4596  DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4597  DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4598  DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4599  DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4600  DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4601  DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4602  DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4603  
4604  DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4605  DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4606  DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4607  DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4608  DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4609  DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4610  DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4611  
4612  DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4613  DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4614  DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4615  
4616  DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4617  DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4618  DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4619  
4620  DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4621  DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4622  DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4623  
4624  DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4625  DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4626  DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4627  
4628  DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4629  DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4630  DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4631  DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4632  DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4633  DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4634  DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4635  
4636  DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4637  DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4638  DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4639  DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4640  DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4641  DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4642  DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4643  
4644  static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4645  {
4646      /* Extract frac to the top of the uint32_t. */
4647      uint32_t frac = (uint32_t)a << (16 + 6);
4648      int16_t exp = extract32(a, 10, 5);
4649  
4650      if (unlikely(exp == 0)) {
4651          if (frac != 0) {
4652              if (!get_flush_inputs_to_zero(s)) {
4653                  /* denormal: bias - fractional_zeros */
4654                  return -15 - clz32(frac);
4655              }
4656              /* flush to zero */
4657              float_raise(float_flag_input_denormal, s);
4658          }
4659      } else if (unlikely(exp == 0x1f)) {
4660          if (frac == 0) {
4661              return INT16_MAX; /* infinity */
4662          }
4663      } else {
4664          /* normal: exp - bias */
4665          return exp - 15;
4666      }
4667      /* nan or zero */
4668      float_raise(float_flag_invalid, s);
4669      return INT16_MIN;
4670  }
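/* Examples: float16 1.0 (0x3c00) has exp = 15 and returns 0; the smallest
 * subnormal 0x0001 has frac = 1 << 22, clz32(frac) = 9, and returns
 * -15 - 9 = -24, matching log2(2^-24).
 */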
4671  
4672  static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4673  {
4674      /* Extract frac to the top of the uint32_t. */
4675      uint32_t frac = a << 9;
4676      int32_t exp = extract32(a, 23, 8);
4677  
4678      if (unlikely(exp == 0)) {
4679          if (frac != 0) {
4680              if (!get_flush_inputs_to_zero(s)) {
4681                  /* denormal: bias - fractional_zeros */
4682                  return -127 - clz32(frac);
4683              }
4684              /* flush to zero */
4685              float_raise(float_flag_input_denormal, s);
4686          }
4687      } else if (unlikely(exp == 0xff)) {
4688          if (frac == 0) {
4689              return INT32_MAX; /* infinity */
4690          }
4691      } else {
4692          /* normal: exp - bias */
4693          return exp - 127;
4694      }
4695      /* nan or zero */
4696      float_raise(float_flag_invalid, s);
4697      return INT32_MIN;
4698  }
4699  
4700  static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4701  {
4702      /* Extract frac to the top of the uint64_t. */
4703      uint64_t frac = a << 12;
4704      int64_t exp = extract64(a, 52, 11);
4705  
4706      if (unlikely(exp == 0)) {
4707          if (frac != 0) {
4708              if (!get_flush_inputs_to_zero(s)) {
4709                  /* denormal: bias - fractional_zeros */
4710                  return -1023 - clz64(frac);
4711              }
4712              /* flush to zero */
4713              float_raise(float_flag_input_denormal, s);
4714          }
4715      } else if (unlikely(exp == 0x7ff)) {
4716          if (frac == 0) {
4717              return INT64_MAX; /* infinity */
4718          }
4719      } else {
4720          /* normal: exp - bias */
4721          return exp - 1023;
4722      }
4723      /* nan or zero */
4724      float_raise(float_flag_invalid, s);
4725      return INT64_MIN;
4726  }
4727  
4728  DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4729  DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4730  DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4731  
4732  #undef DO_ZPZ_FP
4733  
4734  static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4735                              float_status *status, uint32_t desc,
4736                              uint16_t neg1, uint16_t neg3)
4737  {
4738      intptr_t i = simd_oprsz(desc);
4739      uint64_t *g = vg;
4740  
4741      do {
4742          uint64_t pg = g[(i - 1) >> 6];
4743          do {
4744              i -= 2;
4745              if (likely((pg >> (i & 63)) & 1)) {
4746                  float16 e1, e2, e3, r;
4747  
4748                  e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4749                  e2 = *(uint16_t *)(vm + H1_2(i));
4750                  e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4751                  r = float16_muladd(e1, e2, e3, 0, status);
4752                  *(uint16_t *)(vd + H1_2(i)) = r;
4753              }
4754          } while (i & 63);
4755      } while (i != 0);
4756  }
4757  
4758  void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4759                                void *vg, void *status, uint32_t desc)
4760  {
4761      do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4762  }
4763  
4764  void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4765                                void *vg, void *status, uint32_t desc)
4766  {
4767      do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4768  }
4769  
4770  void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4771                                 void *vg, void *status, uint32_t desc)
4772  {
4773      do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4774  }
4775  
4776  void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4777                                 void *vg, void *status, uint32_t desc)
4778  {
4779      do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4780  }
4781  
4782  static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4783                              float_status *status, uint32_t desc,
4784                              uint32_t neg1, uint32_t neg3)
4785  {
4786      intptr_t i = simd_oprsz(desc);
4787      uint64_t *g = vg;
4788  
4789      do {
4790          uint64_t pg = g[(i - 1) >> 6];
4791          do {
4792              i -= 4;
4793              if (likely((pg >> (i & 63)) & 1)) {
4794                  float32 e1, e2, e3, r;
4795  
4796                  e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4797                  e2 = *(uint32_t *)(vm + H1_4(i));
4798                  e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4799                  r = float32_muladd(e1, e2, e3, 0, status);
4800                  *(uint32_t *)(vd + H1_4(i)) = r;
4801              }
4802          } while (i & 63);
4803      } while (i != 0);
4804  }
4805  
4806  void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4807                                void *vg, void *status, uint32_t desc)
4808  {
4809      do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4810  }
4811  
4812  void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4813                                void *vg, void *status, uint32_t desc)
4814  {
4815      do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4816  }
4817  
4818  void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4819                                 void *vg, void *status, uint32_t desc)
4820  {
4821      do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4822  }
4823  
4824  void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4825                                 void *vg, void *status, uint32_t desc)
4826  {
4827      do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4828  }
4829  
4830  static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4831                              float_status *status, uint32_t desc,
4832                              uint64_t neg1, uint64_t neg3)
4833  {
4834      intptr_t i = simd_oprsz(desc);
4835      uint64_t *g = vg;
4836  
4837      do {
4838          uint64_t pg = g[(i - 1) >> 6];
4839          do {
4840              i -= 8;
4841              if (likely((pg >> (i & 63)) & 1)) {
4842                  float64 e1, e2, e3, r;
4843  
4844                  e1 = *(uint64_t *)(vn + i) ^ neg1;
4845                  e2 = *(uint64_t *)(vm + i);
4846                  e3 = *(uint64_t *)(va + i) ^ neg3;
4847                  r = float64_muladd(e1, e2, e3, 0, status);
4848                  *(uint64_t *)(vd + i) = r;
4849              }
4850          } while (i & 63);
4851      } while (i != 0);
4852  }
4853  
4854  void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4855                                void *vg, void *status, uint32_t desc)
4856  {
4857      do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4858  }
4859  
4860  void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4861                                void *vg, void *status, uint32_t desc)
4862  {
4863      do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4864  }
4865  
4866  void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4867                                 void *vg, void *status, uint32_t desc)
4868  {
4869      do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4870  }
4871  
4872  void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4873                                 void *vg, void *status, uint32_t desc)
4874  {
4875      do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4876  }
4877  
4878  /* Two operand floating-point comparison controlled by a predicate.
4879   * Unlike the integer version, we are not allowed to optimistically
4880   * compare operands, since the comparison may have side effects wrt
4881   * the FPSR.
4882   */
4883  #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
4884  void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
4885                    void *status, uint32_t desc)                          \
4886  {                                                                       \
4887      intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
4888      uint64_t *d = vd, *g = vg;                                          \
4889      do {                                                                \
4890          uint64_t out = 0, pg = g[j];                                    \
4891          do {                                                            \
4892              i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
4893              if (likely((pg >> (i & 63)) & 1)) {                         \
4894                  TYPE nn = *(TYPE *)(vn + H(i));                         \
4895                  TYPE mm = *(TYPE *)(vm + H(i));                         \
4896                  out |= OP(TYPE, nn, mm, status);                        \
4897              }                                                           \
4898          } while (i & 63);                                               \
4899          d[j--] = out;                                                   \
4900      } while (i > 0);                                                    \
4901  }
4902  
4903  #define DO_FPCMP_PPZZ_H(NAME, OP) \
4904      DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4905  #define DO_FPCMP_PPZZ_S(NAME, OP) \
4906      DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4907  #define DO_FPCMP_PPZZ_D(NAME, OP) \
4908      DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4909  
4910  #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4911      DO_FPCMP_PPZZ_H(NAME, OP)   \
4912      DO_FPCMP_PPZZ_S(NAME, OP)   \
4913      DO_FPCMP_PPZZ_D(NAME, OP)
4914  
4915  #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
4916  #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
4917  #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
4918  #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
4919  #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
4920  #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
4921  #define DO_FCMUO(TYPE, X, Y, ST)  \
4922      TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4923  #define DO_FACGE(TYPE, X, Y, ST)  \
4924      TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4925  #define DO_FACGT(TYPE, X, Y, ST)  \
4926      TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
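/* The softfloat compares return float_relation_{less,equal,greater,
 * unordered} = -1, 0, +1, 2, so "<= 0" with swapped operands implements
 * >=, and an unordered result (2) makes every ordering predicate false.
 * EQ, NE and UO use the quiet compare so that quiet NaNs do not raise
 * Invalid Operation.
 */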
4927  
4928  DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4929  DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4930  DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4931  DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4932  DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4933  DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4934  DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4935  
4936  #undef DO_FPCMP_PPZZ_ALL
4937  #undef DO_FPCMP_PPZZ_D
4938  #undef DO_FPCMP_PPZZ_S
4939  #undef DO_FPCMP_PPZZ_H
4940  #undef DO_FPCMP_PPZZ
4941  
4942  /* One operand floating-point comparison against zero, controlled
4943   * by a predicate.
4944   */
4945  #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
4946  void HELPER(NAME)(void *vd, void *vn, void *vg,            \
4947                    void *status, uint32_t desc)             \
4948  {                                                          \
4949      intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
4950      uint64_t *d = vd, *g = vg;                             \
4951      do {                                                   \
4952          uint64_t out = 0, pg = g[j];                       \
4953          do {                                               \
4954              i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
4955              if ((pg >> (i & 63)) & 1) {                    \
4956                  TYPE nn = *(TYPE *)(vn + H(i));            \
4957                  out |= OP(TYPE, nn, 0, status);            \
4958              }                                              \
4959          } while (i & 63);                                  \
4960          d[j--] = out;                                      \
4961      } while (i > 0);                                       \
4962  }
4963  
4964  #define DO_FPCMP_PPZ0_H(NAME, OP) \
4965      DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4966  #define DO_FPCMP_PPZ0_S(NAME, OP) \
4967      DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4968  #define DO_FPCMP_PPZ0_D(NAME, OP) \
4969      DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4970  
4971  #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4972      DO_FPCMP_PPZ0_H(NAME, OP)   \
4973      DO_FPCMP_PPZ0_S(NAME, OP)   \
4974      DO_FPCMP_PPZ0_D(NAME, OP)
4975  
4976  DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4977  DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4978  DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4979  DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4980  DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4981  DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4982  
4983  /* FP Trig Multiply-Add. */
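/* Per the Arm pseudocode for FTMAD, the coefficient tables below hold the
 * sine-series terms in their first eight entries and the cosine-series
 * terms in the last eight; a negative multiplicand selects the second
 * half (xx += 8) and is replaced by its absolute value.
 */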
4984  
4985  void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4986  {
4987      static const float16 coeff[16] = {
4988          0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4989          0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4990      };
4991      intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4992      intptr_t x = simd_data(desc);
4993      float16 *d = vd, *n = vn, *m = vm;
4994      for (i = 0; i < opr_sz; i++) {
4995          float16 mm = m[i];
4996          intptr_t xx = x;
4997          if (float16_is_neg(mm)) {
4998              mm = float16_abs(mm);
4999              xx += 8;
5000          }
5001          d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5002      }
5003  }
5004  
5005  void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5006  {
5007      static const float32 coeff[16] = {
5008          0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5009          0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5010          0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5011          0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5012      };
5013      intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5014      intptr_t x = simd_data(desc);
5015      float32 *d = vd, *n = vn, *m = vm;
5016      for (i = 0; i < opr_sz; i++) {
5017          float32 mm = m[i];
5018          intptr_t xx = x;
5019          if (float32_is_neg(mm)) {
5020              mm = float32_abs(mm);
5021              xx += 8;
5022          }
5023          d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5024      }
5025  }
5026  
5027  void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5028  {
5029      static const float64 coeff[16] = {
5030          0x3ff0000000000000ull, 0xbfc5555555555543ull,
5031          0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5032          0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5033          0x3de5d8408868552full, 0x0000000000000000ull,
5034          0x3ff0000000000000ull, 0xbfe0000000000000ull,
5035          0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5036          0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5037          0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5038      };
5039      intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5040      intptr_t x = simd_data(desc);
5041      float64 *d = vd, *n = vn, *m = vm;
5042      for (i = 0; i < opr_sz; i++) {
5043          float64 mm = m[i];
5044          intptr_t xx = x;
5045          if (float64_is_neg(mm)) {
5046              mm = float64_abs(mm);
5047              xx += 8;
5048          }
5049          d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5050      }
5051  }
5052  
5053  /*
5054   * FP Complex Add
5055   */
5056  
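/* simd_data(desc) selects the rotation: 0 adds the second operand rotated
 * by 90 degrees (real -= imag, imag += real), 1 by 270 degrees
 * (real += imag, imag -= real); the neg_real/neg_imag pair below applies
 * the corresponding sign flips.
 */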
5057  void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5058                           void *vs, uint32_t desc)
5059  {
5060      intptr_t j, i = simd_oprsz(desc);
5061      uint64_t *g = vg;
5062      float16 neg_imag = float16_set_sign(0, simd_data(desc));
5063      float16 neg_real = float16_chs(neg_imag);
5064  
5065      do {
5066          uint64_t pg = g[(i - 1) >> 6];
5067          do {
5068              float16 e0, e1, e2, e3;
5069  
5070              /* I holds the real index; J holds the imag index.  */
5071              j = i - sizeof(float16);
5072              i -= 2 * sizeof(float16);
5073  
5074              e0 = *(float16 *)(vn + H1_2(i));
5075              e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5076              e2 = *(float16 *)(vn + H1_2(j));
5077              e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5078  
5079              if (likely((pg >> (i & 63)) & 1)) {
5080                  *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5081              }
5082              if (likely((pg >> (j & 63)) & 1)) {
5083                  *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5084              }
5085          } while (i & 63);
5086      } while (i != 0);
5087  }
5088  
5089  void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5090                           void *vs, uint32_t desc)
5091  {
5092      intptr_t j, i = simd_oprsz(desc);
5093      uint64_t *g = vg;
5094      float32 neg_imag = float32_set_sign(0, simd_data(desc));
5095      float32 neg_real = float32_chs(neg_imag);
5096  
5097      do {
5098          uint64_t pg = g[(i - 1) >> 6];
5099          do {
5100              float32 e0, e1, e2, e3;
5101  
5102              /* I holds the real index; J holds the imag index.  */
5103              j = i - sizeof(float32);
5104              i -= 2 * sizeof(float32);
5105  
5106              e0 = *(float32 *)(vn + H1_2(i));
5107              e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5108              e2 = *(float32 *)(vn + H1_2(j));
5109              e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5110  
5111              if (likely((pg >> (i & 63)) & 1)) {
5112                  *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5113              }
5114              if (likely((pg >> (j & 63)) & 1)) {
5115                  *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5116              }
5117          } while (i & 63);
5118      } while (i != 0);
5119  }
5120  
5121  void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5122                           void *vs, uint32_t desc)
5123  {
5124      intptr_t j, i = simd_oprsz(desc);
5125      uint64_t *g = vg;
5126      float64 neg_imag = float64_set_sign(0, simd_data(desc));
5127      float64 neg_real = float64_chs(neg_imag);
5128  
5129      do {
5130          uint64_t pg = g[(i - 1) >> 6];
5131          do {
5132              float64 e0, e1, e2, e3;
5133  
5134              /* I holds the real index; J holds the imag index.  */
5135              j = i - sizeof(float64);
5136              i -= 2 * sizeof(float64);
5137  
5138              e0 = *(float64 *)(vn + H1_2(i));
5139              e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5140              e2 = *(float64 *)(vn + H1_2(j));
5141              e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5142  
5143              if (likely((pg >> (i & 63)) & 1)) {
5144                  *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5145              }
5146              if (likely((pg >> (j & 63)) & 1)) {
5147                  *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5148              }
5149          } while (i & 63);
5150      } while (i != 0);
5151  }
5152  
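/*
 * Note on the sve_fcadd_* helpers above: simd_data(desc) is a single
 * rotate bit.  With the bit clear, neg_imag is +0 and neg_real has the
 * sign bit set, so the XORs compute
 *     d[real] = n[real] - m[imag];    d[imag] = n[imag] + m[real];
 * which is FCADD with a 90-degree rotation.  With the bit set the signs
 * swap, giving
 *     d[real] = n[real] + m[imag];    d[imag] = n[imag] - m[real];
 * i.e. the 270-degree rotation.  (Summary derived from the code above;
 * the rotate bit itself is placed in the descriptor by the translator.)
 */
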
5153  /*
5154   * FP Complex Multiply
5155   */
5156  
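/*
 * Note on the rot encoding used by the sve_fcmla_zpzzz_* helpers below:
 * simd_data(desc) holds the FCMLA rotation as a 2-bit value rot = 0..3,
 * meaning 0/90/180/270 degrees.  "flip" selects whether the element taken
 * from Zn is the real (rot even) or imaginary (rot odd) part; neg_real is
 * a sign-flip applied to the Zm operand feeding the real result (rot 1
 * and 2) and neg_imag the same for the imaginary result (rot 2 and 3).
 * Expanding the code below gives:
 *     rot 0:   d_re += n_re * m_re;   d_im += n_re * m_im;
 *     rot 90:  d_re -= n_im * m_im;   d_im += n_im * m_re;
 *     rot 180: d_re -= n_re * m_re;   d_im -= n_re * m_im;
 *     rot 270: d_re += n_im * m_im;   d_im -= n_im * m_re;
 */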
5157  void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5158                                 void *vg, void *status, uint32_t desc)
5159  {
5160      intptr_t j, i = simd_oprsz(desc);
5161      unsigned rot = simd_data(desc);
5162      bool flip = rot & 1;
5163      float16 neg_imag, neg_real;
5164      uint64_t *g = vg;
5165  
5166      neg_imag = float16_set_sign(0, (rot & 2) != 0);
5167      neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5168  
5169      do {
5170          uint64_t pg = g[(i - 1) >> 6];
5171          do {
5172              float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5173  
5174              /* I holds the real index; J holds the imag index.  */
5175              j = i - sizeof(float16);
5176              i -= 2 * sizeof(float16);
5177  
5178              nr = *(float16 *)(vn + H1_2(i));
5179              ni = *(float16 *)(vn + H1_2(j));
5180              mr = *(float16 *)(vm + H1_2(i));
5181              mi = *(float16 *)(vm + H1_2(j));
5182  
5183              e2 = (flip ? ni : nr);
5184              e1 = (flip ? mi : mr) ^ neg_real;
5185              e4 = e2;
5186              e3 = (flip ? mr : mi) ^ neg_imag;
5187  
5188              if (likely((pg >> (i & 63)) & 1)) {
5189                  d = *(float16 *)(va + H1_2(i));
5190                  d = float16_muladd(e2, e1, d, 0, status);
5191                  *(float16 *)(vd + H1_2(i)) = d;
5192              }
5193              if (likely((pg >> (j & 63)) & 1)) {
5194                  d = *(float16 *)(va + H1_2(j));
5195                  d = float16_muladd(e4, e3, d, 0, status);
5196                  *(float16 *)(vd + H1_2(j)) = d;
5197              }
5198          } while (i & 63);
5199      } while (i != 0);
5200  }
5201  
5202  void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5203                                 void *vg, void *status, uint32_t desc)
5204  {
5205      intptr_t j, i = simd_oprsz(desc);
5206      unsigned rot = simd_data(desc);
5207      bool flip = rot & 1;
5208      float32 neg_imag, neg_real;
5209      uint64_t *g = vg;
5210  
5211      neg_imag = float32_set_sign(0, (rot & 2) != 0);
5212      neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5213  
5214      do {
5215          uint64_t pg = g[(i - 1) >> 6];
5216          do {
5217              float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5218  
5219              /* I holds the real index; J holds the imag index.  */
5220              j = i - sizeof(float32);
5221              i -= 2 * sizeof(float32);
5222  
5223              nr = *(float32 *)(vn + H1_2(i));
5224              ni = *(float32 *)(vn + H1_2(j));
5225              mr = *(float32 *)(vm + H1_2(i));
5226              mi = *(float32 *)(vm + H1_2(j));
5227  
5228              e2 = (flip ? ni : nr);
5229              e1 = (flip ? mi : mr) ^ neg_real;
5230              e4 = e2;
5231              e3 = (flip ? mr : mi) ^ neg_imag;
5232  
5233              if (likely((pg >> (i & 63)) & 1)) {
5234                  d = *(float32 *)(va + H1_2(i));
5235                  d = float32_muladd(e2, e1, d, 0, status);
5236                  *(float32 *)(vd + H1_2(i)) = d;
5237              }
5238              if (likely((pg >> (j & 63)) & 1)) {
5239                  d = *(float32 *)(va + H1_2(j));
5240                  d = float32_muladd(e4, e3, d, 0, status);
5241                  *(float32 *)(vd + H1_2(j)) = d;
5242              }
5243          } while (i & 63);
5244      } while (i != 0);
5245  }
5246  
5247  void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5248                                 void *vg, void *status, uint32_t desc)
5249  {
5250      intptr_t j, i = simd_oprsz(desc);
5251      unsigned rot = simd_data(desc);
5252      bool flip = rot & 1;
5253      float64 neg_imag, neg_real;
5254      uint64_t *g = vg;
5255  
5256      neg_imag = float64_set_sign(0, (rot & 2) != 0);
5257      neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5258  
5259      do {
5260          uint64_t pg = g[(i - 1) >> 6];
5261          do {
5262              float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5263  
5264              /* I holds the real index; J holds the imag index.  */
5265              j = i - sizeof(float64);
5266              i -= 2 * sizeof(float64);
5267  
5268              nr = *(float64 *)(vn + H1_2(i));
5269              ni = *(float64 *)(vn + H1_2(j));
5270              mr = *(float64 *)(vm + H1_2(i));
5271              mi = *(float64 *)(vm + H1_2(j));
5272  
5273              e2 = (flip ? ni : nr);
5274              e1 = (flip ? mi : mr) ^ neg_real;
5275              e4 = e2;
5276              e3 = (flip ? mr : mi) ^ neg_imag;
5277  
5278              if (likely((pg >> (i & 63)) & 1)) {
5279                  d = *(float64 *)(va + H1_2(i));
5280                  d = float64_muladd(e2, e1, d, 0, status);
5281                  *(float64 *)(vd + H1_2(i)) = d;
5282              }
5283              if (likely((pg >> (j & 63)) & 1)) {
5284                  d = *(float64 *)(va + H1_2(j));
5285                  d = float64_muladd(e4, e3, d, 0, status);
5286                  *(float64 *)(vd + H1_2(j)) = d;
5287              }
5288          } while (i & 63);
5289      } while (i != 0);
5290  }
5291  
5292  /*
5293   * Load contiguous data, protected by a governing predicate.
5294   */
5295  
5296  /*
5297   * Skip through a sequence of inactive elements in the guarding predicate @vg,
5298   * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
5299   * element >= @reg_off, or @reg_max if there were no active elements at all.
5300   */
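/*
 * For example, with esz == MO_32 only every 4th predicate bit is
 * significant (pred_esz_masks[2] keeps bits 0, 4, 8, ...).  If
 * reg_off == 4 and vg[0] == 0x100, then pg == 0x10 after masking and
 * shifting: bit 0 is clear, so the ctz64 step advances by 4 and the
 * function returns 8, the offset of the next active element.
 * (Illustrative values only.)
 */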
5301  static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5302                                   intptr_t reg_max, int esz)
5303  {
5304      uint64_t pg_mask = pred_esz_masks[esz];
5305      uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5306  
5307      /* In normal usage, the first element is active.  */
5308      if (likely(pg & 1)) {
5309          return reg_off;
5310      }
5311  
5312      if (pg == 0) {
5313          reg_off &= -64;
5314          do {
5315              reg_off += 64;
5316              if (unlikely(reg_off >= reg_max)) {
5317                  /* The entire predicate was false.  */
5318                  return reg_max;
5319              }
5320              pg = vg[reg_off >> 6] & pg_mask;
5321          } while (pg == 0);
5322      }
5323      reg_off += ctz64(pg);
5324  
5325      /* We should never see an out of range predicate bit set.  */
5326      tcg_debug_assert(reg_off < reg_max);
5327      return reg_off;
5328  }
5329  
5330  /*
5331   * Resolve the guest virtual address to info->host and info->flags.
5332   * If @nofault, return false if the page is invalid, otherwise
5333   * exit via page fault exception.
5334   */
5335  
5336  bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5337                      target_ulong addr, int mem_off, MMUAccessType access_type,
5338                      int mmu_idx, uintptr_t retaddr)
5339  {
5340      int flags;
5341  
5342      addr += mem_off;
5343  
5344      /*
5345       * User-only currently always runs with TBI enabled.  See the comment
5346       * above useronly_clean_ptr.  Usually we clean this top byte away
5347       * during translation, but we can't do that for e.g. vector + imm
5348       * addressing modes.
5349       *
5350       * We currently always enable TBI for user-only, and do not provide
5351       * a way to turn it off.  So clean the pointer unconditionally here,
5352       * rather than look it up here, or pass it down from above.
5353       */
5354      addr = useronly_clean_ptr(addr);
5355  
5356  #ifdef CONFIG_USER_ONLY
5357      flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5358                                 &info->host, retaddr);
5359  #else
5360      CPUTLBEntryFull *full;
5361      flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5362                                &info->host, &full, retaddr);
5363  #endif
5364      info->flags = flags;
5365  
5366      if (flags & TLB_INVALID_MASK) {
5367          g_assert(nofault);
5368          return false;
5369      }
5370  
5371  #ifdef CONFIG_USER_ONLY
5372      memset(&info->attrs, 0, sizeof(info->attrs));
5373      /* Require both ANON and MTE; see allocation_tag_mem(). */
5374      info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5375  #else
5376      info->attrs = full->attrs;
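    /* MAIR attr 0xf0 == Tagged Normal WB; only such memory is MTE-checked. */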
5377      info->tagged = full->extra.arm.pte_attrs == 0xf0;
5378  #endif
5379  
5380      /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5381      info->host -= mem_off;
5382      return true;
5383  }
5384  
5385  /*
5386   * Find first active element on each page, and a loose bound for the
5387   * final element on each page.  Identify any single element that spans
5388   * the page boundary.  Return true if there are any active elements.
5389   */
5390  bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5391                              intptr_t reg_max, int esz, int msize)
5392  {
5393      const int esize = 1 << esz;
5394      const uint64_t pg_mask = pred_esz_masks[esz];
5395      intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5396      intptr_t mem_off_last, mem_off_split;
5397      intptr_t page_split, elt_split;
5398      intptr_t i;
5399  
5400      /* Set all of the element indices to -1, and the TLB data to 0. */
5401      memset(info, -1, offsetof(SVEContLdSt, page));
5402      memset(info->page, 0, sizeof(info->page));
5403  
5404      /* Gross scan over the entire predicate to find bounds. */
5405      i = 0;
5406      do {
5407          uint64_t pg = vg[i] & pg_mask;
5408          if (pg) {
5409              reg_off_last = i * 64 + 63 - clz64(pg);
5410              if (reg_off_first < 0) {
5411                  reg_off_first = i * 64 + ctz64(pg);
5412              }
5413          }
5414      } while (++i * 64 < reg_max);
5415  
5416      if (unlikely(reg_off_first < 0)) {
5417          /* No active elements, no pages touched. */
5418          return false;
5419      }
5420      tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5421  
5422      info->reg_off_first[0] = reg_off_first;
5423      info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5424      mem_off_last = (reg_off_last >> esz) * msize;
5425  
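    /*
     * -(addr | TARGET_PAGE_MASK) is the number of bytes from addr up to
     * the end of its page.  For example, with 4K pages, addr ending in
     * 0xffa and msize == 4: page_split == 6, so one whole element fits on
     * the first page and the element at mem offset 4 spans the boundary
     * (page_split % msize == 2, detected below).
     */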
5426      page_split = -(addr | TARGET_PAGE_MASK);
5427      if (likely(mem_off_last + msize <= page_split)) {
5428          /* The entire operation fits within a single page. */
5429          info->reg_off_last[0] = reg_off_last;
5430          return true;
5431      }
5432  
5433      info->page_split = page_split;
5434      elt_split = page_split / msize;
5435      reg_off_split = elt_split << esz;
5436      mem_off_split = elt_split * msize;
5437  
5438      /*
5439       * This is the last full element on the first page, but it is not
5440       * necessarily active.  If there is no full element, i.e. the first
5441       * active element is the one that's split, this value remains -1.
5442       * It is useful as iteration bounds.
5443       */
5444      if (elt_split != 0) {
5445          info->reg_off_last[0] = reg_off_split - esize;
5446      }
5447  
5448      /* Determine if an unaligned element spans the pages.  */
5449      if (page_split % msize != 0) {
5450          /* It is helpful to know if the split element is active. */
5451          if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5452              info->reg_off_split = reg_off_split;
5453              info->mem_off_split = mem_off_split;
5454  
5455              if (reg_off_split == reg_off_last) {
5456                  /* The page crossing element is last. */
5457                  return true;
5458              }
5459          }
5460          reg_off_split += esize;
5461          mem_off_split += msize;
5462      }
5463  
5464      /*
5465       * We do want the first active element on the second page, because
5466       * this may affect the address reported in an exception.
5467       */
5468      reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5469      tcg_debug_assert(reg_off_split <= reg_off_last);
5470      info->reg_off_first[1] = reg_off_split;
5471      info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5472      info->reg_off_last[1] = reg_off_last;
5473      return true;
5474  }
5475  
5476  /*
5477   * Resolve the guest virtual addresses to info->page[].
5478   * Control the generation of page faults with @fault.  Return false if
5479   * there is no work to do, which can only happen with @fault == FAULT_NO.
5480   */
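/*
 * The SVEContFault argument selects how aggressively faults may be taken:
 * FAULT_ALL is used by normal loads and stores, where every active element
 * is allowed to fault; FAULT_FIRST by first-fault loads, where only the
 * first active element may fault; FAULT_NO by no-fault loads, which must
 * not fault at all and instead report failure via the FFR.
 */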
5481  bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5482                           CPUARMState *env, target_ulong addr,
5483                           MMUAccessType access_type, uintptr_t retaddr)
5484  {
5485      int mmu_idx = arm_env_mmu_index(env);
5486      int mem_off = info->mem_off_first[0];
5487      bool nofault = fault == FAULT_NO;
5488      bool have_work = true;
5489  
5490      if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5491                          access_type, mmu_idx, retaddr)) {
5492          /* No work to be done. */
5493          return false;
5494      }
5495  
5496      if (likely(info->page_split < 0)) {
5497          /* The entire operation was on the one page. */
5498          return true;
5499      }
5500  
5501      /*
5502       * If the second page is invalid, then we want the fault address to be
5503       * the first byte on that page which is accessed.
5504       */
5505      if (info->mem_off_split >= 0) {
5506          /*
5507           * There is an element split across the pages.  The fault address
5508           * should be the first byte of the second page.
5509           */
5510          mem_off = info->page_split;
5511          /*
5512           * If the split element is also the first active element
5513           * of the vector, then:  For first-fault we should continue
5514           * to generate faults for the second page.  For no-fault,
5515           * we have work only if the second page is valid.
5516           */
5517          if (info->mem_off_first[0] < info->mem_off_split) {
5518              nofault = FAULT_FIRST;
5519              have_work = false;
5520          }
5521      } else {
5522          /*
5523           * There is no element split across the pages.  The fault address
5524           * should be the first active element on the second page.
5525           */
5526          mem_off = info->mem_off_first[1];
5527          /*
5528           * There must have been one active element on the first page,
5529           * so we're out of first-fault territory.
5530           */
5531          nofault = fault != FAULT_ALL;
5532      }
5533  
5534      have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5535                                  access_type, mmu_idx, retaddr);
5536      return have_work;
5537  }
5538  
5539  #ifndef CONFIG_USER_ONLY
5540  void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5541                                 uint64_t *vg, target_ulong addr,
5542                                 int esize, int msize, int wp_access,
5543                                 uintptr_t retaddr)
5544  {
5545      intptr_t mem_off, reg_off, reg_last;
5546      int flags0 = info->page[0].flags;
5547      int flags1 = info->page[1].flags;
5548  
5549      if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5550          return;
5551      }
5552  
5553      /* Indicate that watchpoints are handled. */
5554      info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5555      info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5556  
5557      if (flags0 & TLB_WATCHPOINT) {
5558          mem_off = info->mem_off_first[0];
5559          reg_off = info->reg_off_first[0];
5560          reg_last = info->reg_off_last[0];
5561  
5562          while (reg_off <= reg_last) {
5563              uint64_t pg = vg[reg_off >> 6];
5564              do {
5565                  if ((pg >> (reg_off & 63)) & 1) {
5566                      cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5567                                           msize, info->page[0].attrs,
5568                                           wp_access, retaddr);
5569                  }
5570                  reg_off += esize;
5571                  mem_off += msize;
5572              } while (reg_off <= reg_last && (reg_off & 63));
5573          }
5574      }
5575  
5576      mem_off = info->mem_off_split;
5577      if (mem_off >= 0) {
5578          cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5579                               info->page[0].attrs, wp_access, retaddr);
5580      }
5581  
5582      mem_off = info->mem_off_first[1];
5583      if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5584          reg_off = info->reg_off_first[1];
5585          reg_last = info->reg_off_last[1];
5586  
5587          do {
5588              uint64_t pg = vg[reg_off >> 6];
5589              do {
5590                  if ((pg >> (reg_off & 63)) & 1) {
5591                      cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5592                                           msize, info->page[1].attrs,
5593                                           wp_access, retaddr);
5594                  }
5595                  reg_off += esize;
5596                  mem_off += msize;
5597              } while (reg_off & 63);
5598          } while (reg_off <= reg_last);
5599      }
5600  }
5601  #endif
5602  
5603  void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5604                               uint64_t *vg, target_ulong addr, int esize,
5605                               int msize, uint32_t mtedesc, uintptr_t ra)
5606  {
5607      intptr_t mem_off, reg_off, reg_last;
5608  
5609      /* Process the page only if MemAttr == Tagged. */
5610      if (info->page[0].tagged) {
5611          mem_off = info->mem_off_first[0];
5612          reg_off = info->reg_off_first[0];
5613          reg_last = info->reg_off_split;
5614          if (reg_last < 0) {
5615              reg_last = info->reg_off_last[0];
5616          }
5617  
5618          do {
5619              uint64_t pg = vg[reg_off >> 6];
5620              do {
5621                  if ((pg >> (reg_off & 63)) & 1) {
5622                      mte_check(env, mtedesc, addr, ra);
5623                  }
5624                  reg_off += esize;
5625                  mem_off += msize;
5626              } while (reg_off <= reg_last && (reg_off & 63));
5627          } while (reg_off <= reg_last);
5628      }
5629  
5630      mem_off = info->mem_off_first[1];
5631      if (mem_off >= 0 && info->page[1].tagged) {
5632          reg_off = info->reg_off_first[1];
5633          reg_last = info->reg_off_last[1];
5634  
5635          do {
5636              uint64_t pg = vg[reg_off >> 6];
5637              do {
5638                  if ((pg >> (reg_off & 63)) & 1) {
5639                      mte_check(env, mtedesc, addr, ra);
5640                  }
5641                  reg_off += esize;
5642                  mem_off += msize;
5643              } while (reg_off & 63);
5644          } while (reg_off <= reg_last);
5645      }
5646  }
5647  
5648  /*
5649   * Common helper for all contiguous 1,2,3,4-register predicated loads.
5650   */
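/*
 * The structure below is common to the contiguous load/store helpers:
 * first locate the active elements and probe the page(s), so that any
 * access fault is raised before the destination registers are modified;
 * then perform watchpoint and MTE checks for every active element; and
 * only then move the data, via host pointers on the fast path, or via
 * the tlb functions (into a scratch buffer here, so that an MMIO fault
 * mid-sequence cannot leave the registers partially updated).
 */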
5651  static inline QEMU_ALWAYS_INLINE
5652  void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5653                 uint32_t desc, const uintptr_t retaddr,
5654                 const int esz, const int msz, const int N, uint32_t mtedesc,
5655                 sve_ldst1_host_fn *host_fn,
5656                 sve_ldst1_tlb_fn *tlb_fn)
5657  {
5658      const unsigned rd = simd_data(desc);
5659      const intptr_t reg_max = simd_oprsz(desc);
5660      intptr_t reg_off, reg_last, mem_off;
5661      SVEContLdSt info;
5662      void *host;
5663      int flags, i;
5664  
5665      /* Find the active elements.  */
5666      if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5667          /* The entire predicate was false; no load occurs.  */
5668          for (i = 0; i < N; ++i) {
5669              memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5670          }
5671          return;
5672      }
5673  
5674      /* Probe the page(s).  Exit with exception for any invalid page. */
5675      sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5676  
5677      /* Handle watchpoints for all active elements. */
5678      sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5679                                BP_MEM_READ, retaddr);
5680  
5681      /*
5682       * Handle mte checks for all active elements.
5683       * Since TBI must be set for MTE, !mtedesc => !mte_active.
5684       */
5685      if (mtedesc) {
5686          sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5687                                  mtedesc, retaddr);
5688      }
5689  
5690      flags = info.page[0].flags | info.page[1].flags;
5691      if (unlikely(flags != 0)) {
5692          /*
5693           * At least one page includes MMIO.
5694           * Any bus operation can fail with cpu_transaction_failed,
5695           * which for ARM will raise SyncExternal.  Perform the load
5696           * into scratch memory to preserve register state until the end.
5697           */
5698          ARMVectorReg scratch[4] = { };
5699  
5700          mem_off = info.mem_off_first[0];
5701          reg_off = info.reg_off_first[0];
5702          reg_last = info.reg_off_last[1];
5703          if (reg_last < 0) {
5704              reg_last = info.reg_off_split;
5705              if (reg_last < 0) {
5706                  reg_last = info.reg_off_last[0];
5707              }
5708          }
5709  
5710          do {
5711              uint64_t pg = vg[reg_off >> 6];
5712              do {
5713                  if ((pg >> (reg_off & 63)) & 1) {
5714                      for (i = 0; i < N; ++i) {
5715                          tlb_fn(env, &scratch[i], reg_off,
5716                                 addr + mem_off + (i << msz), retaddr);
5717                      }
5718                  }
5719                  reg_off += 1 << esz;
5720                  mem_off += N << msz;
5721              } while (reg_off & 63);
5722          } while (reg_off <= reg_last);
5723  
5724          for (i = 0; i < N; ++i) {
5725              memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5726          }
5727          return;
5728      }
5729  
5730      /* The entire operation is in RAM, on valid pages. */
5731  
5732      for (i = 0; i < N; ++i) {
5733          memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5734      }
5735  
5736      mem_off = info.mem_off_first[0];
5737      reg_off = info.reg_off_first[0];
5738      reg_last = info.reg_off_last[0];
5739      host = info.page[0].host;
5740  
5741      set_helper_retaddr(retaddr);
5742  
5743      while (reg_off <= reg_last) {
5744          uint64_t pg = vg[reg_off >> 6];
5745          do {
5746              if ((pg >> (reg_off & 63)) & 1) {
5747                  for (i = 0; i < N; ++i) {
5748                      host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5749                              host + mem_off + (i << msz));
5750                  }
5751              }
5752              reg_off += 1 << esz;
5753              mem_off += N << msz;
5754          } while (reg_off <= reg_last && (reg_off & 63));
5755      }
5756  
5757      clear_helper_retaddr();
5758  
5759      /*
5760       * Use the slow path to manage the cross-page misalignment.
5761       * But we know this is RAM and cannot trap.
5762       */
5763      mem_off = info.mem_off_split;
5764      if (unlikely(mem_off >= 0)) {
5765          reg_off = info.reg_off_split;
5766          for (i = 0; i < N; ++i) {
5767              tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5768                     addr + mem_off + (i << msz), retaddr);
5769          }
5770      }
5771  
5772      mem_off = info.mem_off_first[1];
5773      if (unlikely(mem_off >= 0)) {
5774          reg_off = info.reg_off_first[1];
5775          reg_last = info.reg_off_last[1];
5776          host = info.page[1].host;
5777  
5778          set_helper_retaddr(retaddr);
5779  
5780          do {
5781              uint64_t pg = vg[reg_off >> 6];
5782              do {
5783                  if ((pg >> (reg_off & 63)) & 1) {
5784                      for (i = 0; i < N; ++i) {
5785                          host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5786                                  host + mem_off + (i << msz));
5787                      }
5788                  }
5789                  reg_off += 1 << esz;
5790                  mem_off += N << msz;
5791              } while (reg_off & 63);
5792          } while (reg_off <= reg_last);
5793  
5794          clear_helper_retaddr();
5795      }
5796  }
5797  
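/*
 * The MTE variants receive a combined descriptor: the usual simd
 * descriptor in the low SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT bits and the
 * MTE descriptor in the bits above that.  If TBI is disabled for this
 * address (bit 55), or the address tag falls in a range excluded by TCMA,
 * tag checking cannot fire, so mtedesc is zeroed and the plain helper
 * skips MTE checks entirely.
 */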
5798  static inline QEMU_ALWAYS_INLINE
5799  void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5800                     uint32_t desc, const uintptr_t ra,
5801                     const int esz, const int msz, const int N,
5802                     sve_ldst1_host_fn *host_fn,
5803                     sve_ldst1_tlb_fn *tlb_fn)
5804  {
5805      uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5806      int bit55 = extract64(addr, 55, 1);
5807  
5808      /* Remove mtedesc from the normal sve descriptor. */
5809      desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5810  
5811      /* Perform gross MTE suppression early. */
5812      if (!tbi_check(mtedesc, bit55) ||
5813          tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
5814          mtedesc = 0;
5815      }
5816  
5817      sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5818  }
5819  
5820  #define DO_LD1_1(NAME, ESZ)                                             \
5821  void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
5822                              target_ulong addr, uint32_t desc)           \
5823  {                                                                       \
5824      sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
5825                sve_##NAME##_host, sve_##NAME##_tlb);                     \
5826  }                                                                       \
5827  void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
5828                                  target_ulong addr, uint32_t desc)       \
5829  {                                                                       \
5830      sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
5831                    sve_##NAME##_host, sve_##NAME##_tlb);                 \
5832  }
5833  
5834  #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
5835  void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
5836                                 target_ulong addr, uint32_t desc)        \
5837  {                                                                       \
5838      sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5839                sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
5840  }                                                                       \
5841  void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
5842                                 target_ulong addr, uint32_t desc)        \
5843  {                                                                       \
5844      sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5845                sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
5846  }                                                                       \
5847  void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
5848                                     target_ulong addr, uint32_t desc)    \
5849  {                                                                       \
5850      sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5851                    sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
5852  }                                                                       \
5853  void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
5854                                     target_ulong addr, uint32_t desc)    \
5855  {                                                                       \
5856      sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5857                    sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
5858  }
5859  
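/*
 * Naming for the expansions below: in e.g. ld1bhu, "b" is the memory
 * element size (byte), "h" the register element size (halfword), and the
 * trailing "u"/"s" selects zero- or sign-extension.  When the two sizes
 * match the letter is simply doubled (ld1bb, ld1hh, ld1ss, ld1dd).  The
 * ESZ (and, for DO_LD1_2, MSZ) arguments encode the same sizes as MemOp
 * values.
 */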
5860  DO_LD1_1(ld1bb,  MO_8)
5861  DO_LD1_1(ld1bhu, MO_16)
5862  DO_LD1_1(ld1bhs, MO_16)
5863  DO_LD1_1(ld1bsu, MO_32)
5864  DO_LD1_1(ld1bss, MO_32)
5865  DO_LD1_1(ld1bdu, MO_64)
5866  DO_LD1_1(ld1bds, MO_64)
5867  
5868  DO_LD1_2(ld1hh,  MO_16, MO_16)
5869  DO_LD1_2(ld1hsu, MO_32, MO_16)
5870  DO_LD1_2(ld1hss, MO_32, MO_16)
5871  DO_LD1_2(ld1hdu, MO_64, MO_16)
5872  DO_LD1_2(ld1hds, MO_64, MO_16)
5873  
5874  DO_LD1_2(ld1ss,  MO_32, MO_32)
5875  DO_LD1_2(ld1sdu, MO_64, MO_32)
5876  DO_LD1_2(ld1sds, MO_64, MO_32)
5877  
5878  DO_LD1_2(ld1dd,  MO_64, MO_64)
5879  
5880  #undef DO_LD1_1
5881  #undef DO_LD1_2
5882  
5883  #define DO_LDN_1(N)                                                     \
5884  void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
5885                               target_ulong addr, uint32_t desc)          \
5886  {                                                                       \
5887      sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
5888                sve_ld1bb_host, sve_ld1bb_tlb);                           \
5889  }                                                                       \
5890  void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
5891                                   target_ulong addr, uint32_t desc)      \
5892  {                                                                       \
5893      sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
5894                    sve_ld1bb_host, sve_ld1bb_tlb);                       \
5895  }
5896  
5897  #define DO_LDN_2(N, SUFF, ESZ)                                          \
5898  void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
5899                                      target_ulong addr, uint32_t desc)   \
5900  {                                                                       \
5901      sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5902                sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
5903  }                                                                       \
5904  void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
5905                                      target_ulong addr, uint32_t desc)   \
5906  {                                                                       \
5907      sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5908                sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
5909  }                                                                       \
5910  void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
5911                                          target_ulong addr, uint32_t desc) \
5912  {                                                                       \
5913      sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5914                    sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
5915  }                                                                       \
5916  void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
5917                                          target_ulong addr, uint32_t desc) \
5918  {                                                                       \
5919      sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5920                    sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
5921  }
5922  
5923  DO_LDN_1(2)
5924  DO_LDN_1(3)
5925  DO_LDN_1(4)
5926  
5927  DO_LDN_2(2, hh, MO_16)
5928  DO_LDN_2(3, hh, MO_16)
5929  DO_LDN_2(4, hh, MO_16)
5930  
5931  DO_LDN_2(2, ss, MO_32)
5932  DO_LDN_2(3, ss, MO_32)
5933  DO_LDN_2(4, ss, MO_32)
5934  
5935  DO_LDN_2(2, dd, MO_64)
5936  DO_LDN_2(3, dd, MO_64)
5937  DO_LDN_2(4, dd, MO_64)
5938  
5939  #undef DO_LDN_1
5940  #undef DO_LDN_2
5941  
5942  /*
5943   * Load contiguous data, first-fault and no-fault.
5944   *
5945   * For user-only, we control the race between page_check_range and
5946   * another thread's munmap by using set/clear_helper_retaddr.  Any
5947   * SEGV that occurs between those markers is assumed to be because
5948   * the guest page vanished.  Keep that block as small as possible
5949   * so that unrelated QEMU bugs are not blamed on the guest.
5950   */
5951  
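/*
 * Apart from the first active element of a first-fault load (which may
 * trap normally), elements that cannot be accessed are reported through
 * the FFR rather than by an exception: record_fault() below clears the
 * FFR from the first such element onwards, and the guest can consult the
 * FFR (e.g. via RDFFR) and retry the remainder of the access.
 */
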
5952  /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
5953   * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5954   * option, which leaves subsequent data unchanged.
5955   */
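/*
 * For example, record_fault(env, 20, 256) keeps FFR bits 0..19
 * (ffr[0] &= MAKE_64BIT_MASK(0, 20)) and clears bits 20..255.
 */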
5956  static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5957  {
5958      uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5959  
5960      if (i & 63) {
5961          ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5962          i = ROUND_UP(i, 64);
5963      }
5964      for (; i < oprsz; i += 64) {
5965          ffr[i / 64] = 0;
5966      }
5967  }
5968  
5969  /*
5970   * Common helper for all contiguous no-fault and first-fault loads.
5971   */
5972  static inline QEMU_ALWAYS_INLINE
5973  void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5974                     uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5975                     const int esz, const int msz, const SVEContFault fault,
5976                     sve_ldst1_host_fn *host_fn,
5977                     sve_ldst1_tlb_fn *tlb_fn)
5978  {
5979      const unsigned rd = simd_data(desc);
5980      void *vd = &env->vfp.zregs[rd];
5981      const intptr_t reg_max = simd_oprsz(desc);
5982      intptr_t reg_off, mem_off, reg_last;
5983      SVEContLdSt info;
5984      int flags;
5985      void *host;
5986  
5987      /* Find the active elements.  */
5988      if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5989          /* The entire predicate was false; no load occurs.  */
5990          memset(vd, 0, reg_max);
5991          return;
5992      }
5993      reg_off = info.reg_off_first[0];
5994  
5995      /* Probe the page(s). */
5996      if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5997          /* Fault on first element. */
5998          tcg_debug_assert(fault == FAULT_NO);
5999          memset(vd, 0, reg_max);
6000          goto do_fault;
6001      }
6002  
6003      mem_off = info.mem_off_first[0];
6004      flags = info.page[0].flags;
6005  
6006      /*
6007       * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6008       * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6009       */
6010      if (!info.page[0].tagged) {
6011          mtedesc = 0;
6012      }
6013  
6014      if (fault == FAULT_FIRST) {
6015          /* Trapping mte check for the first-fault element.  */
6016          if (mtedesc) {
6017              mte_check(env, mtedesc, addr + mem_off, retaddr);
6018          }
6019  
6020          /*
6021           * Special handling of the first active element,
6022           * if it crosses a page boundary or is MMIO.
6023           */
6024          bool is_split = mem_off == info.mem_off_split;
6025          if (unlikely(flags != 0) || unlikely(is_split)) {
6026              /*
6027               * Use the slow path for cross-page handling.
6028               * Might trap for MMIO or watchpoints.
6029               */
6030              tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6031  
6032              /* After any fault, zero the other elements. */
6033              swap_memzero(vd, reg_off);
6034              reg_off += 1 << esz;
6035              mem_off += 1 << msz;
6036              swap_memzero(vd + reg_off, reg_max - reg_off);
6037  
6038              if (is_split) {
6039                  goto second_page;
6040              }
6041          } else {
6042              memset(vd, 0, reg_max);
6043          }
6044      } else {
6045          memset(vd, 0, reg_max);
6046          if (unlikely(mem_off == info.mem_off_split)) {
6047              /* The first active element crosses a page boundary. */
6048              flags |= info.page[1].flags;
6049              if (unlikely(flags & TLB_MMIO)) {
6050                  /* Some page is MMIO, see below. */
6051                  goto do_fault;
6052              }
6053              if (unlikely(flags & TLB_WATCHPOINT) &&
6054                  (cpu_watchpoint_address_matches
6055                   (env_cpu(env), addr + mem_off, 1 << msz)
6056                   & BP_MEM_READ)) {
6057                  /* Watchpoint hit, see below. */
6058                  goto do_fault;
6059              }
6060              if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6061                  goto do_fault;
6062              }
6063              /*
6064               * Use the slow path for cross-page handling.
6065               * This is RAM, without a watchpoint, and will not trap.
6066               */
6067              tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6068              goto second_page;
6069          }
6070      }
6071  
6072      /*
6073       * From this point on, all memory operations are MemSingleNF.
6074       *
6075       * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6076       * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6077       *
6078       * Unfortunately we do not have access to the memory attributes from the
6079       * PTE to tell Device memory from Normal memory.  So we make a mostly
6080       * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6081       * This gives the right answer for the common cases of "Normal memory,
6082       * backed by host RAM" and "Device memory, backed by MMIO".
6083       * The architecture allows us to suppress an NF load and return
6084       * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6085       * case of "Normal memory, backed by MMIO" is permitted.  The case we
6086       * get wrong is "Device memory, backed by host RAM", for which we
6087       * should return (UNKNOWN, FAULT) but do not.
6088       *
6089       * Similarly, CPU_BP breakpoints would raise exceptions, and so
6090       * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6091       * architectural breakpoints the same.
6092       */
6093      if (unlikely(flags & TLB_MMIO)) {
6094          goto do_fault;
6095      }
6096  
6097      reg_last = info.reg_off_last[0];
6098      host = info.page[0].host;
6099  
6100      set_helper_retaddr(retaddr);
6101  
6102      do {
6103          uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6104          do {
6105              if ((pg >> (reg_off & 63)) & 1) {
6106                  if (unlikely(flags & TLB_WATCHPOINT) &&
6107                      (cpu_watchpoint_address_matches
6108                       (env_cpu(env), addr + mem_off, 1 << msz)
6109                       & BP_MEM_READ)) {
6110                      clear_helper_retaddr();
6111                      goto do_fault;
6112                  }
6113                  if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6114                      clear_helper_retaddr();
6115                      goto do_fault;
6116                  }
6117                  host_fn(vd, reg_off, host + mem_off);
6118              }
6119              reg_off += 1 << esz;
6120              mem_off += 1 << msz;
6121          } while (reg_off <= reg_last && (reg_off & 63));
6122      } while (reg_off <= reg_last);
6123  
6124      clear_helper_retaddr();
6125  
6126      /*
6127       * MemSingleNF is allowed to fail for any reason.  We have special
6128       * code above to handle the first element crossing a page boundary.
6129       * As an implementation choice, decline to handle a cross-page element
6130       * in any other position.
6131       */
6132      reg_off = info.reg_off_split;
6133      if (reg_off >= 0) {
6134          goto do_fault;
6135      }
6136  
6137   second_page:
6138      reg_off = info.reg_off_first[1];
6139      if (likely(reg_off < 0)) {
6140          /* No active elements on the second page.  All done. */
6141          return;
6142      }
6143  
6144      /*
6145       * MemSingleNF is allowed to fail for any reason.  As an implementation
6146       * choice, decline to handle elements on the second page.  This should
6147       * be low frequency as the guest walks through memory -- the next
6148       * iteration of the guest's loop should be aligned on the page boundary,
6149       * and then all following iterations will stay aligned.
6150       */
6151  
6152   do_fault:
6153      record_fault(env, reg_off, reg_max);
6154  }
6155  
6156  static inline QEMU_ALWAYS_INLINE
6157  void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6158                         uint32_t desc, const uintptr_t retaddr,
6159                         const int esz, const int msz, const SVEContFault fault,
6160                         sve_ldst1_host_fn *host_fn,
6161                         sve_ldst1_tlb_fn *tlb_fn)
6162  {
6163      uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6164      int bit55 = extract64(addr, 55, 1);
6165  
6166      /* Remove mtedesc from the normal sve descriptor. */
6167      desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6168  
6169      /* Perform gross MTE suppression early. */
6170      if (!tbi_check(mtedesc, bit55) ||
6171          tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6172          mtedesc = 0;
6173      }
6174  
6175      sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6176                    esz, msz, fault, host_fn, tlb_fn);
6177  }
6178  
6179  #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6180  void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6181                                   target_ulong addr, uint32_t desc)      \
6182  {                                                                       \
6183      sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6184                    sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6185  }                                                                       \
6186  void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6187                                   target_ulong addr, uint32_t desc)      \
6188  {                                                                       \
6189      sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6190                    sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6191  }                                                                       \
6192  void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6193                                       target_ulong addr, uint32_t desc)  \
6194  {                                                                       \
6195      sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6196                        sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6197  }                                                                       \
6198  void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6199                                       target_ulong addr, uint32_t desc)  \
6200  {                                                                       \
6201      sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6202                    sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6203  }
6204  
6205  #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6206  void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6207                                      target_ulong addr, uint32_t desc)   \
6208  {                                                                       \
6209      sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6210                    sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6211  }                                                                       \
6212  void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6213                                      target_ulong addr, uint32_t desc)   \
6214  {                                                                       \
6215      sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6216                    sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6217  }                                                                       \
6218  void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6219                                      target_ulong addr, uint32_t desc)   \
6220  {                                                                       \
6221      sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6222                    sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6223  }                                                                       \
6224  void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6225                                      target_ulong addr, uint32_t desc)   \
6226  {                                                                       \
6227      sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6228                    sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6229  }                                                                       \
6230  void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6231                                          target_ulong addr, uint32_t desc) \
6232  {                                                                       \
6233      sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6234                        sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6235  }                                                                       \
6236  void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6237                                          target_ulong addr, uint32_t desc) \
6238  {                                                                       \
6239      sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6240                        sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6241  }                                                                       \
6242  void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6243                                          target_ulong addr, uint32_t desc) \
6244  {                                                                       \
6245      sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6246                        sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6247  }                                                                       \
6248  void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6249                                          target_ulong addr, uint32_t desc) \
6250  {                                                                       \
6251      sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6252                        sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6253  }
6254  
6255  DO_LDFF1_LDNF1_1(bb,  MO_8)
6256  DO_LDFF1_LDNF1_1(bhu, MO_16)
6257  DO_LDFF1_LDNF1_1(bhs, MO_16)
6258  DO_LDFF1_LDNF1_1(bsu, MO_32)
6259  DO_LDFF1_LDNF1_1(bss, MO_32)
6260  DO_LDFF1_LDNF1_1(bdu, MO_64)
6261  DO_LDFF1_LDNF1_1(bds, MO_64)
6262  
6263  DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6264  DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6265  DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6266  DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6267  DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6268  
6269  DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6270  DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6271  DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6272  
6273  DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6274  
6275  #undef DO_LDFF1_LDNF1_1
6276  #undef DO_LDFF1_LDNF1_2
6277  
6278  /*
6279   * Common helper for all contiguous 1,2,3,4-register predicated stores.
6280   */
6281  
6282  static inline QEMU_ALWAYS_INLINE
6283  void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6284                 uint32_t desc, const uintptr_t retaddr,
6285                 const int esz, const int msz, const int N, uint32_t mtedesc,
6286                 sve_ldst1_host_fn *host_fn,
6287                 sve_ldst1_tlb_fn *tlb_fn)
6288  {
6289      const unsigned rd = simd_data(desc);
6290      const intptr_t reg_max = simd_oprsz(desc);
6291      intptr_t reg_off, reg_last, mem_off;
6292      SVEContLdSt info;
6293      void *host;
6294      int i, flags;
6295  
6296      /* Find the active elements.  */
6297      if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6298          /* The entire predicate was false; no store occurs.  */
6299          return;
6300      }
6301  
6302      /* Probe the page(s).  Exit with exception for any invalid page. */
6303      sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6304  
6305      /* Handle watchpoints for all active elements. */
6306      sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6307                                BP_MEM_WRITE, retaddr);
6308  
6309      /*
6310       * Handle mte checks for all active elements.
6311       * Since TBI must be set for MTE, !mtedesc => !mte_active.
6312       */
6313      if (mtedesc) {
6314          sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6315                                  mtedesc, retaddr);
6316      }
6317  
6318      flags = info.page[0].flags | info.page[1].flags;
6319      if (unlikely(flags != 0)) {
6320          /*
6321           * At least one page includes MMIO.
6322           * Any bus operation can fail with cpu_transaction_failed,
6323           * which for ARM will raise SyncExternal.  We cannot avoid
6324           * this fault and will leave with the store incomplete.
6325           */
6326          mem_off = info.mem_off_first[0];
6327          reg_off = info.reg_off_first[0];
6328          reg_last = info.reg_off_last[1];
6329          if (reg_last < 0) {
6330              reg_last = info.reg_off_split;
6331              if (reg_last < 0) {
6332                  reg_last = info.reg_off_last[0];
6333              }
6334          }
6335  
6336          do {
6337              uint64_t pg = vg[reg_off >> 6];
6338              do {
6339                  if ((pg >> (reg_off & 63)) & 1) {
6340                      for (i = 0; i < N; ++i) {
6341                          tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6342                                 addr + mem_off + (i << msz), retaddr);
6343                      }
6344                  }
6345                  reg_off += 1 << esz;
6346                  mem_off += N << msz;
6347              } while (reg_off & 63);
6348          } while (reg_off <= reg_last);
6349          return;
6350      }
6351  
6352      mem_off = info.mem_off_first[0];
6353      reg_off = info.reg_off_first[0];
6354      reg_last = info.reg_off_last[0];
6355      host = info.page[0].host;
6356  
6357      set_helper_retaddr(retaddr);
6358  
6359      while (reg_off <= reg_last) {
6360          uint64_t pg = vg[reg_off >> 6];
6361          do {
6362              if ((pg >> (reg_off & 63)) & 1) {
6363                  for (i = 0; i < N; ++i) {
6364                      host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6365                              host + mem_off + (i << msz));
6366                  }
6367              }
6368              reg_off += 1 << esz;
6369              mem_off += N << msz;
6370          } while (reg_off <= reg_last && (reg_off & 63));
6371      }
6372  
6373      clear_helper_retaddr();
6374  
6375      /*
6376       * Use the slow path to manage the cross-page misalignment.
6377       * But we know this is RAM and cannot trap.
6378       */
6379      mem_off = info.mem_off_split;
6380      if (unlikely(mem_off >= 0)) {
6381          reg_off = info.reg_off_split;
6382          for (i = 0; i < N; ++i) {
6383              tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6384                     addr + mem_off + (i << msz), retaddr);
6385          }
6386      }
6387  
6388      mem_off = info.mem_off_first[1];
6389      if (unlikely(mem_off >= 0)) {
6390          reg_off = info.reg_off_first[1];
6391          reg_last = info.reg_off_last[1];
6392          host = info.page[1].host;
6393  
6394          set_helper_retaddr(retaddr);
6395  
6396          do {
6397              uint64_t pg = vg[reg_off >> 6];
6398              do {
6399                  if ((pg >> (reg_off & 63)) & 1) {
6400                      for (i = 0; i < N; ++i) {
6401                          host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6402                                  host + mem_off + (i << msz));
6403                      }
6404                  }
6405                  reg_off += 1 << esz;
6406                  mem_off += N << msz;
6407              } while (reg_off & 63);
6408          } while (reg_off <= reg_last);
6409  
6410          clear_helper_retaddr();
6411      }
6412  }
6413  
6414  static inline QEMU_ALWAYS_INLINE
6415  void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6416                     uint32_t desc, const uintptr_t ra,
6417                     const int esz, const int msz, const int N,
6418                     sve_ldst1_host_fn *host_fn,
6419                     sve_ldst1_tlb_fn *tlb_fn)
6420  {
6421      uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6422      int bit55 = extract64(addr, 55, 1);
6423  
6424      /* Remove mtedesc from the normal sve descriptor. */
6425      desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6426  
6427      /* Perform gross MTE suppression early. */
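    /*
     * If TBI is disabled for this address, or TCMA makes its allocation
     * tag unchecked, clear mtedesc so that sve_stN_r performs no
     * per-access tag checks below.
     */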
6428      if (!tbi_check(mtedesc, bit55) ||
6429          tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6430          mtedesc = 0;
6431      }
6432  
6433      sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6434  }
6435  
6436  #define DO_STN_1(N, NAME, ESZ)                                          \
6437  void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
6438                                   target_ulong addr, uint32_t desc)      \
6439  {                                                                       \
6440      sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
6441                sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
6442  }                                                                       \
6443  void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
6444                                       target_ulong addr, uint32_t desc)  \
6445  {                                                                       \
6446      sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
6447                    sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
6448  }
6449  
6450  #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
6451  void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
6452                                      target_ulong addr, uint32_t desc)   \
6453  {                                                                       \
6454      sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6455                sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
6456  }                                                                       \
6457  void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
6458                                      target_ulong addr, uint32_t desc)   \
6459  {                                                                       \
6460      sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6461                sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
6462  }                                                                       \
6463  void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
6464                                          target_ulong addr, uint32_t desc) \
6465  {                                                                       \
6466      sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6467                    sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
6468  }                                                                       \
6469  void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
6470                                          target_ulong addr, uint32_t desc) \
6471  {                                                                       \
6472      sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6473                    sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
6474  }
6475  
6476  DO_STN_1(1, bb, MO_8)
6477  DO_STN_1(1, bh, MO_16)
6478  DO_STN_1(1, bs, MO_32)
6479  DO_STN_1(1, bd, MO_64)
6480  DO_STN_1(2, bb, MO_8)
6481  DO_STN_1(3, bb, MO_8)
6482  DO_STN_1(4, bb, MO_8)
6483  
6484  DO_STN_2(1, hh, MO_16, MO_16)
6485  DO_STN_2(1, hs, MO_32, MO_16)
6486  DO_STN_2(1, hd, MO_64, MO_16)
6487  DO_STN_2(2, hh, MO_16, MO_16)
6488  DO_STN_2(3, hh, MO_16, MO_16)
6489  DO_STN_2(4, hh, MO_16, MO_16)
6490  
6491  DO_STN_2(1, ss, MO_32, MO_32)
6492  DO_STN_2(1, sd, MO_64, MO_32)
6493  DO_STN_2(2, ss, MO_32, MO_32)
6494  DO_STN_2(3, ss, MO_32, MO_32)
6495  DO_STN_2(4, ss, MO_32, MO_32)
6496  
6497  DO_STN_2(1, dd, MO_64, MO_64)
6498  DO_STN_2(2, dd, MO_64, MO_64)
6499  DO_STN_2(3, dd, MO_64, MO_64)
6500  DO_STN_2(4, dd, MO_64, MO_64)
6501  
6502  #undef DO_STN_1
6503  #undef DO_STN_2
6504  
6505  /*
6506   * Loads with a vector index.
6507   */
6508  
6509  /*
6510   * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6511   */
6512  typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6513  
6514  static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6515  {
6516      return *(uint32_t *)(reg + H1_4(reg_ofs));
6517  }
6518  
6519  static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6520  {
6521      return *(int32_t *)(reg + H1_4(reg_ofs));
6522  }
6523  
6524  static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6525  {
6526      return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6527  }
6528  
6529  static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6530  {
6531      return (int32_t)*(uint64_t *)(reg + reg_ofs);
6532  }
6533  
6534  static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6535  {
6536      return *(uint64_t *)(reg + reg_ofs);
6537  }
6538  
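/*
 * Common helper for all gather loads: for each active element, form the
 * address as base plus the scaled offset taken from the offset vector,
 * load into a scratch register, and copy the scratch to the destination
 * only after every element has been loaded, so that a fault leaves the
 * destination unmodified.
 */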
6539  static inline QEMU_ALWAYS_INLINE
6540  void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6541                 target_ulong base, uint32_t desc, uintptr_t retaddr,
6542                 uint32_t mtedesc, int esize, int msize,
6543                 zreg_off_fn *off_fn,
6544                 sve_ldst1_host_fn *host_fn,
6545                 sve_ldst1_tlb_fn *tlb_fn)
6546  {
6547      const int mmu_idx = arm_env_mmu_index(env);
6548      const intptr_t reg_max = simd_oprsz(desc);
6549      const int scale = simd_data(desc);
6550      ARMVectorReg scratch;
6551      intptr_t reg_off;
6552      SVEHostPage info, info2;
6553  
6554      memset(&scratch, 0, reg_max);
6555      reg_off = 0;
6556      do {
6557          uint64_t pg = vg[reg_off >> 6];
6558          do {
6559              if (likely(pg & 1)) {
6560                  target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6561                  target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6562  
6563                  sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6564                                 mmu_idx, retaddr);
6565  
6566                  if (likely(in_page >= msize)) {
6567                      if (unlikely(info.flags & TLB_WATCHPOINT)) {
6568                          cpu_check_watchpoint(env_cpu(env), addr, msize,
6569                                               info.attrs, BP_MEM_READ, retaddr);
6570                      }
6571                      if (mtedesc && info.tagged) {
6572                          mte_check(env, mtedesc, addr, retaddr);
6573                      }
6574                      if (unlikely(info.flags & TLB_MMIO)) {
6575                          tlb_fn(env, &scratch, reg_off, addr, retaddr);
6576                      } else {
6577                          set_helper_retaddr(retaddr);
6578                          host_fn(&scratch, reg_off, info.host);
6579                          clear_helper_retaddr();
6580                      }
6581                  } else {
6582                      /* Element crosses the page boundary. */
6583                      sve_probe_page(&info2, false, env, addr + in_page, 0,
6584                                     MMU_DATA_LOAD, mmu_idx, retaddr);
6585                      if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6586                          cpu_check_watchpoint(env_cpu(env), addr,
6587                                               msize, info.attrs,
6588                                               BP_MEM_READ, retaddr);
6589                      }
6590                      if (mtedesc && info.tagged) {
6591                          mte_check(env, mtedesc, addr, retaddr);
6592                      }
6593                      tlb_fn(env, &scratch, reg_off, addr, retaddr);
6594                  }
6595              }
6596              reg_off += esize;
6597              pg >>= esize;
6598          } while (reg_off & 63);
6599      } while (reg_off < reg_max);
6600  
6601      /* Wait until all exceptions have been raised to write back.  */
6602      memcpy(vd, &scratch, reg_max);
6603  }
6604  
6605  static inline QEMU_ALWAYS_INLINE
6606  void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6607                     target_ulong base, uint32_t desc, uintptr_t retaddr,
6608                     int esize, int msize, zreg_off_fn *off_fn,
6609                     sve_ldst1_host_fn *host_fn,
6610                     sve_ldst1_tlb_fn *tlb_fn)
6611  {
6612      uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6613      /* Remove mtedesc from the normal sve descriptor. */
6614      desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6615  
6616      /*
6617       * ??? TODO: With 32-bit offset extractions, base + ofs cannot move
6618       * base entirely across the address space hole, so it cannot change
6619       * the pointer tag or the bit55 selector.  We could therefore
6620       * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
6621       */
6622      sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6623                esize, msize, off_fn, host_fn, tlb_fn);
6624  }
6625  
6626  #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6627  void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6628                                   void *vm, target_ulong base, uint32_t desc) \
6629  {                                                                            \
6630      sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
6631                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6632  }                                                                            \
6633  void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6634       void *vm, target_ulong base, uint32_t desc)                             \
6635  {                                                                            \
6636      sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
6637                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6638  }
6639  
6640  #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6641  void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6642                                   void *vm, target_ulong base, uint32_t desc) \
6643  {                                                                            \
6644      sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
6645                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6646  }                                                                            \
6647  void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6648      void *vm, target_ulong base, uint32_t desc)                              \
6649  {                                                                            \
6650      sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
6651                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6652  }
6653  
6654  DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6655  DO_LD1_ZPZ_S(bsu, zss, MO_8)
6656  DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6657  DO_LD1_ZPZ_D(bdu, zss, MO_8)
6658  DO_LD1_ZPZ_D(bdu, zd, MO_8)
6659  
6660  DO_LD1_ZPZ_S(bss, zsu, MO_8)
6661  DO_LD1_ZPZ_S(bss, zss, MO_8)
6662  DO_LD1_ZPZ_D(bds, zsu, MO_8)
6663  DO_LD1_ZPZ_D(bds, zss, MO_8)
6664  DO_LD1_ZPZ_D(bds, zd, MO_8)
6665  
6666  DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6667  DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6668  DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6669  DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6670  DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6671  
6672  DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6673  DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6674  DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6675  DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6676  DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6677  
6678  DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6679  DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6680  DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6681  DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6682  DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6683  
6684  DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6685  DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6686  DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6687  DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6688  DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6689  
6690  DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6691  DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6692  DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6693  DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6694  DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6695  
6696  DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6697  DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6698  DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6699  DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6700  DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6701  
6702  DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6703  DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6704  DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6705  
6706  DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6707  DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6708  DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6709  
6710  DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6711  DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6712  DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6713  
6714  DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6715  DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6716  DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6717  
6718  #undef DO_LD1_ZPZ_S
6719  #undef DO_LD1_ZPZ_D
6720  
6721  /* First fault loads with a vector index.  */
6722  
6723  /*
6724   * Common helpers for all gather first-faulting loads.
6725   */
6726  
6727  static inline QEMU_ALWAYS_INLINE
6728  void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6729                   target_ulong base, uint32_t desc, uintptr_t retaddr,
6730                   uint32_t mtedesc, const int esz, const int msz,
6731                   zreg_off_fn *off_fn,
6732                   sve_ldst1_host_fn *host_fn,
6733                   sve_ldst1_tlb_fn *tlb_fn)
6734  {
6735      const int mmu_idx = arm_env_mmu_index(env);
6736      const intptr_t reg_max = simd_oprsz(desc);
6737      const int scale = simd_data(desc);
6738      const int esize = 1 << esz;
6739      const int msize = 1 << msz;
6740      intptr_t reg_off;
6741      SVEHostPage info;
6742      target_ulong addr, in_page;
6743      ARMVectorReg scratch;
6744  
6745      /* Skip to the first true predicate.  */
6746      reg_off = find_next_active(vg, 0, reg_max, esz);
6747      if (unlikely(reg_off >= reg_max)) {
6748          /* The entire predicate was false; no load occurs.  */
6749          memset(vd, 0, reg_max);
6750          return;
6751      }
6752  
6753      /* Protect against overlap between vd and vm. */
6754      if (unlikely(vd == vm)) {
6755          vm = memcpy(&scratch, vm, reg_max);
6756      }
6757  
6758      /*
6759       * Probe the first element, allowing faults.
6760       */
6761      addr = base + (off_fn(vm, reg_off) << scale);
6762      if (mtedesc) {
6763          mte_check(env, mtedesc, addr, retaddr);
6764      }
6765      tlb_fn(env, vd, reg_off, addr, retaddr);
6766  
6767      /* After any fault, zero the other elements. */
6768      swap_memzero(vd, reg_off);
6769      reg_off += esize;
6770      swap_memzero(vd + reg_off, reg_max - reg_off);
6771  
6772      /*
6773       * Probe the remaining elements, not allowing faults.
6774       */
6775      while (reg_off < reg_max) {
6776          uint64_t pg = vg[reg_off >> 6];
6777          do {
6778              if (likely((pg >> (reg_off & 63)) & 1)) {
6779                  addr = base + (off_fn(vm, reg_off) << scale);
6780                  in_page = -(addr | TARGET_PAGE_MASK);
6781  
6782                  if (unlikely(in_page < msize)) {
6783                      /* Stop if the element crosses a page boundary. */
6784                      goto fault;
6785                  }
6786  
6787                  sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6788                                 mmu_idx, retaddr);
6789                  if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6790                      goto fault;
6791                  }
6792                  if (unlikely(info.flags & TLB_WATCHPOINT) &&
6793                      (cpu_watchpoint_address_matches
6794                       (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6795                      goto fault;
6796                  }
6797                  if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
6798                      goto fault;
6799                  }
6800  
6801                  set_helper_retaddr(retaddr);
6802                  host_fn(vd, reg_off, info.host);
6803                  clear_helper_retaddr();
6804              }
6805              reg_off += esize;
6806          } while (reg_off & 63);
6807      }
6808      return;
6809  
6810   fault:
6811      record_fault(env, reg_off, reg_max);
6812  }
6813  
6814  static inline QEMU_ALWAYS_INLINE
6815  void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6816                       target_ulong base, uint32_t desc, uintptr_t retaddr,
6817                       const int esz, const int msz,
6818                       zreg_off_fn *off_fn,
6819                       sve_ldst1_host_fn *host_fn,
6820                       sve_ldst1_tlb_fn *tlb_fn)
6821  {
6822      uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6823      /* Remove mtedesc from the normal sve descriptor. */
6824      desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6825  
6826      /*
6827       * ??? TODO: With 32-bit offset extractions, base + ofs cannot move
6828       * base entirely across the address space hole, so it cannot change
6829       * the pointer tag or the bit55 selector.  We could therefore
6830       * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
6831       */
6832      sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6833                  esz, msz, off_fn, host_fn, tlb_fn);
6834  }
6835  
6836  #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
6837  void HELPER(sve_ldff##MEM##_##OFS)                                      \
6838      (CPUARMState *env, void *vd, void *vg,                              \
6839       void *vm, target_ulong base, uint32_t desc)                        \
6840  {                                                                       \
6841      sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
6842                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6843  }                                                                       \
6844  void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
6845      (CPUARMState *env, void *vd, void *vg,                              \
6846       void *vm, target_ulong base, uint32_t desc)                        \
6847  {                                                                       \
6848      sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
6849                      off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6850  }
6851  
6852  #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
6853  void HELPER(sve_ldff##MEM##_##OFS)                                      \
6854      (CPUARMState *env, void *vd, void *vg,                              \
6855       void *vm, target_ulong base, uint32_t desc)                        \
6856  {                                                                       \
6857      sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
6858                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6859  }                                                                       \
6860  void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
6861      (CPUARMState *env, void *vd, void *vg,                              \
6862       void *vm, target_ulong base, uint32_t desc)                        \
6863  {                                                                       \
6864      sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
6865                      off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6866  }
6867  
6868  DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6869  DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6870  DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6871  DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6872  DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6873  
6874  DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6875  DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6876  DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6877  DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6878  DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6879  
6880  DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6881  DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6882  DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6883  DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6884  DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6885  
6886  DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6887  DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6888  DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6889  DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6890  DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6891  
6892  DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6893  DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6894  DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6895  DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6896  DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6897  
6898  DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6899  DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6900  DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6901  DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6902  DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6903  
6904  DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
6905  DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
6906  DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6907  DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6908  DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6909  
6910  DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
6911  DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
6912  DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6913  DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6914  DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6915  
6916  DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6917  DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6918  DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6919  
6920  DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6921  DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6922  DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6923  
6924  DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6925  DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6926  DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6927  
6928  DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6929  DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6930  DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6931  
6932  /* Stores with a vector index.  */
6933  
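/*
 * Common helper for all scatter stores: first probe every active element,
 * raising faults and performing watchpoint and MTE checks before any
 * memory is modified, then perform the stores, using the cached host
 * address for elements in RAM and the slow tlb path otherwise.
 */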
6934  static inline QEMU_ALWAYS_INLINE
6935  void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6936                 target_ulong base, uint32_t desc, uintptr_t retaddr,
6937                 uint32_t mtedesc, int esize, int msize,
6938                 zreg_off_fn *off_fn,
6939                 sve_ldst1_host_fn *host_fn,
6940                 sve_ldst1_tlb_fn *tlb_fn)
6941  {
6942      const int mmu_idx = arm_env_mmu_index(env);
6943      const intptr_t reg_max = simd_oprsz(desc);
6944      const int scale = simd_data(desc);
6945      void *host[ARM_MAX_VQ * 4];
6946      intptr_t reg_off, i;
6947      SVEHostPage info, info2;
6948  
6949      /*
6950       * Probe all of the elements for host addresses and flags.
6951       */
6952      i = reg_off = 0;
6953      do {
6954          uint64_t pg = vg[reg_off >> 6];
6955          do {
6956              target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6957              target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6958  
6959              host[i] = NULL;
6960              if (likely((pg >> (reg_off & 63)) & 1)) {
6961                  if (likely(in_page >= msize)) {
6962                      sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6963                                     mmu_idx, retaddr);
6964                      if (!(info.flags & TLB_MMIO)) {
6965                          host[i] = info.host;
6966                      }
6967                  } else {
6968                      /*
6969                       * Element crosses the page boundary.
6970                       * Probe both pages, but do not record the host address,
6971                       * so that we use the slow path.
6972                       */
6973                      sve_probe_page(&info, false, env, addr, 0,
6974                                     MMU_DATA_STORE, mmu_idx, retaddr);
6975                      sve_probe_page(&info2, false, env, addr + in_page, 0,
6976                                     MMU_DATA_STORE, mmu_idx, retaddr);
6977                      info.flags |= info2.flags;
6978                  }
6979  
6980                  if (unlikely(info.flags & TLB_WATCHPOINT)) {
6981                      cpu_check_watchpoint(env_cpu(env), addr, msize,
6982                                           info.attrs, BP_MEM_WRITE, retaddr);
6983                  }
6984  
6985                  if (mtedesc && info.tagged) {
6986                      mte_check(env, mtedesc, addr, retaddr);
6987                  }
6988              }
6989              i += 1;
6990              reg_off += esize;
6991          } while (reg_off & 63);
6992      } while (reg_off < reg_max);
6993  
6994      /*
6995       * Now that we have recognized all exceptions except SyncExternal
6996       * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6997       *
6998       * Note for the common case of an element in RAM, not crossing a page
6999       * boundary, we have stored the host address in host[].  This doubles
7000       * as a first-level check against the predicate, since only enabled
7001       * elements have non-null host addresses.
7002       */
7003      i = reg_off = 0;
7004      do {
7005          void *h = host[i];
7006          if (likely(h != NULL)) {
7007              set_helper_retaddr(retaddr);
7008              host_fn(vd, reg_off, h);
7009              clear_helper_retaddr();
7010          } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7011              target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7012              tlb_fn(env, vd, reg_off, addr, retaddr);
7013          }
7014          i += 1;
7015          reg_off += esize;
7016      } while (reg_off < reg_max);
7017  }
7018  
7019  static inline QEMU_ALWAYS_INLINE
7020  void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7021                     target_ulong base, uint32_t desc, uintptr_t retaddr,
7022                     int esize, int msize, zreg_off_fn *off_fn,
7023                     sve_ldst1_host_fn *host_fn,
7024                     sve_ldst1_tlb_fn *tlb_fn)
7025  {
7026      uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7027      /* Remove mtedesc from the normal sve descriptor. */
7028      desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7029  
7030      /*
7031       * ??? TODO: With 32-bit offset extractions, base + ofs cannot move
7032       * base entirely across the address space hole, so it cannot change
7033       * the pointer tag or the bit55 selector.  We could therefore
7034       * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7035       */
7036      sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7037                esize, msize, off_fn, host_fn, tlb_fn);
7038  }
7039  
7040  #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7041  void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7042                                   void *vm, target_ulong base, uint32_t desc) \
7043  {                                                                       \
7044      sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7045                off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7046  }                                                                       \
7047  void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7048      void *vm, target_ulong base, uint32_t desc)                         \
7049  {                                                                       \
7050      sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7051                    off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7052  }
7053  
7054  #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7055  void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7056                                   void *vm, target_ulong base, uint32_t desc) \
7057  {                                                                       \
7058      sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7059                off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7060  }                                                                       \
7061  void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7062      void *vm, target_ulong base, uint32_t desc)                         \
7063  {                                                                       \
7064      sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7065                    off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7066  }
7067  
7068  DO_ST1_ZPZ_S(bs, zsu, MO_8)
7069  DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7070  DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7071  DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7072  DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7073  
7074  DO_ST1_ZPZ_S(bs, zss, MO_8)
7075  DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7076  DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7077  DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7078  DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7079  
7080  DO_ST1_ZPZ_D(bd, zsu, MO_8)
7081  DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7082  DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7083  DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7084  DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7085  DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7086  DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7087  
7088  DO_ST1_ZPZ_D(bd, zss, MO_8)
7089  DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7090  DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7091  DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7092  DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7093  DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7094  DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7095  
7096  DO_ST1_ZPZ_D(bd, zd, MO_8)
7097  DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7098  DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7099  DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7100  DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7101  DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7102  DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7103  
7104  #undef DO_ST1_ZPZ_S
7105  #undef DO_ST1_ZPZ_D
7106  
7107  void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7108  {
7109      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7110      uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7111  
7112      for (i = 0; i < opr_sz; ++i) {
7113          d[i] = n[i] ^ m[i] ^ k[i];
7114      }
7115  }
7116  
7117  void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7118  {
7119      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7120      uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7121  
7122      for (i = 0; i < opr_sz; ++i) {
7123          d[i] = n[i] ^ (m[i] & ~k[i]);
7124      }
7125  }
7126  
7127  void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7128  {
7129      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7130      uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7131  
7132      for (i = 0; i < opr_sz; ++i) {
7133          d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7134      }
7135  }
7136  
7137  void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7138  {
7139      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7140      uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7141  
7142      for (i = 0; i < opr_sz; ++i) {
7143          d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7144      }
7145  }
7146  
7147  void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7148  {
7149      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7150      uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7151  
7152      for (i = 0; i < opr_sz; ++i) {
7153          d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7154      }
7155  }
7156  
7157  /*
7158   * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7159   * See hasless(v,1) from
7160   *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7161   */
7162  static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7163  {
7164      int bits = 8 << esz;
7165      uint64_t ones = dup_const(esz, 1);
7166      uint64_t signs = ones << (bits - 1);
7167      uint64_t cmp0, cmp1;
7168  
7169      cmp1 = dup_const(esz, n);
7170      cmp0 = cmp1 ^ m0;
7171      cmp1 = cmp1 ^ m1;
7172      cmp0 = (cmp0 - ones) & ~cmp0;
7173      cmp1 = (cmp1 - ones) & ~cmp1;
7174      return (cmp0 | cmp1) & signs;
7175  }
7176  
7177  static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7178                                  uint32_t desc, int esz, bool nmatch)
7179  {
7180      uint16_t esz_mask = pred_esz_masks[esz];
7181      intptr_t opr_sz = simd_oprsz(desc);
7182      uint32_t flags = PREDTEST_INIT;
7183      intptr_t i, j, k;
7184  
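    /*
     * Process 16 bytes of the first source per iteration: m0/m1 hold the
     * matching 16-byte segment of the second source, pg holds the 16
     * predicate bits covering the segment, and out accumulates the result
     * bits that also feed the NZCV flags.
     */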
7185      for (i = 0; i < opr_sz; i += 16) {
7186          uint64_t m0 = *(uint64_t *)(vm + i);
7187          uint64_t m1 = *(uint64_t *)(vm + i + 8);
7188          uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7189          uint16_t out = 0;
7190  
7191          for (j = 0; j < 16; j += 8) {
7192              uint64_t n = *(uint64_t *)(vn + i + j);
7193  
7194              for (k = 0; k < 8; k += 1 << esz) {
7195                  if (pg & (1 << (j + k))) {
7196                      bool o = do_match2(n >> (k * 8), m0, m1, esz);
7197                      out |= (o ^ nmatch) << (j + k);
7198                  }
7199              }
7200          }
7201          *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7202          flags = iter_predtest_fwd(out, pg, flags);
7203      }
7204      return flags;
7205  }
7206  
7207  #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7208  uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7209  {                                                                             \
7210      return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7211  }
7212  
7213  DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7214  DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7215  
7216  DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7217  DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7218  
7219  #undef DO_PPZZ_MATCH
7220  
7221  void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7222                              uint32_t desc)
7223  {
7224      ARMVectorReg scratch;
7225      intptr_t i, j;
7226      intptr_t opr_sz = simd_oprsz(desc);
7227      uint32_t *d = vd, *n = vn, *m = vm;
7228      uint8_t *pg = vg;
7229  
7230      if (d == n) {
7231          n = memcpy(&scratch, n, opr_sz);
7232          if (d == m) {
7233              m = n;
7234          }
7235      } else if (d == m) {
7236          m = memcpy(&scratch, m, opr_sz);
7237      }
7238  
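    /*
     * For each active element i, count the active elements at or below
     * index i whose value in the second source equals element i of the
     * first source; inactive destination elements are written as zero.
     */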
7239      for (i = 0; i < opr_sz; i += 4) {
7240          uint64_t count = 0;
7241          uint8_t pred;
7242  
7243          pred = pg[H1(i >> 3)] >> (i & 7);
7244          if (pred & 1) {
7245              uint32_t nn = n[H4(i >> 2)];
7246  
7247              for (j = 0; j <= i; j += 4) {
7248                  pred = pg[H1(j >> 3)] >> (j & 7);
7249                  if ((pred & 1) && nn == m[H4(j >> 2)]) {
7250                      ++count;
7251                  }
7252              }
7253          }
7254          d[H4(i >> 2)] = count;
7255      }
7256  }
7257  
7258  void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7259                              uint32_t desc)
7260  {
7261      ARMVectorReg scratch;
7262      intptr_t i, j;
7263      intptr_t opr_sz = simd_oprsz(desc);
7264      uint64_t *d = vd, *n = vn, *m = vm;
7265      uint8_t *pg = vg;
7266  
7267      if (d == n) {
7268          n = memcpy(&scratch, n, opr_sz);
7269          if (d == m) {
7270              m = n;
7271          }
7272      } else if (d == m) {
7273          m = memcpy(&scratch, m, opr_sz);
7274      }
7275  
7276      for (i = 0; i < opr_sz / 8; ++i) {
7277          uint64_t count = 0;
7278          if (pg[H1(i)] & 1) {
7279              uint64_t nn = n[i];
7280              for (j = 0; j <= i; ++j) {
7281                  if ((pg[H1(j)] & 1) && nn == m[j]) {
7282                      ++count;
7283                  }
7284              }
7285          }
7286          d[i] = count;
7287      }
7288  }
7289  
7290  /*
7291   * Returns the number of bytes in m0 and m1 that match n.
7292   * Unlike do_match2 we don't just need true/false, we need an exact count.
7293   * This requires two extra logical operations.
7294   */
7295  static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7296  {
7297      const uint64_t mask = dup_const(MO_8, 0x7f);
7298      uint64_t cmp0, cmp1;
7299  
7300      cmp1 = dup_const(MO_8, n);
7301      cmp0 = cmp1 ^ m0;
7302      cmp1 = cmp1 ^ m1;
7303  
7304      /*
7305       * 1: clear msb of each byte to avoid carry to next byte (& mask)
7306       * 2: carry in to msb if byte != 0 (+ mask)
7307       * 3: set msb if cmp has msb set (| cmp)
7308       * 4: set ~msb to ignore them (| mask)
7309       * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7310       * 5: invert, resulting in 0x80 if and only if byte == 0.
7311       */
7312      cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7313      cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7314  
7315      /*
7316       * Combine the two compares in a way that the bits do
7317       * not overlap, and so preserves the count of set bits.
7318       * If the host has an efficient instruction for ctpop,
7319       * then ctpop(x) + ctpop(y) has the same number of
7320       * operations as ctpop(x | (y >> 1)).  If the host does
7321       * not have an efficient ctpop, then we only want to
7322       * use it once.
7323       */
7324      return ctpop64(cmp0 | (cmp1 >> 1));
7325  }
7326  
7327  void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7328  {
7329      intptr_t i, j;
7330      intptr_t opr_sz = simd_oprsz(desc);
7331  
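    /*
     * For each byte of the first source, count how many bytes within the
     * same 16-byte segment of the second source are equal to it, and
     * store that count in the corresponding byte of the destination.
     */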
7332      for (i = 0; i < opr_sz; i += 16) {
7333          uint64_t n0 = *(uint64_t *)(vn + i);
7334          uint64_t m0 = *(uint64_t *)(vm + i);
7335          uint64_t n1 = *(uint64_t *)(vn + i + 8);
7336          uint64_t m1 = *(uint64_t *)(vm + i + 8);
7337          uint64_t out0 = 0;
7338          uint64_t out1 = 0;
7339  
7340          for (j = 0; j < 64; j += 8) {
7341              uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7342              uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7343              out0 |= cnt0 << j;
7344              out1 |= cnt1 << j;
7345          }
7346  
7347          *(uint64_t *)(vd + i) = out0;
7348          *(uint64_t *)(vd + i + 8) = out1;
7349      }
7350  }
7351  
7352  void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7353  {
7354      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7355      int shr = simd_data(desc);
7356      int shl = 8 - shr;
7357      uint64_t mask = dup_const(MO_8, 0xff >> shr);
7358      uint64_t *d = vd, *n = vn, *m = vm;
7359  
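    /*
     * XAR: XOR the two inputs, then rotate each 8-bit lane right by shr.
     * The rotate is done on the full 64-bit word, with mask selecting the
     * bits that belong to the right-shifted part of each lane.
     */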
7360      for (i = 0; i < opr_sz; ++i) {
7361          uint64_t t = n[i] ^ m[i];
7362          d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7363      }
7364  }
7365  
7366  void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7367  {
7368      intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7369      int shr = simd_data(desc);
7370      int shl = 16 - shr;
7371      uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7372      uint64_t *d = vd, *n = vn, *m = vm;
7373  
7374      for (i = 0; i < opr_sz; ++i) {
7375          uint64_t t = n[i] ^ m[i];
7376          d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7377      }
7378  }
7379  
7380  void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7381  {
7382      intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7383      int shr = simd_data(desc);
7384      uint32_t *d = vd, *n = vn, *m = vm;
7385  
7386      for (i = 0; i < opr_sz; ++i) {
7387          d[i] = ror32(n[i] ^ m[i], shr);
7388      }
7389  }
7390  
7391  void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7392                       void *status, uint32_t desc)
7393  {
7394      intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7395  
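    /*
     * Each 128-bit segment holds a row-major 2x2 matrix of float32;
     * compute D = A + N * M^T for that segment, element by element.
     */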
7396      for (s = 0; s < opr_sz; ++s) {
7397          float32 *n = vn + s * sizeof(float32) * 4;
7398          float32 *m = vm + s * sizeof(float32) * 4;
7399          float32 *a = va + s * sizeof(float32) * 4;
7400          float32 *d = vd + s * sizeof(float32) * 4;
7401          float32 n00 = n[H4(0)], n01 = n[H4(1)];
7402          float32 n10 = n[H4(2)], n11 = n[H4(3)];
7403          float32 m00 = m[H4(0)], m01 = m[H4(1)];
7404          float32 m10 = m[H4(2)], m11 = m[H4(3)];
7405          float32 p0, p1;
7406  
7407          /* i = 0, j = 0 */
7408          p0 = float32_mul(n00, m00, status);
7409          p1 = float32_mul(n01, m01, status);
7410          d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7411  
7412          /* i = 0, j = 1 */
7413          p0 = float32_mul(n00, m10, status);
7414          p1 = float32_mul(n01, m11, status);
7415          d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7416  
7417          /* i = 1, j = 0 */
7418          p0 = float32_mul(n10, m00, status);
7419          p1 = float32_mul(n11, m01, status);
7420          d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7421  
7422          /* i = 1, j = 1 */
7423          p0 = float32_mul(n10, m10, status);
7424          p1 = float32_mul(n11, m11, status);
7425          d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7426      }
7427  }
7428  
7429  void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7430                       void *status, uint32_t desc)
7431  {
7432      intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7433  
7434      for (s = 0; s < opr_sz; ++s) {
7435          float64 *n = vn + s * sizeof(float64) * 4;
7436          float64 *m = vm + s * sizeof(float64) * 4;
7437          float64 *a = va + s * sizeof(float64) * 4;
7438          float64 *d = vd + s * sizeof(float64) * 4;
7439          float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7440          float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7441          float64 p0, p1;
7442  
7443          /* i = 0, j = 0 */
7444          p0 = float64_mul(n00, m00, status);
7445          p1 = float64_mul(n01, m01, status);
7446          d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7447  
7448          /* i = 0, j = 1 */
7449          p0 = float64_mul(n00, m10, status);
7450          p1 = float64_mul(n01, m11, status);
7451          d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7452  
7453          /* i = 1, j = 0 */
7454          p0 = float64_mul(n10, m00, status);
7455          p1 = float64_mul(n11, m01, status);
7456          d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7457  
7458          /* i = 1, j = 1 */
7459          p0 = float64_mul(n10, m10, status);
7460          p1 = float64_mul(n11, m11, status);
7461          d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7462      }
7463  }
7464  
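/*
 * Narrowing conversions that write the "top" half of each destination
 * element: walk the vector from the last element downward and, for each
 * active wide element, store the narrowed result into the upper half of
 * that element's slot in the destination.
 */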
7465  #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7466  void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7467  {                                                                             \
7468      intptr_t i = simd_oprsz(desc);                                            \
7469      uint64_t *g = vg;                                                         \
7470      do {                                                                      \
7471          uint64_t pg = g[(i - 1) >> 6];                                        \
7472          do {                                                                  \
7473              i -= sizeof(TYPEW);                                               \
7474              if (likely((pg >> (i & 63)) & 1)) {                               \
7475                  TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7476                  *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7477              }                                                                 \
7478          } while (i & 63);                                                     \
7479      } while (i != 0);                                                         \
7480  }
7481  
7482  DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7483  DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7484  DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7485  
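/*
 * Widening conversions from the "top" half of each source element: for
 * each active element, read the narrow value from the upper half of the
 * source slot and write the widened result to the full slot.
 */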
7486  #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7487  void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7488  {                                                                             \
7489      intptr_t i = simd_oprsz(desc);                                            \
7490      uint64_t *g = vg;                                                         \
7491      do {                                                                      \
7492          uint64_t pg = g[(i - 1) >> 6];                                        \
7493          do {                                                                  \
7494              i -= sizeof(TYPEW);                                               \
7495              if (likely((pg >> (i & 63)) & 1)) {                               \
7496                  TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7497                  *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7498              }                                                                 \
7499          } while (i & 63);                                                     \
7500      } while (i != 0);                                                         \
7501  }
7502  
7503  DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7504  DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7505  
7506  #undef DO_FCVTLT
7507  #undef DO_FCVTNT
7508