1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
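
/*
 * Worked example (illustrative): a predicate byte of 0x05 (bits 0 and 2
 * set) selects byte elements 0 and 2, so
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff.  A caller can then
 * blend a predicated result into the destination with something like
 *
 *     uint64_t mask = expand_pred_b_data[pg_byte];
 *     d = (result & mask) | (d & ~mask);
 *
 * where pg_byte, result and d are hypothetical names for one predicate
 * byte and one 64-bit lane.
 */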
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
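
/*
 * Illustrative example: only the even predicate bits are significant
 * for half-word elements, so 0x11 (bits 0 and 4) selects half-words
 * 0 and 2, giving expand_pred_h_data[0x11] == 0x0000ffff0000ffff.
 */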
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify:
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
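
/*
 * Worked example of the simplification above (illustrative): with
 * src1 = src2 = 64, src3 = 0, round = true:
 *   ret = 64 * 64 = 4096;  ret += (0 << 7) + (1 << 6), giving 4160;
 *   ret >>= 7  ->  32, matching ((4096 << 1) + (1 << 7)) >> 8 from the
 *   unsimplified form.
 * With src1 = src2 = INT8_MIN the doubled product is 0x8000, which
 * cannot be represented in int8_t, so the result saturates to INT8_MAX.
 */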
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
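
/*
 * Unlike the 8-bit helper above, the 16- and 32-bit variants report
 * saturation through *sat as well as clamping the result.  The Neon
 * helpers point this at env->vfp.qc[0] so the sticky QC flag gets set;
 * the SVE2 helpers pass a local dummy and discard it.
 */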
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
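
/*
 * In this and the following "_idx" helpers, the index selects a single
 * element within each 128-bit segment of vm; that value (mm) is then
 * reused for every element of vn in the same segment.
 */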
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
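    /*
     * The 128-bit value fits in int64_t iff the high half is the
     * sign-extension of the low half; otherwise saturate in the
     * direction indicated by the high half's sign.
     */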
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
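
/*
 * Illustrative example: for SQRDMULH with n == m == INT64_MIN the
 * doubled product is 2**127, which no int64_t can hold, so the 128-bit
 * intermediate saturates to INT64_MAX via do_sat128_d().
 */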
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8 and 16-bit dot-product.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
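
/*
 * Worked example for one lane of gvec_sdot_b (illustrative): with
 * n = {1, 2, 3, 4}, m = {5, 6, 7, 8} in one 32-bit lane and a = 10,
 * the result is 10 + 1*5 + 2*6 + 3*7 + 4*8 = 80.
 */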
833 
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841      * first iteration might not be a full 16 byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
867 
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
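
/*
 * Illustrative example: gvec_sdot_idx_b with index == 1 on a 32-byte
 * SVE operation.  Result lanes 0-3 are each dotted against the second
 * 32-bit group of the first 16-byte segment of m, and lanes 4-7 against
 * the second group of the second segment; n is always read per-lane,
 * exactly as in the non-indexed form.
 */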
874 
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876                          void *vfpst, uint32_t desc)
877 {
878     uintptr_t opr_sz = simd_oprsz(desc);
879     float16 *d = vd;
880     float16 *n = vn;
881     float16 *m = vm;
882     float_status *fpst = vfpst;
883     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
884     uint32_t neg_imag = neg_real ^ 1;
885     uintptr_t i;
886 
887     /* Shift boolean to the sign bit so we can xor to negate.  */
888     neg_real <<= 15;
889     neg_imag <<= 15;
890 
891     for (i = 0; i < opr_sz / 2; i += 2) {
892         float16 e0 = n[H2(i)];
893         float16 e1 = m[H2(i + 1)] ^ neg_imag;
894         float16 e2 = n[H2(i + 1)];
895         float16 e3 = m[H2(i)] ^ neg_real;
896 
897         d[H2(i)] = float16_add(e0, e1, fpst);
898         d[H2(i + 1)] = float16_add(e2, e3, fpst);
899     }
900     clear_tail(d, opr_sz, simd_maxsz(desc));
901 }
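
/*
 * In the loop above and in the 32- and 64-bit variants below, each
 * even/odd pair of elements is one complex number.  Exactly one of
 * neg_real and neg_imag is set, so each result pair is either
 *   (n.re - m.im, n.im + m.re)   -- m rotated by +90 degrees (FCADD #90)
 * or
 *   (n.re + m.im, n.im - m.re)   -- m rotated by -90 degrees (FCADD #270).
 */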
902 
903 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
904                          void *vfpst, uint32_t desc)
905 {
906     uintptr_t opr_sz = simd_oprsz(desc);
907     float32 *d = vd;
908     float32 *n = vn;
909     float32 *m = vm;
910     float_status *fpst = vfpst;
911     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
912     uint32_t neg_imag = neg_real ^ 1;
913     uintptr_t i;
914 
915     /* Shift boolean to the sign bit so we can xor to negate.  */
916     neg_real <<= 31;
917     neg_imag <<= 31;
918 
919     for (i = 0; i < opr_sz / 4; i += 2) {
920         float32 e0 = n[H4(i)];
921         float32 e1 = m[H4(i + 1)] ^ neg_imag;
922         float32 e2 = n[H4(i + 1)];
923         float32 e3 = m[H4(i)] ^ neg_real;
924 
925         d[H4(i)] = float32_add(e0, e1, fpst);
926         d[H4(i + 1)] = float32_add(e2, e3, fpst);
927     }
928     clear_tail(d, opr_sz, simd_maxsz(desc));
929 }
930 
931 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
932                          void *vfpst, uint32_t desc)
933 {
934     uintptr_t opr_sz = simd_oprsz(desc);
935     float64 *d = vd;
936     float64 *n = vn;
937     float64 *m = vm;
938     float_status *fpst = vfpst;
939     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
940     uint64_t neg_imag = neg_real ^ 1;
941     uintptr_t i;
942 
943     /* Shift boolean to the sign bit so we can xor to negate.  */
944     neg_real <<= 63;
945     neg_imag <<= 63;
946 
947     for (i = 0; i < opr_sz / 8; i += 2) {
948         float64 e0 = n[i];
949         float64 e1 = m[i + 1] ^ neg_imag;
950         float64 e2 = n[i + 1];
951         float64 e3 = m[i] ^ neg_real;
952 
953         d[i] = float64_add(e0, e1, fpst);
954         d[i + 1] = float64_add(e2, e3, fpst);
955     }
956     clear_tail(d, opr_sz, simd_maxsz(desc));
957 }
958 
959 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
960                          void *vfpst, uint32_t desc)
961 {
962     uintptr_t opr_sz = simd_oprsz(desc);
963     float16 *d = vd, *n = vn, *m = vm, *a = va;
964     float_status *fpst = vfpst;
965     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
966     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
967     uint32_t neg_real = flip ^ neg_imag;
968     uintptr_t i;
969 
970     /* Shift boolean to the sign bit so we can xor to negate.  */
971     neg_real <<= 15;
972     neg_imag <<= 15;
973 
974     for (i = 0; i < opr_sz / 2; i += 2) {
975         float16 e2 = n[H2(i + flip)];
976         float16 e1 = m[H2(i + flip)] ^ neg_real;
977         float16 e4 = e2;
978         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
979 
980         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
981         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
982     }
983     clear_tail(d, opr_sz, simd_maxsz(desc));
984 }
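
/*
 * Sketch of how the loop above realises the four FCMLA rotations
 * (derived from the code; a.re/a.im denote the accumulator pair from
 * va).  flip chooses whether the real (0) or imaginary (1) element of n
 * is the common multiplicand, and neg_real/neg_imag supply the signs:
 *   flip=0 neg_imag=0:  a.re + n.re*m.re,  a.im + n.re*m.im   (rot 0)
 *   flip=1 neg_imag=0:  a.re - n.im*m.im,  a.im + n.im*m.re   (rot 90)
 *   flip=0 neg_imag=1:  a.re - n.re*m.re,  a.im - n.re*m.im   (rot 180)
 *   flip=1 neg_imag=1:  a.re + n.im*m.im,  a.im - n.im*m.re   (rot 270)
 */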
985 
986 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
987                              void *vfpst, uint32_t desc)
988 {
989     uintptr_t opr_sz = simd_oprsz(desc);
990     float16 *d = vd, *n = vn, *m = vm, *a = va;
991     float_status *fpst = vfpst;
992     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
993     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
994     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
995     uint32_t neg_real = flip ^ neg_imag;
996     intptr_t elements = opr_sz / sizeof(float16);
997     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
998     intptr_t i, j;
999 
1000     /* Shift boolean to the sign bit so we can xor to negate.  */
1001     neg_real <<= 15;
1002     neg_imag <<= 15;
1003 
1004     for (i = 0; i < elements; i += eltspersegment) {
1005         float16 mr = m[H2(i + 2 * index + 0)];
1006         float16 mi = m[H2(i + 2 * index + 1)];
1007         float16 e1 = neg_real ^ (flip ? mi : mr);
1008         float16 e3 = neg_imag ^ (flip ? mr : mi);
1009 
1010         for (j = i; j < i + eltspersegment; j += 2) {
1011             float16 e2 = n[H2(j + flip)];
1012             float16 e4 = e2;
1013 
1014             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
1015             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
1016         }
1017     }
1018     clear_tail(d, opr_sz, simd_maxsz(desc));
1019 }
1020 
1021 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1022                          void *vfpst, uint32_t desc)
1023 {
1024     uintptr_t opr_sz = simd_oprsz(desc);
1025     float32 *d = vd, *n = vn, *m = vm, *a = va;
1026     float_status *fpst = vfpst;
1027     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1028     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1029     uint32_t neg_real = flip ^ neg_imag;
1030     uintptr_t i;
1031 
1032     /* Shift boolean to the sign bit so we can xor to negate.  */
1033     neg_real <<= 31;
1034     neg_imag <<= 31;
1035 
1036     for (i = 0; i < opr_sz / 4; i += 2) {
1037         float32 e2 = n[H4(i + flip)];
1038         float32 e1 = m[H4(i + flip)] ^ neg_real;
1039         float32 e4 = e2;
1040         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
1041 
1042         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
1043         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
1044     }
1045     clear_tail(d, opr_sz, simd_maxsz(desc));
1046 }
1047 
1048 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1049                              void *vfpst, uint32_t desc)
1050 {
1051     uintptr_t opr_sz = simd_oprsz(desc);
1052     float32 *d = vd, *n = vn, *m = vm, *a = va;
1053     float_status *fpst = vfpst;
1054     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1055     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1056     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1057     uint32_t neg_real = flip ^ neg_imag;
1058     intptr_t elements = opr_sz / sizeof(float32);
1059     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1060     intptr_t i, j;
1061 
1062     /* Shift boolean to the sign bit so we can xor to negate.  */
1063     neg_real <<= 31;
1064     neg_imag <<= 31;
1065 
1066     for (i = 0; i < elements; i += eltspersegment) {
1067         float32 mr = m[H4(i + 2 * index + 0)];
1068         float32 mi = m[H4(i + 2 * index + 1)];
1069         float32 e1 = neg_real ^ (flip ? mi : mr);
1070         float32 e3 = neg_imag ^ (flip ? mr : mi);
1071 
1072         for (j = i; j < i + eltspersegment; j += 2) {
1073             float32 e2 = n[H4(j + flip)];
1074             float32 e4 = e2;
1075 
1076             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
1077             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
1078         }
1079     }
1080     clear_tail(d, opr_sz, simd_maxsz(desc));
1081 }
1082 
1083 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1084                          void *vfpst, uint32_t desc)
1085 {
1086     uintptr_t opr_sz = simd_oprsz(desc);
1087     float64 *d = vd, *n = vn, *m = vm, *a = va;
1088     float_status *fpst = vfpst;
1089     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1090     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1091     uint64_t neg_real = flip ^ neg_imag;
1092     uintptr_t i;
1093 
1094     /* Shift boolean to the sign bit so we can xor to negate.  */
1095     neg_real <<= 63;
1096     neg_imag <<= 63;
1097 
1098     for (i = 0; i < opr_sz / 8; i += 2) {
1099         float64 e2 = n[i + flip];
1100         float64 e1 = m[i + flip] ^ neg_real;
1101         float64 e4 = e2;
1102         float64 e3 = m[i + 1 - flip] ^ neg_imag;
1103 
1104         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
1105         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
1106     }
1107     clear_tail(d, opr_sz, simd_maxsz(desc));
1108 }
1109 
1110 /*
1111  * Floating point comparisons producing an integer result (all 1s or all 0s).
1112  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1113  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1114  */
1115 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1116 {
1117     return -float16_eq_quiet(op1, op2, stat);
1118 }
1119 
1120 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1121 {
1122     return -float32_eq_quiet(op1, op2, stat);
1123 }
1124 
1125 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1126 {
1127     return -float64_eq_quiet(op1, op2, stat);
1128 }
1129 
1130 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1131 {
1132     return -float16_le(op2, op1, stat);
1133 }
1134 
1135 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1136 {
1137     return -float32_le(op2, op1, stat);
1138 }
1139 
1140 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1141 {
1142     return -float64_le(op2, op1, stat);
1143 }
1144 
1145 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1146 {
1147     return -float16_lt(op2, op1, stat);
1148 }
1149 
1150 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1151 {
1152     return -float32_lt(op2, op1, stat);
1153 }
1154 
1155 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1156 {
1157     return -float64_lt(op2, op1, stat);
1158 }
1159 
1160 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1161 {
1162     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1163 }
1164 
1165 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1166 {
1167     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1168 }
1169 
1170 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1171 {
1172     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1173 }
1174 
1175 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1176 {
1177     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1178 }
1179 
1180 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1181 {
1182     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1183 }
1184 
1185 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1186 {
1187     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1188 }
1189 
1190 static int16_t vfp_tosszh(float16 x, void *fpstp)
1191 {
1192     float_status *fpst = fpstp;
1193     if (float16_is_any_nan(x)) {
1194         float_raise(float_flag_invalid, fpst);
1195         return 0;
1196     }
1197     return float16_to_int16_round_to_zero(x, fpst);
1198 }
1199 
1200 static uint16_t vfp_touszh(float16 x, void *fpstp)
1201 {
1202     float_status *fpst = fpstp;
1203     if (float16_is_any_nan(x)) {
1204         float_raise(float_flag_invalid, fpst);
1205         return 0;
1206     }
1207     return float16_to_uint16_round_to_zero(x, fpst);
1208 }
1209 
1210 #define DO_2OP(NAME, FUNC, TYPE) \
1211 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1212 {                                                                 \
1213     intptr_t i, oprsz = simd_oprsz(desc);                         \
1214     TYPE *d = vd, *n = vn;                                        \
1215     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1216         d[i] = FUNC(n[i], stat);                                  \
1217     }                                                             \
1218     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1219 }
1220 
1221 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1222 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1223 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1224 
1225 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1226 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1227 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1228 
1229 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1230 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1231 
1232 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1233 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1234 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1235 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1236 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1237 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1238 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1239 DO_2OP(gvec_touszh, vfp_touszh, float16)
1240 
1241 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1242     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1243     {                                                           \
1244         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1245     }
1246 
1247 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1248     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1249     {                                                           \
1250         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1251     }
1252 
1253 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1254     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1255     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1256     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1257     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1258 
1259 DO_2OP_CMP0(cgt, cgt, FWD)
1260 DO_2OP_CMP0(cge, cge, FWD)
1261 DO_2OP_CMP0(ceq, ceq, FWD)
1262 DO_2OP_CMP0(clt, cgt, REV)
1263 DO_2OP_CMP0(cle, cge, REV)
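
/*
 * For reference, DO_2OP_CMP0(clt, cgt, REV) expands (per float type) to
 * a comparison against zero with the operands reversed, e.g.
 *
 *     static float32 float32_clt0(float32 op, float_status *stat)
 *     {
 *         return float32_cgt(float32_zero, op, stat);
 *     }
 *
 * plus the gvec_fclt0_h / gvec_fclt0_s helpers via DO_2OP, so "compare
 * less than zero" reuses the swapped-operand "greater than" comparison.
 */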
1264 
1265 #undef DO_2OP
1266 #undef DO_2OP_CMP0
1267 
1268 /* Floating-point trigonometric starting value.
1269  * See the ARM ARM pseudocode function FPTrigSMul.
1270  */
1271 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1272 {
1273     float16 result = float16_mul(op1, op1, stat);
1274     if (!float16_is_any_nan(result)) {
1275         result = float16_set_sign(result, op2 & 1);
1276     }
1277     return result;
1278 }
1279 
1280 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1281 {
1282     float32 result = float32_mul(op1, op1, stat);
1283     if (!float32_is_any_nan(result)) {
1284         result = float32_set_sign(result, op2 & 1);
1285     }
1286     return result;
1287 }
1288 
1289 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1290 {
1291     float64 result = float64_mul(op1, op1, stat);
1292     if (!float64_is_any_nan(result)) {
1293         result = float64_set_sign(result, op2 & 1);
1294     }
1295     return result;
1296 }
1297 
1298 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1299 {
1300     return float16_abs(float16_sub(op1, op2, stat));
1301 }
1302 
1303 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1304 {
1305     return float32_abs(float32_sub(op1, op2, stat));
1306 }
1307 
1308 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1309 {
1310     return float64_abs(float64_sub(op1, op2, stat));
1311 }
1312 
1313 /*
1314  * Reciprocal step. These are the AArch32 versions, which use a
1315  * non-fused multiply-and-subtract.
1316  */
1317 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1318 {
1319     op1 = float16_squash_input_denormal(op1, stat);
1320     op2 = float16_squash_input_denormal(op2, stat);
1321 
1322     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1323         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1324         return float16_two;
1325     }
1326     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1327 }
1328 
1329 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1330 {
1331     op1 = float32_squash_input_denormal(op1, stat);
1332     op2 = float32_squash_input_denormal(op2, stat);
1333 
1334     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1335         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1336         return float32_two;
1337     }
1338     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1339 }
1340 
1341 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1342 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1343 {
1344     op1 = float16_squash_input_denormal(op1, stat);
1345     op2 = float16_squash_input_denormal(op2, stat);
1346 
1347     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1348         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1349         return float16_one_point_five;
1350     }
1351     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1352     return float16_div(op1, float16_two, stat);
1353 }
1354 
1355 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1356 {
1357     op1 = float32_squash_input_denormal(op1, stat);
1358     op2 = float32_squash_input_denormal(op2, stat);
1359 
1360     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1361         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1362         return float32_one_point_five;
1363     }
1364     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1365     return float32_div(op1, float32_two, stat);
1366 }
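
/*
 * Both step operations feed Newton-Raphson refinement of the
 * VRECPE/VRSQRTE estimates: for a reciprocal estimate x of 1/d the next
 * estimate is x * recps(d, x) = x * (2 - d*x), and for a reciprocal
 * square root estimate x of 1/sqrt(d) it is
 * x * rsqrts(d*x, x) = x * (3 - d*x*x) / 2.  The infinity-times-zero
 * special cases above keep those iterations from producing NaNs.
 */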
1367 
1368 #define DO_3OP(NAME, FUNC, TYPE) \
1369 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1370 {                                                                          \
1371     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1372     TYPE *d = vd, *n = vn, *m = vm;                                        \
1373     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1374         d[i] = FUNC(n[i], m[i], stat);                                     \
1375     }                                                                      \
1376     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1377 }
1378 
1379 DO_3OP(gvec_fadd_h, float16_add, float16)
1380 DO_3OP(gvec_fadd_s, float32_add, float32)
1381 DO_3OP(gvec_fadd_d, float64_add, float64)
1382 
1383 DO_3OP(gvec_fsub_h, float16_sub, float16)
1384 DO_3OP(gvec_fsub_s, float32_sub, float32)
1385 DO_3OP(gvec_fsub_d, float64_sub, float64)
1386 
1387 DO_3OP(gvec_fmul_h, float16_mul, float16)
1388 DO_3OP(gvec_fmul_s, float32_mul, float32)
1389 DO_3OP(gvec_fmul_d, float64_mul, float64)
1390 
1391 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1392 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1393 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1394 
1395 DO_3OP(gvec_fabd_h, float16_abd, float16)
1396 DO_3OP(gvec_fabd_s, float32_abd, float32)
1397 DO_3OP(gvec_fabd_d, float64_abd, float64)
1398 
1399 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1400 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1401 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1402 
1403 DO_3OP(gvec_fcge_h, float16_cge, float16)
1404 DO_3OP(gvec_fcge_s, float32_cge, float32)
1405 DO_3OP(gvec_fcge_d, float64_cge, float64)
1406 
1407 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1408 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1409 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1410 
1411 DO_3OP(gvec_facge_h, float16_acge, float16)
1412 DO_3OP(gvec_facge_s, float32_acge, float32)
1413 DO_3OP(gvec_facge_d, float64_acge, float64)
1414 
1415 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1416 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1417 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1418 
1419 DO_3OP(gvec_fmax_h, float16_max, float16)
1420 DO_3OP(gvec_fmax_s, float32_max, float32)
1421 DO_3OP(gvec_fmax_d, float64_max, float64)
1422 
1423 DO_3OP(gvec_fmin_h, float16_min, float16)
1424 DO_3OP(gvec_fmin_s, float32_min, float32)
1425 DO_3OP(gvec_fmin_d, float64_min, float64)
1426 
1427 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1428 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1429 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1430 
1431 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1432 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1433 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1434 
1435 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1436 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1437 
1438 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1439 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1440 
1441 #ifdef TARGET_AARCH64
1442 DO_3OP(gvec_fdiv_h, float16_div, float16)
1443 DO_3OP(gvec_fdiv_s, float32_div, float32)
1444 DO_3OP(gvec_fdiv_d, float64_div, float64)
1445 
1446 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1447 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1448 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1449 
1450 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1451 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1452 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1453 
1454 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1455 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1456 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1457 
1458 #endif
1459 #undef DO_3OP
1460 
1461 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1462 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1463                                  float_status *stat)
1464 {
1465     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1466 }
1467 
1468 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1469                                  float_status *stat)
1470 {
1471     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1472 }
1473 
1474 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1475                                  float_status *stat)
1476 {
1477     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1478 }
1479 
1480 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1481                                  float_status *stat)
1482 {
1483     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1484 }
1485 
1486 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1487 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1488                                 float_status *stat)
1489 {
1490     return float16_muladd(op1, op2, dest, 0, stat);
1491 }
1492 
1493 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1494                                  float_status *stat)
1495 {
1496     return float32_muladd(op1, op2, dest, 0, stat);
1497 }
1498 
1499 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1500                                  float_status *stat)
1501 {
1502     return float64_muladd(op1, op2, dest, 0, stat);
1503 }
1504 
1505 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1506                                  float_status *stat)
1507 {
1508     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1509 }
1510 
1511 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1512                                  float_status *stat)
1513 {
1514     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1515 }
1516 
1517 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1518                                  float_status *stat)
1519 {
1520     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1521 }
1522 
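/*
 * For illustration, the two flavours above differ in rounding: the _nf
 * helpers round the product and the sum separately, while the _f helpers
 * round op1 * op2 + dest once.  Note also the operand order: the helpers
 * take (dest, op1, op2) but float32_muladd() takes the addend last, e.g.
 *
 *     nf = float32_add(d, float32_mul(n, m, st), st);   // two roundings
 *     f  = float32_muladd(n, m, d, 0, st);              // one rounding
 */
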
1523 #define DO_MULADD(NAME, FUNC, TYPE)                                     \
1524 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1525 {                                                                          \
1526     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1527     TYPE *d = vd, *n = vn, *m = vm;                                        \
1528     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1529         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1530     }                                                                      \
1531     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1532 }
1533 
1534 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1535 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1536 
1537 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1538 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1539 
1540 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1541 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1542 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1543 
1544 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1545 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1546 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1547 
1548 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1549  * For AdvSIMD, there is of course only one such vector segment.
1550  */
1551 
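/*
 * Worked example (logical element indices; the H() macros handle host
 * endianness): gvec_mul_idx_s with oprsz == 32 and idx == 1 has
 * segment == 4, so
 *     d[0..3] = n[0..3] * m[1];        first 128-bit segment
 *     d[4..7] = n[4..7] * m[4 + 1];    second 128-bit segment
 * With oprsz <= 16 (AdvSIMD) only the first segment exists.
 */
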
1552 #define DO_MUL_IDX(NAME, TYPE, H) \
1553 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1554 {                                                                          \
1555     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1556     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1557     intptr_t idx = simd_data(desc);                                        \
1558     TYPE *d = vd, *n = vn, *m = vm;                                        \
1559     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1560         TYPE mm = m[H(i + idx)];                                           \
1561         for (j = 0; j < segment; j++) {                                    \
1562             d[i + j] = n[i + j] * mm;                                      \
1563         }                                                                  \
1564     }                                                                      \
1565     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1566 }
1567 
1568 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1569 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1570 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1571 
1572 #undef DO_MUL_IDX
1573 
1574 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1575 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1576 {                                                                          \
1577     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1578     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1579     intptr_t idx = simd_data(desc);                                        \
1580     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1581     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1582         TYPE mm = m[H(i + idx)];                                           \
1583         for (j = 0; j < segment; j++) {                                    \
1584             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1585         }                                                                  \
1586     }                                                                      \
1587     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1588 }
1589 
1590 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1591 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1592 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1593 
1594 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1595 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1596 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1597 
1598 #undef DO_MLA_IDX
1599 
1600 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1601 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1602 {                                                                          \
1603     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1604     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1605     intptr_t idx = simd_data(desc);                                        \
1606     TYPE *d = vd, *n = vn, *m = vm;                                        \
1607     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1608         TYPE mm = m[H(i + idx)];                                           \
1609         for (j = 0; j < segment; j++) {                                    \
1610             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1611         }                                                                  \
1612     }                                                                      \
1613     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1614 }
1615 
1616 #define nop(N, M, S) (M)
1617 
1618 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1619 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1620 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1621 
1622 #ifdef TARGET_AARCH64
1623 
1624 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1625 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1626 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1627 
1628 #endif
1629 
1630 #undef nop
1631 
1632 /*
1633  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1634  * the fused ops below, these accumulate both from and into Vd.
1635  */
1636 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1637 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1638 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1639 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1640 
1641 #undef DO_FMUL_IDX
1642 
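/*
 * Fused multiply-add of an indexed element.  simd_data(desc) packs two
 * fields: bit 0 selects the negated form (FMLS) and the remaining bits
 * hold the element index within each 128-bit segment.  The negation is
 * applied by XORing the sign bit into op1 before the fused muladd.
 */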
1643 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1644 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1645                   void *stat, uint32_t desc)                               \
1646 {                                                                          \
1647     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1648     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1649     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1650     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1651     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1652     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1653     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1654         TYPE mm = m[H(i + idx)];                                           \
1655         for (j = 0; j < segment; j++) {                                    \
1656             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1657                                      mm, a[i + j], 0, stat);               \
1658         }                                                                  \
1659     }                                                                      \
1660     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1661 }
1662 
1663 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1664 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1665 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1666 
1667 #undef DO_FMLA_IDX
1668 
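/*
 * Saturating add/subtract, setting the QC flag on saturation.  WTYPE is
 * wide enough to hold the unclamped result, TYPEN/TYPEM are the element
 * types of Vd/Vn and Vm, and MIN/MAX are the saturation bounds.  E.g.
 * gvec_uqadd_b computes 200 + 100 as 300 in int, clamps it to UINT8_MAX
 * and sets the low word of vq.
 */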
1669 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1670 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1671 {                                                                          \
1672     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1673     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1674     bool q = false;                                                        \
1675     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1676         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1677         if (dd < MIN) {                                                    \
1678             dd = MIN;                                                      \
1679             q = true;                                                      \
1680         } else if (dd > MAX) {                                             \
1681             dd = MAX;                                                      \
1682             q = true;                                                      \
1683         }                                                                  \
1684         d[i] = dd;                                                         \
1685     }                                                                      \
1686     if (q) {                                                               \
1687         uint32_t *qc = vq;                                                 \
1688         qc[0] = 1;                                                         \
1689     }                                                                      \
1690     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1691 }
1692 
1693 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1694 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1695 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1696 
1697 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1698 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1699 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1700 
1701 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1702 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1703 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1704 
1705 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1706 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1707 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1708 
1709 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1710 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1711 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1712 
1713 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1714 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1715 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1716 
1717 #undef DO_SAT
1718 
1719 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1720                           void *vm, uint32_t desc)
1721 {
1722     intptr_t i, oprsz = simd_oprsz(desc);
1723     uint64_t *d = vd, *n = vn, *m = vm;
1724     bool q = false;
1725 
1726     for (i = 0; i < oprsz / 8; i++) {
1727         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1728         if (dd < nn) {
1729             dd = UINT64_MAX;
1730             q = true;
1731         }
1732         d[i] = dd;
1733     }
1734     if (q) {
1735         uint32_t *qc = vq;
1736         qc[0] = 1;
1737     }
1738     clear_tail(d, oprsz, simd_maxsz(desc));
1739 }
1740 
1741 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1742                           void *vm, uint32_t desc)
1743 {
1744     intptr_t i, oprsz = simd_oprsz(desc);
1745     uint64_t *d = vd, *n = vn, *m = vm;
1746     bool q = false;
1747 
1748     for (i = 0; i < oprsz / 8; i++) {
1749         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1750         if (nn < mm) {
1751             dd = 0;
1752             q = true;
1753         }
1754         d[i] = dd;
1755     }
1756     if (q) {
1757         uint32_t *qc = vq;
1758         qc[0] = 1;
1759     }
1760     clear_tail(d, oprsz, simd_maxsz(desc));
1761 }
1762 
1763 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1764                           void *vm, uint32_t desc)
1765 {
1766     intptr_t i, oprsz = simd_oprsz(desc);
1767     int64_t *d = vd, *n = vn, *m = vm;
1768     bool q = false;
1769 
1770     for (i = 0; i < oprsz / 8; i++) {
1771         int64_t nn = n[i], mm = m[i], dd = nn + mm;
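        /*
         * Signed overflow occurred iff nn and mm have the same sign but
         * dd differs from them; saturate to INT64_MAX for positive nn,
         * INT64_MIN for negative nn.
         */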
1772         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1773             dd = (nn >> 63) ^ ~INT64_MIN;
1774             q = true;
1775         }
1776         d[i] = dd;
1777     }
1778     if (q) {
1779         uint32_t *qc = vq;
1780         qc[0] = 1;
1781     }
1782     clear_tail(d, oprsz, simd_maxsz(desc));
1783 }
1784 
1785 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1786                           void *vm, uint32_t desc)
1787 {
1788     intptr_t i, oprsz = simd_oprsz(desc);
1789     int64_t *d = vd, *n = vn, *m = vm;
1790     bool q = false;
1791 
1792     for (i = 0; i < oprsz / 8; i++) {
1793         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1794         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1795             dd = (nn >> 63) ^ ~INT64_MIN;
1796             q = true;
1797         }
1798         d[i] = dd;
1799     }
1800     if (q) {
1801         uint32_t *qc = vq;
1802         qc[0] = 1;
1803     }
1804     clear_tail(d, oprsz, simd_maxsz(desc));
1805 }
1806 
1807 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1808                            void *vm, uint32_t desc)
1809 {
1810     intptr_t i, oprsz = simd_oprsz(desc);
1811     uint64_t *d = vd, *n = vn, *m = vm;
1812     bool q = false;
1813 
1814     for (i = 0; i < oprsz / 8; i++) {
1815         uint64_t nn = n[i];
1816         int64_t mm = m[i];
1817         uint64_t dd = nn + mm;
1818 
1819         if (mm < 0) {
1820             if (nn < (uint64_t)-mm) {
1821                 dd = 0;
1822                 q = true;
1823             }
1824         } else {
1825             if (dd < nn) {
1826                 dd = UINT64_MAX;
1827                 q = true;
1828             }
1829         }
1830         d[i] = dd;
1831     }
1832     if (q) {
1833         uint32_t *qc = vq;
1834         qc[0] = 1;
1835     }
1836     clear_tail(d, oprsz, simd_maxsz(desc));
1837 }
1838 
1839 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1840                            void *vm, uint32_t desc)
1841 {
1842     intptr_t i, oprsz = simd_oprsz(desc);
1843     uint64_t *d = vd, *n = vn, *m = vm;
1844     bool q = false;
1845 
1846     for (i = 0; i < oprsz / 8; i++) {
1847         int64_t nn = n[i];
1848         uint64_t mm = m[i];
1849         int64_t dd = nn + mm;
1850 
1851         if (mm > (uint64_t)(INT64_MAX - nn)) {
1852             dd = INT64_MAX;
1853             q = true;
1854         }
1855         d[i] = dd;
1856     }
1857     if (q) {
1858         uint32_t *qc = vq;
1859         qc[0] = 1;
1860     }
1861     clear_tail(d, oprsz, simd_maxsz(desc));
1862 }
1863 
1864 #define DO_SRA(NAME, TYPE)                              \
1865 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1866 {                                                       \
1867     intptr_t i, oprsz = simd_oprsz(desc);               \
1868     int shift = simd_data(desc);                        \
1869     TYPE *d = vd, *n = vn;                              \
1870     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1871         d[i] += n[i] >> shift;                          \
1872     }                                                   \
1873     clear_tail(d, oprsz, simd_maxsz(desc));             \
1874 }
1875 
1876 DO_SRA(gvec_ssra_b, int8_t)
1877 DO_SRA(gvec_ssra_h, int16_t)
1878 DO_SRA(gvec_ssra_s, int32_t)
1879 DO_SRA(gvec_ssra_d, int64_t)
1880 
1881 DO_SRA(gvec_usra_b, uint8_t)
1882 DO_SRA(gvec_usra_h, uint16_t)
1883 DO_SRA(gvec_usra_s, uint32_t)
1884 DO_SRA(gvec_usra_d, uint64_t)
1885 
1886 #undef DO_SRA
1887 
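/*
 * Rounding shift right: the architectural operation adds 1 << (shift - 1)
 * before shifting.  Splitting the shift into (shift - 1) and 1 lets the
 * rounding bit be added after the bits below it are gone, so no wider
 * intermediate type is needed.
 */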
1888 #define DO_RSHR(NAME, TYPE)                             \
1889 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1890 {                                                       \
1891     intptr_t i, oprsz = simd_oprsz(desc);               \
1892     int shift = simd_data(desc);                        \
1893     TYPE *d = vd, *n = vn;                              \
1894     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1895         TYPE tmp = n[i] >> (shift - 1);                 \
1896         d[i] = (tmp >> 1) + (tmp & 1);                  \
1897     }                                                   \
1898     clear_tail(d, oprsz, simd_maxsz(desc));             \
1899 }
1900 
1901 DO_RSHR(gvec_srshr_b, int8_t)
1902 DO_RSHR(gvec_srshr_h, int16_t)
1903 DO_RSHR(gvec_srshr_s, int32_t)
1904 DO_RSHR(gvec_srshr_d, int64_t)
1905 
1906 DO_RSHR(gvec_urshr_b, uint8_t)
1907 DO_RSHR(gvec_urshr_h, uint16_t)
1908 DO_RSHR(gvec_urshr_s, uint32_t)
1909 DO_RSHR(gvec_urshr_d, uint64_t)
1910 
1911 #undef DO_RSHR
1912 
1913 #define DO_RSRA(NAME, TYPE)                             \
1914 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1915 {                                                       \
1916     intptr_t i, oprsz = simd_oprsz(desc);               \
1917     int shift = simd_data(desc);                        \
1918     TYPE *d = vd, *n = vn;                              \
1919     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1920         TYPE tmp = n[i] >> (shift - 1);                 \
1921         d[i] += (tmp >> 1) + (tmp & 1);                 \
1922     }                                                   \
1923     clear_tail(d, oprsz, simd_maxsz(desc));             \
1924 }
1925 
1926 DO_RSRA(gvec_srsra_b, int8_t)
1927 DO_RSRA(gvec_srsra_h, int16_t)
1928 DO_RSRA(gvec_srsra_s, int32_t)
1929 DO_RSRA(gvec_srsra_d, int64_t)
1930 
1931 DO_RSRA(gvec_ursra_b, uint8_t)
1932 DO_RSRA(gvec_ursra_h, uint16_t)
1933 DO_RSRA(gvec_ursra_s, uint32_t)
1934 DO_RSRA(gvec_ursra_d, uint64_t)
1935 
1936 #undef DO_RSRA
1937 
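/*
 * SRI: shift right and insert.  deposit64(d, 0, esize - shift, n >> shift)
 * replaces the low esize - shift bits of d while keeping its top 'shift'
 * bits, e.g. for bytes with shift == 3:  d = (d & 0xe0) | (n >> 3).
 */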
1938 #define DO_SRI(NAME, TYPE)                              \
1939 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1940 {                                                       \
1941     intptr_t i, oprsz = simd_oprsz(desc);               \
1942     int shift = simd_data(desc);                        \
1943     TYPE *d = vd, *n = vn;                              \
1944     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1945         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1946     }                                                   \
1947     clear_tail(d, oprsz, simd_maxsz(desc));             \
1948 }
1949 
1950 DO_SRI(gvec_sri_b, uint8_t)
1951 DO_SRI(gvec_sri_h, uint16_t)
1952 DO_SRI(gvec_sri_s, uint32_t)
1953 DO_SRI(gvec_sri_d, uint64_t)
1954 
1955 #undef DO_SRI
1956 
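/*
 * SLI: shift left and insert.  Here the low 'shift' bits of d are kept
 * and n is deposited above them, e.g. for bytes with shift == 3:
 * d = (d & 0x07) | (n << 3).
 */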
1957 #define DO_SLI(NAME, TYPE)                              \
1958 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1959 {                                                       \
1960     intptr_t i, oprsz = simd_oprsz(desc);               \
1961     int shift = simd_data(desc);                        \
1962     TYPE *d = vd, *n = vn;                              \
1963     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1964         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1965     }                                                   \
1966     clear_tail(d, oprsz, simd_maxsz(desc));             \
1967 }
1968 
1969 DO_SLI(gvec_sli_b, uint8_t)
1970 DO_SLI(gvec_sli_h, uint16_t)
1971 DO_SLI(gvec_sli_s, uint32_t)
1972 DO_SLI(gvec_sli_d, uint64_t)
1973 
1974 #undef DO_SLI
1975 
1976 /*
1977  * Convert float16 to float32, raising no exceptions and
1978  * preserving exceptional values, including SNaN.
1979  * This is effectively an unpack+repack operation.
1980  */
1981 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1982 {
1983     const int f16_bias = 15;
1984     const int f32_bias = 127;
1985     uint32_t sign = extract32(f16, 15, 1);
1986     uint32_t exp = extract32(f16, 10, 5);
1987     uint32_t frac = extract32(f16, 0, 10);
1988 
1989     if (exp == 0x1f) {
1990         /* Inf or NaN */
1991         exp = 0xff;
1992     } else if (exp == 0) {
1993         /* Zero or denormal.  */
1994         if (frac != 0) {
1995             if (fz16) {
1996                 frac = 0;
1997             } else {
1998                 /*
1999                  * Denormal; these are all normal float32.
2000                  * Shift the fraction so that the msb is at bit 11,
2001                  * then remove bit 11 as the implicit bit of the
2002                  * normalized float32.  Note that we still go through
2003                  * the shift for normal numbers below, to put the
2004                  * float32 fraction at the right place.
2005                  */
2006                 int shift = clz32(frac) - 21;
2007                 frac = (frac << shift) & 0x3ff;
2008                 exp = f32_bias - f16_bias - shift + 1;
2009             }
2010         }
2011     } else {
2012         /* Normal number; adjust the bias.  */
2013         exp += f32_bias - f16_bias;
2014     }
2015     sign <<= 31;
2016     exp <<= 23;
2017     frac <<= 23 - 10;
2018 
2019     return sign | exp | frac;
2020 }
2021 
2022 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2023 {
2024     /*
2025      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2026      * Load the 2nd qword iff is_q & is_2.
2027      * Shift to the 2nd dword iff !is_q & is_2.
2028      * For !is_q & !is_2, the upper bits of the result are garbage.
2029      */
2030     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2031 }
2032 
2033 /*
2034  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2035  * as there are not yet SVE versions that might use blocking.
2036  */
2037 
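/*
 * simd_data(desc) layout for the FMLAL/FMLSL helpers: bit 0 selects the
 * subtracting form (FMLSL, which negates the float16 inputs from Vn) and
 * bit 1 selects the second (upper) half of the float16 inputs; the
 * indexed forms place the element index above these two bits.
 */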
2038 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2039                      uint32_t desc, bool fz16)
2040 {
2041     intptr_t i, oprsz = simd_oprsz(desc);
2042     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2043     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2044     int is_q = oprsz == 16;
2045     uint64_t n_4, m_4;
2046 
2047     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2048     n_4 = load4_f16(vn, is_q, is_2);
2049     m_4 = load4_f16(vm, is_q, is_2);
2050 
2051     /* Negate all inputs for FMLSL at once.  */
2052     if (is_s) {
2053         n_4 ^= 0x8000800080008000ull;
2054     }
2055 
2056     for (i = 0; i < oprsz / 4; i++) {
2057         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2058         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2059         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2060     }
2061     clear_tail(d, oprsz, simd_maxsz(desc));
2062 }
2063 
2064 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2065                             void *venv, uint32_t desc)
2066 {
2067     CPUARMState *env = venv;
2068     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2069              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2070 }
2071 
2072 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2073                             void *venv, uint32_t desc)
2074 {
2075     CPUARMState *env = venv;
2076     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
2077              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2078 }
2079 
2080 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2081                                void *venv, uint32_t desc)
2082 {
2083     intptr_t i, oprsz = simd_oprsz(desc);
2084     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2085     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2086     CPUARMState *env = venv;
2087     float_status *status = &env->vfp.fp_status;
2088     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2089 
2090     for (i = 0; i < oprsz; i += sizeof(float32)) {
2091         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2092         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2093         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2094         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2095         float32 aa = *(float32 *)(va + H1_4(i));
2096 
2097         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2098     }
2099 }
2100 
2101 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2102                          uint32_t desc, bool fz16)
2103 {
2104     intptr_t i, oprsz = simd_oprsz(desc);
2105     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2106     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2107     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2108     int is_q = oprsz == 16;
2109     uint64_t n_4;
2110     float32 m_1;
2111 
2112     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2113     n_4 = load4_f16(vn, is_q, is_2);
2114 
2115     /* Negate all inputs for FMLSL at once.  */
2116     if (is_s) {
2117         n_4 ^= 0x8000800080008000ull;
2118     }
2119 
2120     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2121 
2122     for (i = 0; i < oprsz / 4; i++) {
2123         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2124         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2125     }
2126     clear_tail(d, oprsz, simd_maxsz(desc));
2127 }
2128 
2129 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2130                                 void *venv, uint32_t desc)
2131 {
2132     CPUARMState *env = venv;
2133     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2134                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2135 }
2136 
2137 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2138                                 void *venv, uint32_t desc)
2139 {
2140     CPUARMState *env = venv;
2141     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
2142                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2143 }
2144 
2145 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2146                                void *venv, uint32_t desc)
2147 {
2148     intptr_t i, j, oprsz = simd_oprsz(desc);
2149     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2150     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2151     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2152     CPUARMState *env = venv;
2153     float_status *status = &env->vfp.fp_status;
2154     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2155 
2156     for (i = 0; i < oprsz; i += 16) {
2157         float16 mm_16 = *(float16 *)(vm + i + idx);
2158         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2159 
2160         for (j = 0; j < 16; j += sizeof(float32)) {
2161             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2162             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2163             float32 aa = *(float32 *)(va + H1_4(i + j));
2164 
2165             *(float32 *)(vd + H1_4(i + j)) =
2166                 float32_muladd(nn, mm, aa, 0, status);
2167         }
2168     }
2169 }
2170 
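/*
 * Element-wise shift by a signed per-element count (SSHL/USHL): a
 * non-negative count shifts left, a negative count shifts right by its
 * absolute value.  Left shifts of esize or more give 0; for the unsigned
 * forms so do right shifts of esize or more, while the signed forms
 * clamp the right shift to esize - 1 so only sign bits remain.
 */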
2171 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2172 {
2173     intptr_t i, opr_sz = simd_oprsz(desc);
2174     int8_t *d = vd, *n = vn, *m = vm;
2175 
2176     for (i = 0; i < opr_sz; ++i) {
2177         int8_t mm = m[i];
2178         int8_t nn = n[i];
2179         int8_t res = 0;
2180         if (mm >= 0) {
2181             if (mm < 8) {
2182                 res = nn << mm;
2183             }
2184         } else {
2185             res = nn >> (mm > -8 ? -mm : 7);
2186         }
2187         d[i] = res;
2188     }
2189     clear_tail(d, opr_sz, simd_maxsz(desc));
2190 }
2191 
2192 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2193 {
2194     intptr_t i, opr_sz = simd_oprsz(desc);
2195     int16_t *d = vd, *n = vn, *m = vm;
2196 
2197     for (i = 0; i < opr_sz / 2; ++i) {
2198         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2199         int16_t nn = n[i];
2200         int16_t res = 0;
2201         if (mm >= 0) {
2202             if (mm < 16) {
2203                 res = nn << mm;
2204             }
2205         } else {
2206             res = nn >> (mm > -16 ? -mm : 15);
2207         }
2208         d[i] = res;
2209     }
2210     clear_tail(d, opr_sz, simd_maxsz(desc));
2211 }
2212 
2213 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2214 {
2215     intptr_t i, opr_sz = simd_oprsz(desc);
2216     uint8_t *d = vd, *n = vn, *m = vm;
2217 
2218     for (i = 0; i < opr_sz; ++i) {
2219         int8_t mm = m[i];
2220         uint8_t nn = n[i];
2221         uint8_t res = 0;
2222         if (mm >= 0) {
2223             if (mm < 8) {
2224                 res = nn << mm;
2225             }
2226         } else {
2227             if (mm > -8) {
2228                 res = nn >> -mm;
2229             }
2230         }
2231         d[i] = res;
2232     }
2233     clear_tail(d, opr_sz, simd_maxsz(desc));
2234 }
2235 
2236 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2237 {
2238     intptr_t i, opr_sz = simd_oprsz(desc);
2239     uint16_t *d = vd, *n = vn, *m = vm;
2240 
2241     for (i = 0; i < opr_sz / 2; ++i) {
2242         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2243         uint16_t nn = n[i];
2244         uint16_t res = 0;
2245         if (mm >= 0) {
2246             if (mm < 16) {
2247                 res = nn << mm;
2248             }
2249         } else {
2250             if (mm > -16) {
2251                 res = nn >> -mm;
2252             }
2253         }
2254         d[i] = res;
2255     }
2256     clear_tail(d, opr_sz, simd_maxsz(desc));
2257 }
2258 
2259 /*
2260  * 8x8->8 polynomial multiply.
2261  *
2262  * Polynomial multiplication is like integer multiplication except the
2263  * partial products are XORed, not added.
2264  *
2265  * TODO: expose this as a generic vector operation, as it is a common
2266  * crypto building block.
2267  */
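/*
 * Worked example: 0x03 * 0x03 = 0x05, since (x + 1)(x + 1) = x^2 + 1
 * once the two middle partial products cancel under XOR.
 */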
2268 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2269 {
2270     intptr_t i, opr_sz = simd_oprsz(desc);
2271     uint64_t *d = vd, *n = vn, *m = vm;
2272 
2273     for (i = 0; i < opr_sz / 8; ++i) {
2274         d[i] = clmul_8x8_low(n[i], m[i]);
2275     }
2276     clear_tail(d, opr_sz, simd_maxsz(desc));
2277 }
2278 
2279 /*
2280  * 64x64->128 polynomial multiply.
2281  * Because the lanes are not accessed in strict columns,
2282  * this probably cannot be turned into a generic helper.
2283  */
2284 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2285 {
2286     intptr_t i, opr_sz = simd_oprsz(desc);
2287     intptr_t hi = simd_data(desc);
2288     uint64_t *d = vd, *n = vn, *m = vm;
2289 
2290     for (i = 0; i < opr_sz / 8; i += 2) {
2291         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2292         d[i] = int128_getlo(r);
2293         d[i + 1] = int128_gethi(r);
2294     }
2295     clear_tail(d, opr_sz, simd_maxsz(desc));
2296 }
2297 
2298 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2299 {
2300     int hi = simd_data(desc);
2301     uint64_t *d = vd, *n = vn, *m = vm;
2302     uint64_t nn = n[hi], mm = m[hi];
2303 
2304     d[0] = clmul_8x4_packed(nn, mm);
2305     nn >>= 32;
2306     mm >>= 32;
2307     d[1] = clmul_8x4_packed(nn, mm);
2308 
2309     clear_tail(d, 16, simd_maxsz(desc));
2310 }
2311 
2312 #ifdef TARGET_AARCH64
2313 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2314 {
2315     int shift = simd_data(desc) * 8;
2316     intptr_t i, opr_sz = simd_oprsz(desc);
2317     uint64_t *d = vd, *n = vn, *m = vm;
2318 
2319     for (i = 0; i < opr_sz / 8; ++i) {
2320         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2321     }
2322 }
2323 
2324 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2325 {
2326     intptr_t sel = H4(simd_data(desc));
2327     intptr_t i, opr_sz = simd_oprsz(desc);
2328     uint32_t *n = vn, *m = vm;
2329     uint64_t *d = vd;
2330 
2331     for (i = 0; i < opr_sz / 8; ++i) {
2332         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2333     }
2334 }
2335 #endif
2336 
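/*
 * Compare each element against zero, producing an all-ones element for
 * true and zero for false: the 0/1 result of OP is negated into the
 * element type.
 */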
2337 #define DO_CMP0(NAME, TYPE, OP)                         \
2338 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2339 {                                                       \
2340     intptr_t i, opr_sz = simd_oprsz(desc);              \
2341     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2342         TYPE nn = *(TYPE *)(vn + i);                    \
2343         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2344     }                                                   \
2345     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2346 }
2347 
2348 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2349 DO_CMP0(gvec_clt0_b, int8_t, <)
2350 DO_CMP0(gvec_cle0_b, int8_t, <=)
2351 DO_CMP0(gvec_cgt0_b, int8_t, >)
2352 DO_CMP0(gvec_cge0_b, int8_t, >=)
2353 
2354 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2355 DO_CMP0(gvec_clt0_h, int16_t, <)
2356 DO_CMP0(gvec_cle0_h, int16_t, <=)
2357 DO_CMP0(gvec_cgt0_h, int16_t, >)
2358 DO_CMP0(gvec_cge0_h, int16_t, >=)
2359 
2360 #undef DO_CMP0
2361 
2362 #define DO_ABD(NAME, TYPE)                                      \
2363 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2364 {                                                               \
2365     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2366     TYPE *d = vd, *n = vn, *m = vm;                             \
2367                                                                 \
2368     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2369         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2370     }                                                           \
2371     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2372 }
2373 
2374 DO_ABD(gvec_sabd_b, int8_t)
2375 DO_ABD(gvec_sabd_h, int16_t)
2376 DO_ABD(gvec_sabd_s, int32_t)
2377 DO_ABD(gvec_sabd_d, int64_t)
2378 
2379 DO_ABD(gvec_uabd_b, uint8_t)
2380 DO_ABD(gvec_uabd_h, uint16_t)
2381 DO_ABD(gvec_uabd_s, uint32_t)
2382 DO_ABD(gvec_uabd_d, uint64_t)
2383 
2384 #undef DO_ABD
2385 
2386 #define DO_ABA(NAME, TYPE)                                      \
2387 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2388 {                                                               \
2389     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2390     TYPE *d = vd, *n = vn, *m = vm;                             \
2391                                                                 \
2392     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2393         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2394     }                                                           \
2395     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2396 }
2397 
2398 DO_ABA(gvec_saba_b, int8_t)
2399 DO_ABA(gvec_saba_h, int16_t)
2400 DO_ABA(gvec_saba_s, int32_t)
2401 DO_ABA(gvec_saba_d, int64_t)
2402 
2403 DO_ABA(gvec_uaba_b, uint8_t)
2404 DO_ABA(gvec_uaba_h, uint16_t)
2405 DO_ABA(gvec_uaba_s, uint32_t)
2406 DO_ABA(gvec_uaba_d, uint64_t)
2407 
2408 #undef DO_ABA
2409 
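/*
 * Pairwise operations: the low half of the result is built from adjacent
 * pairs of Vn, the high half from adjacent pairs of Vm.  Vm is copied to
 * a scratch register first when it aliases Vd, since the first loop
 * would otherwise overwrite it.
 */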
2410 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2411 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
2412 {                                                                          \
2413     ARMVectorReg scratch;                                                  \
2414     intptr_t oprsz = simd_oprsz(desc);                                     \
2415     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2416     TYPE *d = vd, *n = vn, *m = vm;                                        \
2417     if (unlikely(d == m)) {                                                \
2418         m = memcpy(&scratch, m, oprsz);                                    \
2419     }                                                                      \
2420     for (intptr_t i = 0; i < half; ++i) {                                  \
2421         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2422     }                                                                      \
2423     for (intptr_t i = 0; i < half; ++i) {                                  \
2424         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2425     }                                                                      \
2426     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2427 }
2428 
2429 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2430 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2431 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2432 
2433 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2434 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2435 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2436 
2437 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2438 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2439 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2440 
2441 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2442 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2443 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2444 
2445 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2446 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2447 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2448 
2449 #undef DO_3OP_PAIR
2450 
2451 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2452 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2453 {                                                               \
2454     ARMVectorReg scratch;                                       \
2455     intptr_t oprsz = simd_oprsz(desc);                          \
2456     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2457     TYPE *d = vd, *n = vn, *m = vm;                             \
2458     if (unlikely(d == m)) {                                     \
2459         m = memcpy(&scratch, m, oprsz);                         \
2460     }                                                           \
2461     for (intptr_t i = 0; i < half; ++i) {                       \
2462         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2463     }                                                           \
2464     for (intptr_t i = 0; i < half; ++i) {                       \
2465         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2466     }                                                           \
2467     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2468 }
2469 
2470 #define ADD(A, B) (A + B)
2471 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2472 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2473 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2474 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2475 #undef  ADD
2476 
2477 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2478 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2479 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2480 
2481 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2482 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2483 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2484 
2485 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2486 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2487 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2488 
2489 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2490 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2491 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2492 
2493 #undef DO_3OP_PAIR
2494 
2495 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2496     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2497     {                                                                   \
2498         intptr_t i, oprsz = simd_oprsz(desc);                           \
2499         int shift = simd_data(desc);                                    \
2500         TYPE *d = vd, *n = vn;                                          \
2501         float_status *fpst = stat;                                      \
2502         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2503             d[i] = FUNC(n[i], shift, fpst);                             \
2504         }                                                               \
2505         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2506     }
2507 
2508 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2509 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2510 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2511 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2512 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2513 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2514 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2515 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2516 
2517 #undef DO_VCVT_FIXED
2518 
2519 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2520     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2521     {                                                                   \
2522         float_status *fpst = stat;                                      \
2523         intptr_t i, oprsz = simd_oprsz(desc);                           \
2524         uint32_t rmode = simd_data(desc);                               \
2525         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2526         TYPE *d = vd, *n = vn;                                          \
2527         set_float_rounding_mode(rmode, fpst);                           \
2528         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2529             d[i] = FUNC(n[i], 0, fpst);                                 \
2530         }                                                               \
2531         set_float_rounding_mode(prev_rmode, fpst);                      \
2532         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2533     }
2534 
2535 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2536 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2537 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2538 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2539 
2540 #undef DO_VCVT_RMODE
2541 
2542 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2543     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2544     {                                                                   \
2545         float_status *fpst = stat;                                      \
2546         intptr_t i, oprsz = simd_oprsz(desc);                           \
2547         uint32_t rmode = simd_data(desc);                               \
2548         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2549         TYPE *d = vd, *n = vn;                                          \
2550         set_float_rounding_mode(rmode, fpst);                           \
2551         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2552             d[i] = FUNC(n[i], fpst);                                    \
2553         }                                                               \
2554         set_float_rounding_mode(prev_rmode, fpst);                      \
2555         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2556     }
2557 
2558 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2559 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2560 
2561 #undef DO_VRINT_RMODE
2562 
2563 #ifdef TARGET_AARCH64
2564 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2565 {
2566     const uint8_t *indices = vm;
2567     CPUARMState *env = venv;
2568     size_t oprsz = simd_oprsz(desc);
2569     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2570     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2571     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2572     union {
2573         uint8_t b[16];
2574         uint64_t d[2];
2575     } result;
2576 
2577     /*
2578      * We must construct the final result in a temp, lest the output
2579      * overlap the input table.  For TBL, begin with zero; for TBX,
2580      * begin with the original register contents.  Note that we always
2581      * copy 16 bytes here to avoid an extra branch; clearing the high
2582      * bits of the register for oprsz == 8 is handled below.
2583      */
2584     if (is_tbx) {
2585         memcpy(&result, vd, 16);
2586     } else {
2587         memset(&result, 0, 16);
2588     }
2589 
2590     for (size_t i = 0; i < oprsz; ++i) {
2591         uint32_t index = indices[H1(i)];
2592 
2593         if (index < table_len) {
2594             /*
2595              * Convert index (a byte offset into the virtual table
2596              * which is a series of 128-bit vectors concatenated)
2597              * into the correct register element, bearing in mind
2598              * that the table can wrap around from V31 to V0.
2599              */
2600             const uint8_t *table = (const uint8_t *)
2601                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2602             result.b[H1(i)] = table[H1(index % 16)];
2603         }
2604     }
2605 
2606     memcpy(vd, &result, 16);
2607     clear_tail(vd, oprsz, simd_maxsz(desc));
2608 }
2609 #endif
2610 
2611 /*
2612  * NxN -> N highpart multiply
2613  *
2614  * TODO: expose this as a generic vector operation.
2615  */
2616 
2617 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2618 {
2619     intptr_t i, opr_sz = simd_oprsz(desc);
2620     int8_t *d = vd, *n = vn, *m = vm;
2621 
2622     for (i = 0; i < opr_sz; ++i) {
2623         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2624     }
2625     clear_tail(d, opr_sz, simd_maxsz(desc));
2626 }
2627 
2628 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2629 {
2630     intptr_t i, opr_sz = simd_oprsz(desc);
2631     int16_t *d = vd, *n = vn, *m = vm;
2632 
2633     for (i = 0; i < opr_sz / 2; ++i) {
2634         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2635     }
2636     clear_tail(d, opr_sz, simd_maxsz(desc));
2637 }
2638 
2639 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2640 {
2641     intptr_t i, opr_sz = simd_oprsz(desc);
2642     int32_t *d = vd, *n = vn, *m = vm;
2643 
2644     for (i = 0; i < opr_sz / 4; ++i) {
2645         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2646     }
2647     clear_tail(d, opr_sz, simd_maxsz(desc));
2648 }
2649 
2650 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2651 {
2652     intptr_t i, opr_sz = simd_oprsz(desc);
2653     uint64_t *d = vd, *n = vn, *m = vm;
2654     uint64_t discard;
2655 
2656     for (i = 0; i < opr_sz / 8; ++i) {
2657         muls64(&discard, &d[i], n[i], m[i]);
2658     }
2659     clear_tail(d, opr_sz, simd_maxsz(desc));
2660 }
2661 
2662 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2663 {
2664     intptr_t i, opr_sz = simd_oprsz(desc);
2665     uint8_t *d = vd, *n = vn, *m = vm;
2666 
2667     for (i = 0; i < opr_sz; ++i) {
2668         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2669     }
2670     clear_tail(d, opr_sz, simd_maxsz(desc));
2671 }
2672 
2673 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2674 {
2675     intptr_t i, opr_sz = simd_oprsz(desc);
2676     uint16_t *d = vd, *n = vn, *m = vm;
2677 
2678     for (i = 0; i < opr_sz / 2; ++i) {
2679         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2680     }
2681     clear_tail(d, opr_sz, simd_maxsz(desc));
2682 }
2683 
2684 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2685 {
2686     intptr_t i, opr_sz = simd_oprsz(desc);
2687     uint32_t *d = vd, *n = vn, *m = vm;
2688 
2689     for (i = 0; i < opr_sz / 4; ++i) {
2690         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2691     }
2692     clear_tail(d, opr_sz, simd_maxsz(desc));
2693 }
2694 
2695 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2696 {
2697     intptr_t i, opr_sz = simd_oprsz(desc);
2698     uint64_t *d = vd, *n = vn, *m = vm;
2699     uint64_t discard;
2700 
2701     for (i = 0; i < opr_sz / 8; ++i) {
2702         mulu64(&discard, &d[i], n[i], m[i]);
2703     }
2704     clear_tail(d, opr_sz, simd_maxsz(desc));
2705 }
2706 
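/* XAR: exclusive-OR, then rotate each 64-bit lane right by simd_data(desc). */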
2707 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2708 {
2709     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2710     int shr = simd_data(desc);
2711     uint64_t *d = vd, *n = vn, *m = vm;
2712 
2713     for (i = 0; i < opr_sz; ++i) {
2714         d[i] = ror64(n[i] ^ m[i], shr);
2715     }
2716     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2717 }
2718 
2719 /*
2720  * Integer matrix-multiply accumulate
2721  */
2722 
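/*
 * Each 128-bit segment holds a 2x2 matrix of int32 accumulators, and the
 * corresponding segments of Vn and Vm each hold a 2x8 matrix of bytes.
 * Result element [r][c] accumulates the dot product of row r of Vn with
 * row c of Vm (i.e. Vm is implicitly transposed): each do_*mmla_b
 * callback below computes one such 8-element dot product, and do_mmla_b
 * assembles the four results per segment.
 */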
2723 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2724 {
2725     int8_t *n = vn, *m = vm;
2726 
2727     for (intptr_t k = 0; k < 8; ++k) {
2728         sum += n[H1(k)] * m[H1(k)];
2729     }
2730     return sum;
2731 }
2732 
2733 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2734 {
2735     uint8_t *n = vn, *m = vm;
2736 
2737     for (intptr_t k = 0; k < 8; ++k) {
2738         sum += n[H1(k)] * m[H1(k)];
2739     }
2740     return sum;
2741 }
2742 
2743 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2744 {
2745     uint8_t *n = vn;
2746     int8_t *m = vm;
2747 
2748     for (intptr_t k = 0; k < 8; ++k) {
2749         sum += n[H1(k)] * m[H1(k)];
2750     }
2751     return sum;
2752 }
2753 
2754 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2755                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2756 {
2757     intptr_t seg, opr_sz = simd_oprsz(desc);
2758 
2759     for (seg = 0; seg < opr_sz; seg += 16) {
2760         uint32_t *d = vd + seg;
2761         uint32_t *a = va + seg;
2762         uint32_t sum0, sum1, sum2, sum3;
2763 
2764         /*
2765          * Process the entire segment at once, writing back the
2766          * results only after we've consumed all of the inputs.
2767          *
2768          * Key to indices by column:
2769          *          i   j                  i             j
2770          */
2771         sum0 = a[H4(0 + 0)];
2772         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2773         sum1 = a[H4(0 + 1)];
2774         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2775         sum2 = a[H4(2 + 0)];
2776         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2777         sum3 = a[H4(2 + 1)];
2778         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2779 
2780         d[H4(0)] = sum0;
2781         d[H4(1)] = sum1;
2782         d[H4(2)] = sum2;
2783         d[H4(3)] = sum3;
2784     }
2785     clear_tail(vd, opr_sz, simd_maxsz(desc));
2786 }
2787 
2788 #define DO_MMLA_B(NAME, INNER) \
2789     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2790     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2791 
2792 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2793 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2794 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2795 
2796 /*
2797  * BFloat16 Dot Product
2798  */
2799 
2800 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2801 {
2802     /*
2803      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2804      * For EBF = 0, we ignore the FPCR bits which determine rounding
2805      * mode and denormal-flushing, and we do unfused multiplies and
2806      * additions with intermediate rounding of all products and sums.
2807      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2808      * and we perform a fused two-way sum-of-products without intermediate
2809      * rounding of the products.
2810      * In either case, we don't set fp exception flags.
2811      *
2812      * EBF is AArch64 only, so even if it's set in the FPCR it has
2813      * no effect on AArch32 instructions.
2814      */
2815     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2816     *statusp = (float_status){
2817         .tininess_before_rounding = float_tininess_before_rounding,
2818         .float_rounding_mode = float_round_to_odd_inf,
2819         .flush_to_zero = true,
2820         .flush_inputs_to_zero = true,
2821         .default_nan_mode = true,
2822     };
2823 
2824     if (ebf) {
2825         float_status *fpst = &env->vfp.fp_status;
2826         set_flush_to_zero(get_flush_to_zero(fpst), statusp);
2827         set_flush_inputs_to_zero(get_flush_inputs_to_zero(fpst), statusp);
2828         set_float_rounding_mode(get_float_rounding_mode(fpst), statusp);
2829 
2830         /* EBF=1 needs to do a step with round-to-odd semantics */
2831         *oddstatusp = *statusp;
2832         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2833     }
2834 
2835     return ebf;
2836 }
2837 
2838 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2839 {
2840     float32 t1, t2;
2841 
2842     /*
2843      * Extract each BFloat16 from the element pair, and shift
2844      * them such that they become float32.
2845      */
2846     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2847     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2848     t1 = float32_add(t1, t2, fpst);
2849     t1 = float32_add(sum, t1, fpst);
2850 
2851     return t1;
2852 }
2853 
2854 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2855                      float_status *fpst, float_status *fpst_odd)
2856 {
2857     /*
2858      * Compare f16_dotadd() in sme_helper.c, but here we have
2859      * bfloat16 inputs. In particular that means that we do not
2860      * want the FPCR.FZ16 flush semantics, so we use the normal
2861      * float_status for the input handling here.
2862      */
2863     float64 e1r = float32_to_float64(e1 << 16, fpst);
2864     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2865     float64 e2r = float32_to_float64(e2 << 16, fpst);
2866     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2867     float64 t64;
2868     float32 t32;
2869 
2870     /*
2871      * The ARM pseudocode function FPDot performs both multiplies
2872      * and the add with a single rounding operation.  Emulate this
2873      * by performing the first multiply in round-to-odd, then doing
2874      * the second multiply as fused multiply-add, and rounding to
2875      * float32 all in one step.
2876      */
2877     t64 = float64_mul(e1r, e2r, fpst_odd);
2878     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2879 
2880     /* This conversion is exact, because we've already rounded. */
2881     t32 = float64_to_float32(t64, fpst);
2882 
2883     /* The final accumulation step is not fused. */
2884     return float32_add(sum, t32, fpst);
2885 }
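
/*
 * Editor's illustrative note (not part of the original file): "round
 * to odd" keeps a sticky bit in the result's least significant bit,
 * so information about discarded low bits survives into the later,
 * final rounding step used above.  A minimal integer analogue of the
 * mechanism, discarding 'shift' low bits (hypothetical helper,
 * requires shift < 64):
 */
static inline uint64_t round_to_odd_sketch(uint64_t x, unsigned shift)
{
    uint64_t kept = x >> shift;
    /* Force the low bit on if any discarded bit was non-zero. */
    return kept | ((x & ((1ULL << shift) - 1)) != 0);
}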
2886 
2887 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2888                         CPUARMState *env, uint32_t desc)
2889 {
2890     intptr_t i, opr_sz = simd_oprsz(desc);
2891     float32 *d = vd, *a = va;
2892     uint32_t *n = vn, *m = vm;
2893     float_status fpst, fpst_odd;
2894 
2895     if (is_ebf(env, &fpst, &fpst_odd)) {
2896         for (i = 0; i < opr_sz / 4; ++i) {
2897             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2898         }
2899     } else {
2900         for (i = 0; i < opr_sz / 4; ++i) {
2901             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2902         }
2903     }
2904     clear_tail(d, opr_sz, simd_maxsz(desc));
2905 }
2906 
2907 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2908                             void *va, CPUARMState *env, uint32_t desc)
2909 {
2910     intptr_t i, j, opr_sz = simd_oprsz(desc);
2911     intptr_t index = simd_data(desc);
2912     intptr_t elements = opr_sz / 4;
2913     intptr_t eltspersegment = MIN(16 / 4, elements);
2914     float32 *d = vd, *a = va;
2915     uint32_t *n = vn, *m = vm;
2916     float_status fpst, fpst_odd;
2917 
2918     if (is_ebf(env, &fpst, &fpst_odd)) {
2919         for (i = 0; i < elements; i += eltspersegment) {
2920             uint32_t m_idx = m[i + H4(index)];
2921 
2922             for (j = i; j < i + eltspersegment; j++) {
2923                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
2924             }
2925         }
2926     } else {
2927         for (i = 0; i < elements; i += eltspersegment) {
2928             uint32_t m_idx = m[i + H4(index)];
2929 
2930             for (j = i; j < i + eltspersegment; j++) {
2931                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
2932             }
2933         }
2934     }
2935     clear_tail(d, opr_sz, simd_maxsz(desc));
2936 }
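
/*
 * Editor's illustrative note (not part of the original file): in this
 * by-element form each 128-bit segment (four 32-bit bfloat16 pairs)
 * reuses a single pair from vm, chosen by 'index'.  Hypothetical
 * helper computing which vm lane element j reads, H4() omitted:
 */
static inline intptr_t bfdot_idx_m_lane_sketch(intptr_t j, intptr_t index)
{
    return (j & ~(intptr_t)3) + index;  /* segment base + index */
}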
2937 
2938 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
2939                          CPUARMState *env, uint32_t desc)
2940 {
2941     intptr_t s, opr_sz = simd_oprsz(desc);
2942     float32 *d = vd, *a = va;
2943     uint32_t *n = vn, *m = vm;
2944     float_status fpst, fpst_odd;
2945 
2946     if (is_ebf(env, &fpst, &fpst_odd)) {
2947         for (s = 0; s < opr_sz / 4; s += 4) {
2948             float32 sum00, sum01, sum10, sum11;
2949 
2950             /*
2951              * Process the entire segment at once, writing back the
2952              * results only after we've consumed all of the inputs.
2953              *
2954              * Key to indices by column:
2955              *               i   j               i   k             j   k
2956              */
2957             sum00 = a[s + H4(0 + 0)];
2958             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2959             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2960 
2961             sum01 = a[s + H4(0 + 1)];
2962             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2963             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2964 
2965             sum10 = a[s + H4(2 + 0)];
2966             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2967             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2968 
2969             sum11 = a[s + H4(2 + 1)];
2970             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2971             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2972 
2973             d[s + H4(0 + 0)] = sum00;
2974             d[s + H4(0 + 1)] = sum01;
2975             d[s + H4(2 + 0)] = sum10;
2976             d[s + H4(2 + 1)] = sum11;
2977         }
2978     } else {
2979         for (s = 0; s < opr_sz / 4; s += 4) {
2980             float32 sum00, sum01, sum10, sum11;
2981 
2982             /*
2983              * Process the entire segment at once, writing back the
2984              * results only after we've consumed all of the inputs.
2985              *
2986              * Key to indices by column:
2987              *               i   j           i   k             j   k
2988              */
2989             sum00 = a[s + H4(0 + 0)];
2990             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
2991             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
2992 
2993             sum01 = a[s + H4(0 + 1)];
2994             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
2995             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
2996 
2997             sum10 = a[s + H4(2 + 0)];
2998             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
2999             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3000 
3001             sum11 = a[s + H4(2 + 1)];
3002             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3003             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3004 
3005             d[s + H4(0 + 0)] = sum00;
3006             d[s + H4(0 + 1)] = sum01;
3007             d[s + H4(2 + 0)] = sum10;
3008             d[s + H4(2 + 1)] = sum11;
3009         }
3010     }
3011     clear_tail(d, opr_sz, simd_maxsz(desc));
3012 }
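
/*
 * Editor's illustrative sketch (not part of the original file): each
 * of the four results above is one cell of a 2x2 matrix product, i.e.
 * a four-way bfloat16 dot product of one row of n (two 32-bit
 * elements, four bfloat16 values) with the corresponding row of m,
 * built from two pairwise bfdotadd() steps exactly as in the non-EBF
 * branch.  Hypothetical helper, H4() omitted:
 */
static inline float32 bfmmla_cell_sketch(float32 acc, const uint32_t n_row[2],
                                         const uint32_t m_row[2],
                                         float_status *fpst)
{
    acc = bfdotadd(acc, n_row[0], m_row[0], fpst);
    acc = bfdotadd(acc, n_row[1], m_row[1], fpst);
    return acc;
}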
3013 
3014 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3015                          void *stat, uint32_t desc)
3016 {
3017     intptr_t i, opr_sz = simd_oprsz(desc);
3018     intptr_t sel = simd_data(desc);
3019     float32 *d = vd, *a = va;
3020     bfloat16 *n = vn, *m = vm;
3021 
3022     for (i = 0; i < opr_sz / 4; ++i) {
3023         float32 nn = n[H2(i * 2 + sel)] << 16;
3024         float32 mm = m[H2(i * 2 + sel)] << 16;
3025         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3026     }
3027     clear_tail(d, opr_sz, simd_maxsz(desc));
3028 }
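
/*
 * Editor's illustrative note (not part of the original file): output
 * lane i consumes the bfloat16 at position 2*i+sel of each source, so
 * sel = 0 selects the even (bottom) elements and sel = 1 the odd (top)
 * elements, matching the bottom/top variants of the instruction.
 * Hypothetical helper, H2() omitted:
 */
static inline intptr_t bfmlal_src_lane_sketch(intptr_t i, intptr_t sel)
{
    return 2 * i + sel;
}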
3029 
3030 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3031                              void *va, void *stat, uint32_t desc)
3032 {
3033     intptr_t i, j, opr_sz = simd_oprsz(desc);
3034     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3035     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3036     intptr_t elements = opr_sz / 4;
3037     intptr_t eltspersegment = MIN(16 / 4, elements);
3038     float32 *d = vd, *a = va;
3039     bfloat16 *n = vn, *m = vm;
3040 
3041     for (i = 0; i < elements; i += eltspersegment) {
3042         float32 m_idx = m[H2(2 * i + index)] << 16;
3043 
3044         for (j = i; j < i + eltspersegment; j++) {
3045             float32 n_j = n[H2(2 * j + sel)] << 16;
3046             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3047         }
3048     }
3049     clear_tail(d, opr_sz, simd_maxsz(desc));
3050 }
3051 
3052 #define DO_CLAMP(NAME, TYPE) \
3053 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3054 {                                                                       \
3055     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3056     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3057         TYPE aa = *(TYPE *)(a + i);                                     \
3058         TYPE nn = *(TYPE *)(n + i);                                     \
3059         TYPE mm = *(TYPE *)(m + i);                                     \
3060         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3061         *(TYPE *)(d + i) = dd;                                          \
3062     }                                                                   \
3063     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3064 }
3065 
3066 DO_CLAMP(gvec_sclamp_b, int8_t)
3067 DO_CLAMP(gvec_sclamp_h, int16_t)
3068 DO_CLAMP(gvec_sclamp_s, int32_t)
3069 DO_CLAMP(gvec_sclamp_d, int64_t)
3070 
3071 DO_CLAMP(gvec_uclamp_b, uint8_t)
3072 DO_CLAMP(gvec_uclamp_h, uint16_t)
3073 DO_CLAMP(gvec_uclamp_s, uint32_t)
3074 DO_CLAMP(gvec_uclamp_d, uint64_t)
3075
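/*
 * Editor's illustrative sketch (not part of the original file): each
 * DO_CLAMP instantiation expands to a helper computing, per element,
 * MIN(MAX(a, n), m), i.e. the accumulator operand clamped to the
 * range [n, m].  Equivalent loop for the int8_t case (hypothetical
 * name, tail clearing omitted):
 */
static inline void sclamp_b_sketch(int8_t *d, const int8_t *n, const int8_t *m,
                                   const int8_t *a, intptr_t elements)
{
    for (intptr_t i = 0; i < elements; i++) {
        int8_t t = a[i] > n[i] ? a[i] : n[i];   /* MAX(a, n) */
        d[i] = t < m[i] ? t : m[i];             /* MIN(..., m) */
    }
}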