xref: /openbmc/qemu/target/arm/tcg/vec_helper.c (revision cb1c77fe)
/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
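
/*
 * Illustrative example: predicate byte 0x05 has bits 0 and 2 set, so
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff, an all-ones mask in
 * byte lanes 0 and 2 and zeros elsewhere.
 */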

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
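
/*
 * Example: [0x11] covers half-word lanes 0 and 2.  Only indices whose
 * 0xaa bits are clear are populated, since the odd predicate bits are
 * ignored for half-word elements; callers are expected to mask the
 * predicate byte accordingly.
 */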

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
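
/*
 * Worked example, for illustration only:
 *   do_sqrdmlah_b(0x40, 0x40, 0, false, true):
 *     ret = 64 * 64 = 4096; ret += (0 << 7) + (1 << 6) = 4160;
 *     ret >>= 7 gives 32, which fits in int8_t, so 0x20 is returned.
 *   do_sqrdmlah_b(INT8_MIN, INT8_MIN, 0, false, true):
 *     ret = 16384 + 64 = 16448; ret >>= 7 gives 128, which does not
 *     fit in int8_t, so the result saturates to INT8_MAX.
 */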

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
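
/*
 * Unlike the 8-bit helper above, saturation is reported through *sat:
 * the Neon helpers below point this at the cumulative QC flag
 * (env->vfp.qc[0]), while the SVE2 helpers point it at a scratch
 * variable and discard the result.
 */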

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);
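
    /*
     * The 128-bit value fits in 64 bits iff the high limb is the
     * sign-extension of the low limb; otherwise saturate toward the
     * sign of the high limb.
     */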
    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
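
/*
 * As an illustration, DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
 * expands to a helper in which each 32-bit lane computes
 *     d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *                 + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 * with the byte products sign-extended to 32 bits before summing.
 */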

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + 4;                                                   \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
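
/*
 * In DO_DOT_IDX above, 'index' selects one group of four TYPEM elements
 * within each 16-byte segment of m.  E.g. for gvec_sdot_idx_b with
 * index 1, destination lanes 0-3 all use bytes 4-7 of the first segment
 * of m, lanes 4-7 use bytes 4-7 of the second segment, and so on; the
 * inner loop reuses m0..m3 across a whole segment.
 */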

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
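
/*
 * Sketch of the lane arithmetic above, using complex pair (re, im):
 * with neg_imag set (rotate 90) each pair computes
 *     d.re = n.re - m.im,  d.im = n.im + m.re
 * and with neg_real set (rotate 270)
 *     d.re = n.re + m.im,  d.im = n.im - m.re
 * where negation is performed by flipping the sign bit.
 */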

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
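
/*
 * The desc bits (flip, neg_imag) select among the four FCMLA rotations.
 * With flip == 0 and no negation, the loop above computes
 *     d.re = n.re * m.re + a.re,  d.im = n.re * m.im + a.im
 * i.e. rotation #0; a full complex multiply-accumulate is built from a
 * pair of FCMLA operations whose rotations differ by 90 degrees.
 */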

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
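/*
 * E.g. a true 16-bit comparison yields 0xffff and a false one 0x0000,
 * so the result can be used directly as a lane mask.
 */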
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)
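
/*
 * The REV wrappers swap the operand order, so e.g. float16_clt0(op)
 * expands to float16_cgt(float16_zero, op), i.e. "0 > op", giving the
 * op < 0 test required for FCMLT (zero).
 */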

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}
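
/*
 * The (3 - op1 * op2) / 2 sequence above rounds after every step, as
 * AArch32 requires; the AArch64 instructions instead use the fused
 * helper_rsqrtsf_* routines seen below.
 */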

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
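
/*
 * The fused and non-fused forms can round differently.  A classic
 * illustration: if d[i] holds the exact negation of the rounded product
 * of n[i] and m[i], the fused gvec_vfma_* returns the rounding error of
 * that product, while the non-fused gvec_fmla_* returns zero.
 */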

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
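
/*
 * For example, with a 32-byte SVE vector and 32-bit elements there are
 * two 16-byte segments of four elements each; DO_MUL_IDX below with
 * idx == 1 multiplies lanes 0-3 by m[1] and lanes 4-7 by m[5].
 */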

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = n[i + j] * mm;                                      \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = a[i + j] OP n[i + j] * mm;                          \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon.  NB that unlike
 * the fused ops below, these accumulate both from and into Vd.
 */
1427 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1428 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1429 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1430 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1431 
1432 #undef DO_FMUL_IDX
1433 
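/*
 * Fused multiply-add with one indexed operand.  The low bit of
 * simd_data selects FMLS: shifted into the sign-bit position it
 * becomes an XOR mask (e.g. 0x80000000 for float32), so
 * n[i] ^ op1_neg negates the multiplicand before the fused
 * multiply-add, as the architecture specifies for FMLS.
 */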
1434 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1435 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1436                   void *stat, uint32_t desc)                               \
1437 {                                                                          \
1438     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1439     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1440     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1441     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1442     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1443     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1444     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1445         TYPE mm = m[H(i + idx)];                                           \
1446         for (j = 0; j < segment; j++) {                                    \
1447             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1448                                      mm, a[i + j], 0, stat);               \
1449         }                                                                  \
1450     }                                                                      \
1451     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1452 }
1453 
1454 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1455 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1456 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1457 
1458 #undef DO_FMLA_IDX
1459 
1460 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1461 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1462 {                                                                          \
1463     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1464     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1465     bool q = false;                                                        \
1466     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1467         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1468         if (dd < MIN) {                                                    \
1469             dd = MIN;                                                      \
1470             q = true;                                                      \
1471         } else if (dd > MAX) {                                             \
1472             dd = MAX;                                                      \
1473             q = true;                                                      \
1474         }                                                                  \
1475         d[i] = dd;                                                         \
1476     }                                                                      \
1477     if (q) {                                                               \
1478         uint32_t *qc = vq;                                                 \
1479         qc[0] = 1;                                                         \
1480     }                                                                      \
1481     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1482 }
1483 
1484 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1485 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1486 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1487 
1488 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1489 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1490 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1491 
1492 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1493 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1494 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1495 
1496 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1497 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1498 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1499 
1500 #undef DO_SAT
1501 
1502 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1503                           void *vm, uint32_t desc)
1504 {
1505     intptr_t i, oprsz = simd_oprsz(desc);
1506     uint64_t *d = vd, *n = vn, *m = vm;
1507     bool q = false;
1508 
1509     for (i = 0; i < oprsz / 8; i++) {
1510         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1511         if (dd < nn) {
1512             dd = UINT64_MAX;
1513             q = true;
1514         }
1515         d[i] = dd;
1516     }
1517     if (q) {
1518         uint32_t *qc = vq;
1519         qc[0] = 1;
1520     }
1521     clear_tail(d, oprsz, simd_maxsz(desc));
1522 }
1523 
1524 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1525                           void *vm, uint32_t desc)
1526 {
1527     intptr_t i, oprsz = simd_oprsz(desc);
1528     uint64_t *d = vd, *n = vn, *m = vm;
1529     bool q = false;
1530 
1531     for (i = 0; i < oprsz / 8; i++) {
1532         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1533         if (nn < mm) {
1534             dd = 0;
1535             q = true;
1536         }
1537         d[i] = dd;
1538     }
1539     if (q) {
1540         uint32_t *qc = vq;
1541         qc[0] = 1;
1542     }
1543     clear_tail(d, oprsz, simd_maxsz(desc));
1544 }
1545 
1546 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1547                           void *vm, uint32_t desc)
1548 {
1549     intptr_t i, oprsz = simd_oprsz(desc);
1550     int64_t *d = vd, *n = vn, *m = vm;
1551     bool q = false;
1552 
1553     for (i = 0; i < oprsz / 8; i++) {
1554         int64_t nn = n[i], mm = m[i], dd = nn + mm;
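        /*
         * Signed overflow occurred iff the operands have the same
         * sign and the sum's sign differs: sign bit clear in
         * (nn ^ mm) but set in (dd ^ nn).  Saturate toward the sign
         * of nn: (nn >> 63) ^ ~INT64_MIN is INT64_MAX for nn >= 0
         * and INT64_MIN for nn < 0.
         */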
1555         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1556             dd = (nn >> 63) ^ ~INT64_MIN;
1557             q = true;
1558         }
1559         d[i] = dd;
1560     }
1561     if (q) {
1562         uint32_t *qc = vq;
1563         qc[0] = 1;
1564     }
1565     clear_tail(d, oprsz, simd_maxsz(desc));
1566 }
1567 
1568 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1569                           void *vm, uint32_t desc)
1570 {
1571     intptr_t i, oprsz = simd_oprsz(desc);
1572     int64_t *d = vd, *n = vn, *m = vm;
1573     bool q = false;
1574 
1575     for (i = 0; i < oprsz / 8; i++) {
1576         int64_t nn = n[i], mm = m[i], dd = nn - mm;
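        /*
         * For subtraction, overflow occurred iff the operands differ
         * in sign and the result's sign differs from nn; saturate
         * toward the sign of nn exactly as for sqadd above.
         */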
1577         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1578             dd = (nn >> 63) ^ ~INT64_MIN;
1579             q = true;
1580         }
1581         d[i] = dd;
1582     }
1583     if (q) {
1584         uint32_t *qc = vq;
1585         qc[0] = 1;
1586     }
1587     clear_tail(d, oprsz, simd_maxsz(desc));
1588 }
1589 
1590 
1591 #define DO_SRA(NAME, TYPE)                              \
1592 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1593 {                                                       \
1594     intptr_t i, oprsz = simd_oprsz(desc);               \
1595     int shift = simd_data(desc);                        \
1596     TYPE *d = vd, *n = vn;                              \
1597     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1598         d[i] += n[i] >> shift;                          \
1599     }                                                   \
1600     clear_tail(d, oprsz, simd_maxsz(desc));             \
1601 }
1602 
1603 DO_SRA(gvec_ssra_b, int8_t)
1604 DO_SRA(gvec_ssra_h, int16_t)
1605 DO_SRA(gvec_ssra_s, int32_t)
1606 DO_SRA(gvec_ssra_d, int64_t)
1607 
1608 DO_SRA(gvec_usra_b, uint8_t)
1609 DO_SRA(gvec_usra_h, uint16_t)
1610 DO_SRA(gvec_usra_s, uint32_t)
1611 DO_SRA(gvec_usra_d, uint64_t)
1612 
1613 #undef DO_SRA
1614 
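/*
 * Rounding shift right: tmp holds n >> (shift - 1), so bit 0 of tmp
 * is the last bit shifted out.  (tmp >> 1) + (tmp & 1) is thus
 * equivalent to (n + (1 << (shift - 1))) >> shift, but without an
 * intermediate addition that could overflow the element type;
 * e.g. n = 7, shift = 2: tmp = 3, result = 1 + 1 = 2.  The same
 * rounding is reused by the accumulating DO_RSRA below.
 */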
1615 #define DO_RSHR(NAME, TYPE)                             \
1616 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1617 {                                                       \
1618     intptr_t i, oprsz = simd_oprsz(desc);               \
1619     int shift = simd_data(desc);                        \
1620     TYPE *d = vd, *n = vn;                              \
1621     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1622         TYPE tmp = n[i] >> (shift - 1);                 \
1623         d[i] = (tmp >> 1) + (tmp & 1);                  \
1624     }                                                   \
1625     clear_tail(d, oprsz, simd_maxsz(desc));             \
1626 }
1627 
1628 DO_RSHR(gvec_srshr_b, int8_t)
1629 DO_RSHR(gvec_srshr_h, int16_t)
1630 DO_RSHR(gvec_srshr_s, int32_t)
1631 DO_RSHR(gvec_srshr_d, int64_t)
1632 
1633 DO_RSHR(gvec_urshr_b, uint8_t)
1634 DO_RSHR(gvec_urshr_h, uint16_t)
1635 DO_RSHR(gvec_urshr_s, uint32_t)
1636 DO_RSHR(gvec_urshr_d, uint64_t)
1637 
1638 #undef DO_RSHR
1639 
1640 #define DO_RSRA(NAME, TYPE)                             \
1641 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1642 {                                                       \
1643     intptr_t i, oprsz = simd_oprsz(desc);               \
1644     int shift = simd_data(desc);                        \
1645     TYPE *d = vd, *n = vn;                              \
1646     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1647         TYPE tmp = n[i] >> (shift - 1);                 \
1648         d[i] += (tmp >> 1) + (tmp & 1);                 \
1649     }                                                   \
1650     clear_tail(d, oprsz, simd_maxsz(desc));             \
1651 }
1652 
1653 DO_RSRA(gvec_srsra_b, int8_t)
1654 DO_RSRA(gvec_srsra_h, int16_t)
1655 DO_RSRA(gvec_srsra_s, int32_t)
1656 DO_RSRA(gvec_srsra_d, int64_t)
1657 
1658 DO_RSRA(gvec_ursra_b, uint8_t)
1659 DO_RSRA(gvec_ursra_h, uint16_t)
1660 DO_RSRA(gvec_ursra_s, uint32_t)
1661 DO_RSRA(gvec_ursra_d, uint64_t)
1662 
1663 #undef DO_RSRA
1664 
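/*
 * Shift-right-and-insert: the shifted value replaces only the low
 * esize - shift bits of each destination element, leaving the top
 * shift bits of d intact; deposit64 expresses that field insert
 * directly.  SLI below is the mirror image, preserving the low
 * shift bits of d.
 */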
1665 #define DO_SRI(NAME, TYPE)                              \
1666 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1667 {                                                       \
1668     intptr_t i, oprsz = simd_oprsz(desc);               \
1669     int shift = simd_data(desc);                        \
1670     TYPE *d = vd, *n = vn;                              \
1671     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1672         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1673     }                                                   \
1674     clear_tail(d, oprsz, simd_maxsz(desc));             \
1675 }
1676 
1677 DO_SRI(gvec_sri_b, uint8_t)
1678 DO_SRI(gvec_sri_h, uint16_t)
1679 DO_SRI(gvec_sri_s, uint32_t)
1680 DO_SRI(gvec_sri_d, uint64_t)
1681 
1682 #undef DO_SRI
1683 
1684 #define DO_SLI(NAME, TYPE)                              \
1685 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1686 {                                                       \
1687     intptr_t i, oprsz = simd_oprsz(desc);               \
1688     int shift = simd_data(desc);                        \
1689     TYPE *d = vd, *n = vn;                              \
1690     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1691         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1692     }                                                   \
1693     clear_tail(d, oprsz, simd_maxsz(desc));             \
1694 }
1695 
1696 DO_SLI(gvec_sli_b, uint8_t)
1697 DO_SLI(gvec_sli_h, uint16_t)
1698 DO_SLI(gvec_sli_s, uint32_t)
1699 DO_SLI(gvec_sli_d, uint64_t)
1700 
1701 #undef DO_SLI
1702 
1703 /*
1704  * Convert float16 to float32, raising no exceptions and
1705  * preserving exceptional values, including SNaN.
1706  * This is effectively an unpack+repack operation.
1707  */
1708 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1709 {
1710     const int f16_bias = 15;
1711     const int f32_bias = 127;
1712     uint32_t sign = extract32(f16, 15, 1);
1713     uint32_t exp = extract32(f16, 10, 5);
1714     uint32_t frac = extract32(f16, 0, 10);
1715 
1716     if (exp == 0x1f) {
1717         /* Inf or NaN */
1718         exp = 0xff;
1719     } else if (exp == 0) {
1720         /* Zero or denormal.  */
1721         if (frac != 0) {
1722             if (fz16) {
1723                 frac = 0;
1724             } else {
1725                 /*
1726                  * Denormal; these are all normal float32.
1727                  * Shift the fraction so that the msb is at bit 11,
1728                  * then remove bit 11 as the implicit bit of the
1729                  * normalized float32.  Note that we still go through
1730                  * the shift for normal numbers below, to put the
1731                  * float32 fraction at the right place.
1732                  */
1733                 int shift = clz32(frac) - 21;
1734                 frac = (frac << shift) & 0x3ff;
1735                 exp = f32_bias - f16_bias - shift + 1;
1736             }
1737         }
1738     } else {
1739         /* Normal number; adjust the bias.  */
1740         exp += f32_bias - f16_bias;
1741     }
1742     sign <<= 31;
1743     exp <<= 23;
1744     frac <<= 23 - 10;
1745 
1746     return sign | exp | frac;
1747 }
1748 
1749 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
1750 {
1751     /*
1752      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
1753      * Load the 2nd qword iff is_q & is_2.
1754      * Shift to the 2nd dword iff !is_q & is_2.
1755      * For !is_q & !is_2, the upper bits of the result are garbage.
1756      */
1757     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1758 }
1759 
1760 /*
1761  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1762  * as there are not yet SVE versions that might use blocking.
1763  */
1764 
1765 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1766                      uint32_t desc, bool fz16)
1767 {
1768     intptr_t i, oprsz = simd_oprsz(desc);
1769     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1770     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1771     int is_q = oprsz == 16;
1772     uint64_t n_4, m_4;
1773 
1774     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1775     n_4 = load4_f16(vn, is_q, is_2);
1776     m_4 = load4_f16(vm, is_q, is_2);
1777 
1778     /* Negate all inputs for FMLSL at once.  */
1779     if (is_s) {
1780         n_4 ^= 0x8000800080008000ull;
1781     }
1782 
1783     for (i = 0; i < oprsz / 4; i++) {
1784         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1785         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1786         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1787     }
1788     clear_tail(d, oprsz, simd_maxsz(desc));
1789 }
1790 
1791 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1792                             void *venv, uint32_t desc)
1793 {
1794     CPUARMState *env = venv;
1795     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1796              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1797 }
1798 
1799 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1800                             void *venv, uint32_t desc)
1801 {
1802     CPUARMState *env = venv;
1803     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1804              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1805 }
1806 
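/*
 * SVE2 FMLALB/FMLALT (vector forms): sel selects the bottom (0) or
 * top (sizeof(float16)) half of each 32-bit container, and negn is
 * a sign-bit mask applied to the raw float16 before widening, for
 * the FMLSLB/FMLSLT forms.
 */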
1807 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1808                                void *venv, uint32_t desc)
1809 {
1810     intptr_t i, oprsz = simd_oprsz(desc);
1811     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1812     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1813     CPUARMState *env = venv;
1814     float_status *status = &env->vfp.fp_status;
1815     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1816 
1817     for (i = 0; i < oprsz; i += sizeof(float32)) {
1818         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1819         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1820         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1821         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1822         float32 aa = *(float32 *)(va + H1_4(i));
1823 
1824         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1825     }
1826 }
1827 
1828 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1829                          uint32_t desc, bool fz16)
1830 {
1831     intptr_t i, oprsz = simd_oprsz(desc);
1832     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1833     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1834     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1835     int is_q = oprsz == 16;
1836     uint64_t n_4;
1837     float32 m_1;
1838 
1839     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1840     n_4 = load4_f16(vn, is_q, is_2);
1841 
1842     /* Negate all inputs for FMLSL at once.  */
1843     if (is_s) {
1844         n_4 ^= 0x8000800080008000ull;
1845     }
1846 
1847     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1848 
1849     for (i = 0; i < oprsz / 4; i++) {
1850         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1851         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1852     }
1853     clear_tail(d, oprsz, simd_maxsz(desc));
1854 }
1855 
1856 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1857                                 void *venv, uint32_t desc)
1858 {
1859     CPUARMState *env = venv;
1860     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1861                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1862 }
1863 
1864 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1865                                 void *venv, uint32_t desc)
1866 {
1867     CPUARMState *env = venv;
1868     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1869                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1870 }
1871 
1872 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1873                                void *venv, uint32_t desc)
1874 {
1875     intptr_t i, j, oprsz = simd_oprsz(desc);
1876     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1877     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1878     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1879     CPUARMState *env = venv;
1880     float_status *status = &env->vfp.fp_status;
1881     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1882 
1883     for (i = 0; i < oprsz; i += 16) {
1884         float16 mm_16 = *(float16 *)(vm + i + idx);
1885         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1886 
1887         for (j = 0; j < 16; j += sizeof(float32)) {
1888             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1889             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1890             float32 aa = *(float32 *)(va + H1_4(i + j));
1891 
1892             *(float32 *)(vd + H1_4(i + j)) =
1893                 float32_muladd(nn, mm, aa, 0, status);
1894         }
1895     }
1896 }
1897 
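/*
 * Shift by signed vector: the count is the signed low byte of each
 * m element, and a negative count shifts right.  Left shifts of
 * esize or more yield 0; signed right shifts clamp the count at
 * esize - 1 (yielding the sign extension), while unsigned right
 * shifts of esize or more yield 0.
 */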
1898 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1899 {
1900     intptr_t i, opr_sz = simd_oprsz(desc);
1901     int8_t *d = vd, *n = vn, *m = vm;
1902 
1903     for (i = 0; i < opr_sz; ++i) {
1904         int8_t mm = m[i];
1905         int8_t nn = n[i];
1906         int8_t res = 0;
1907         if (mm >= 0) {
1908             if (mm < 8) {
1909                 res = nn << mm;
1910             }
1911         } else {
1912             res = nn >> (mm > -8 ? -mm : 7);
1913         }
1914         d[i] = res;
1915     }
1916     clear_tail(d, opr_sz, simd_maxsz(desc));
1917 }
1918 
1919 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1920 {
1921     intptr_t i, opr_sz = simd_oprsz(desc);
1922     int16_t *d = vd, *n = vn, *m = vm;
1923 
1924     for (i = 0; i < opr_sz / 2; ++i) {
1925         int8_t mm = m[i];   /* only 8 bits of shift are significant */
1926         int16_t nn = n[i];
1927         int16_t res = 0;
1928         if (mm >= 0) {
1929             if (mm < 16) {
1930                 res = nn << mm;
1931             }
1932         } else {
1933             res = nn >> (mm > -16 ? -mm : 15);
1934         }
1935         d[i] = res;
1936     }
1937     clear_tail(d, opr_sz, simd_maxsz(desc));
1938 }
1939 
1940 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1941 {
1942     intptr_t i, opr_sz = simd_oprsz(desc);
1943     uint8_t *d = vd, *n = vn, *m = vm;
1944 
1945     for (i = 0; i < opr_sz; ++i) {
1946         int8_t mm = m[i];
1947         uint8_t nn = n[i];
1948         uint8_t res = 0;
1949         if (mm >= 0) {
1950             if (mm < 8) {
1951                 res = nn << mm;
1952             }
1953         } else {
1954             if (mm > -8) {
1955                 res = nn >> -mm;
1956             }
1957         }
1958         d[i] = res;
1959     }
1960     clear_tail(d, opr_sz, simd_maxsz(desc));
1961 }
1962 
1963 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1964 {
1965     intptr_t i, opr_sz = simd_oprsz(desc);
1966     uint16_t *d = vd, *n = vn, *m = vm;
1967 
1968     for (i = 0; i < opr_sz / 2; ++i) {
1969         int8_t mm = m[i];   /* only 8 bits of shift are significant */
1970         uint16_t nn = n[i];
1971         uint16_t res = 0;
1972         if (mm >= 0) {
1973             if (mm < 16) {
1974                 res = nn << mm;
1975             }
1976         } else {
1977             if (mm > -16) {
1978                 res = nn >> -mm;
1979             }
1980         }
1981         d[i] = res;
1982     }
1983     clear_tail(d, opr_sz, simd_maxsz(desc));
1984 }
1985 
1986 /*
1987  * 8x8->8 polynomial multiply.
1988  *
1989  * Polynomial multiplication is like integer multiplication except the
1990  * partial products are XORed, not added.
1991  *
1992  * TODO: expose this as a generic vector operation, as it is a common
1993  * crypto building block.
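 *
 * E.g. 0b11 * 0b11 = 0b101: the partial products 0b11 and 0b110 are
 * XORed rather than added (an integer multiply would give 0b1001).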
1994  */
1995 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
1996 {
1997     intptr_t i, opr_sz = simd_oprsz(desc);
1998     uint64_t *d = vd, *n = vn, *m = vm;
1999 
2000     for (i = 0; i < opr_sz / 8; ++i) {
2001         d[i] = clmul_8x8_low(n[i], m[i]);
2002     }
2003     clear_tail(d, opr_sz, simd_maxsz(desc));
2004 }
2005 
2006 /*
2007  * 64x64->128 polynomial multiply.
2008  * Because of the lanes are not accessed in strict columns,
2009  * Because the lanes are not accessed in strict columns,
2010  */
2011 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2012 {
2013     intptr_t i, opr_sz = simd_oprsz(desc);
2014     intptr_t hi = simd_data(desc);
2015     uint64_t *d = vd, *n = vn, *m = vm;
2016 
2017     for (i = 0; i < opr_sz / 8; i += 2) {
2018         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2019         d[i] = int128_getlo(r);
2020         d[i + 1] = int128_gethi(r);
2021     }
2022     clear_tail(d, opr_sz, simd_maxsz(desc));
2023 }
2024 
2025 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2026 {
2027     int hi = simd_data(desc);
2028     uint64_t *d = vd, *n = vn, *m = vm;
2029     uint64_t nn = n[hi], mm = m[hi];
2030 
2031     d[0] = clmul_8x4_packed(nn, mm);
2032     nn >>= 32;
2033     mm >>= 32;
2034     d[1] = clmul_8x4_packed(nn, mm);
2035 
2036     clear_tail(d, 16, simd_maxsz(desc));
2037 }
2038 
2039 #ifdef TARGET_AARCH64
2040 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2041 {
2042     int shift = simd_data(desc) * 8;
2043     intptr_t i, opr_sz = simd_oprsz(desc);
2044     uint64_t *d = vd, *n = vn, *m = vm;
2045 
2046     for (i = 0; i < opr_sz / 8; ++i) {
2047         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2048     }
2049 }
2050 
2051 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2052 {
2053     intptr_t sel = H4(simd_data(desc));
2054     intptr_t i, opr_sz = simd_oprsz(desc);
2055     uint32_t *n = vn, *m = vm;
2056     uint64_t *d = vd;
2057 
2058     for (i = 0; i < opr_sz / 8; ++i) {
2059         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2060     }
2061 }
2062 #endif
2063 
2064 #define DO_CMP0(NAME, TYPE, OP)                         \
2065 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2066 {                                                       \
2067     intptr_t i, opr_sz = simd_oprsz(desc);              \
2068     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2069         TYPE nn = *(TYPE *)(vn + i);                    \
2070         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2071     }                                                   \
2072     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2073 }
2074 
2075 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2076 DO_CMP0(gvec_clt0_b, int8_t, <)
2077 DO_CMP0(gvec_cle0_b, int8_t, <=)
2078 DO_CMP0(gvec_cgt0_b, int8_t, >)
2079 DO_CMP0(gvec_cge0_b, int8_t, >=)
2080 
2081 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2082 DO_CMP0(gvec_clt0_h, int16_t, <)
2083 DO_CMP0(gvec_cle0_h, int16_t, <=)
2084 DO_CMP0(gvec_cgt0_h, int16_t, >)
2085 DO_CMP0(gvec_cge0_h, int16_t, >=)
2086 
2087 #undef DO_CMP0
2088 
2089 #define DO_ABD(NAME, TYPE)                                      \
2090 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2091 {                                                               \
2092     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2093     TYPE *d = vd, *n = vn, *m = vm;                             \
2094                                                                 \
2095     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2096         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2097     }                                                           \
2098     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2099 }
2100 
2101 DO_ABD(gvec_sabd_b, int8_t)
2102 DO_ABD(gvec_sabd_h, int16_t)
2103 DO_ABD(gvec_sabd_s, int32_t)
2104 DO_ABD(gvec_sabd_d, int64_t)
2105 
2106 DO_ABD(gvec_uabd_b, uint8_t)
2107 DO_ABD(gvec_uabd_h, uint16_t)
2108 DO_ABD(gvec_uabd_s, uint32_t)
2109 DO_ABD(gvec_uabd_d, uint64_t)
2110 
2111 #undef DO_ABD
2112 
2113 #define DO_ABA(NAME, TYPE)                                      \
2114 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2115 {                                                               \
2116     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2117     TYPE *d = vd, *n = vn, *m = vm;                             \
2118                                                                 \
2119     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2120         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2121     }                                                           \
2122     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2123 }
2124 
2125 DO_ABA(gvec_saba_b, int8_t)
2126 DO_ABA(gvec_saba_h, int16_t)
2127 DO_ABA(gvec_saba_s, int32_t)
2128 DO_ABA(gvec_saba_d, int64_t)
2129 
2130 DO_ABA(gvec_uaba_b, uint8_t)
2131 DO_ABA(gvec_uaba_h, uint16_t)
2132 DO_ABA(gvec_uaba_s, uint32_t)
2133 DO_ABA(gvec_uaba_d, uint64_t)
2134 
2135 #undef DO_ABA
2136 
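/*
 * Floating-point pairwise operations for Neon.  These helpers handle
 * exactly one 64-bit D register's worth of data (two float32 or four
 * float16 results), which is why there is no loop, no clear_tail,
 * and the final size argument is unused.
 */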
2137 #define DO_NEON_PAIRWISE(NAME, OP)                                      \
2138     void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
2139                          void *stat, uint32_t oprsz)                    \
2140     {                                                                   \
2141         float_status *fpst = stat;                                      \
2142         float32 *d = vd;                                                \
2143         float32 *n = vn;                                                \
2144         float32 *m = vm;                                                \
2145         float32 r0, r1;                                                 \
2146                                                                         \
2147         /* Read all inputs before writing outputs in case vm == vd */   \
2148         r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
2149         r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
2150                                                                         \
2151         d[H4(0)] = r0;                                                  \
2152         d[H4(1)] = r1;                                                  \
2153     }                                                                   \
2154                                                                         \
2155     void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
2156                          void *stat, uint32_t oprsz)                    \
2157     {                                                                   \
2158         float_status *fpst = stat;                                      \
2159         float16 *d = vd;                                                \
2160         float16 *n = vn;                                                \
2161         float16 *m = vm;                                                \
2162         float16 r0, r1, r2, r3;                                         \
2163                                                                         \
2164         /* Read all inputs before writing outputs in case vm == vd */   \
2165         r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
2166         r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
2167         r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
2168         r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
2169                                                                         \
2170         d[H2(0)] = r0;                                                  \
2171         d[H2(1)] = r1;                                                  \
2172         d[H2(2)] = r2;                                                  \
2173         d[H2(3)] = r3;                                                  \
2174     }
2175 
2176 DO_NEON_PAIRWISE(neon_padd, add)
2177 DO_NEON_PAIRWISE(neon_pmax, max)
2178 DO_NEON_PAIRWISE(neon_pmin, min)
2179 
2180 #undef DO_NEON_PAIRWISE
2181 
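/*
 * Fixed-point conversions: simd_data holds the number of fraction
 * bits, passed straight through to the per-element VFP conversion
 * routines.
 */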
2182 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2183     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2184     {                                                                   \
2185         intptr_t i, oprsz = simd_oprsz(desc);                           \
2186         int shift = simd_data(desc);                                    \
2187         TYPE *d = vd, *n = vn;                                          \
2188         float_status *fpst = stat;                                      \
2189         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2190             d[i] = FUNC(n[i], shift, fpst);                             \
2191         }                                                               \
2192         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2193     }
2194 
2195 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2196 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2197 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2198 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2199 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2200 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2201 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2202 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2203 
2204 #undef DO_VCVT_FIXED
2205 
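/*
 * Conversions with an explicit rounding mode: swap the requested
 * mode into fpst for the duration of the loop and restore the old
 * mode afterwards, since fpst points at the live CPU float_status.
 */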
2206 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2207     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2208     {                                                                   \
2209         float_status *fpst = stat;                                      \
2210         intptr_t i, oprsz = simd_oprsz(desc);                           \
2211         uint32_t rmode = simd_data(desc);                               \
2212         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2213         TYPE *d = vd, *n = vn;                                          \
2214         set_float_rounding_mode(rmode, fpst);                           \
2215         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2216             d[i] = FUNC(n[i], 0, fpst);                                 \
2217         }                                                               \
2218         set_float_rounding_mode(prev_rmode, fpst);                      \
2219         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2220     }
2221 
2222 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2223 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2224 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2225 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2226 
2227 #undef DO_VCVT_RMODE
2228 
2229 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2230     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2231     {                                                                   \
2232         float_status *fpst = stat;                                      \
2233         intptr_t i, oprsz = simd_oprsz(desc);                           \
2234         uint32_t rmode = simd_data(desc);                               \
2235         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2236         TYPE *d = vd, *n = vn;                                          \
2237         set_float_rounding_mode(rmode, fpst);                           \
2238         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2239             d[i] = FUNC(n[i], fpst);                                    \
2240         }                                                               \
2241         set_float_rounding_mode(prev_rmode, fpst);                      \
2242         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2243     }
2244 
2245 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2246 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2247 
2248 #undef DO_VRINT_RMODE
2249 
2250 #ifdef TARGET_AARCH64
2251 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2252 {
2253     const uint8_t *indices = vm;
2254     CPUARMState *env = venv;
2255     size_t oprsz = simd_oprsz(desc);
2256     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2257     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2258     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2259     union {
2260         uint8_t b[16];
2261         uint64_t d[2];
2262     } result;
2263 
2264     /*
2265      * We must construct the final result in a temp, lest the output
2266      * overlap the input table.  For TBL, begin with zero; for TBX,
2267      * begin with the original register contents.  Note that we always
2268      * copy 16 bytes here to avoid an extra branch; clearing the high
2269      * bits of the register for oprsz == 8 is handled below.
2270      */
2271     if (is_tbx) {
2272         memcpy(&result, vd, 16);
2273     } else {
2274         memset(&result, 0, 16);
2275     }
2276 
2277     for (size_t i = 0; i < oprsz; ++i) {
2278         uint32_t index = indices[H1(i)];
2279 
2280         if (index < table_len) {
2281             /*
2282              * Convert index (a byte offset into the virtual table
2283              * which is a series of 128-bit vectors concatenated)
2284              * into the correct register element, bearing in mind
2285              * that the table can wrap around from V31 to V0.
2286              */
2287             const uint8_t *table = (const uint8_t *)
2288                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2289             result.b[H1(i)] = table[H1(index % 16)];
2290         }
2291     }
2292 
2293     memcpy(vd, &result, 16);
2294     clear_tail(vd, oprsz, simd_maxsz(desc));
2295 }
2296 #endif
2297 
2298 /*
2299  * NxN -> N highpart multiply
2300  *
2301  * TODO: expose this as a generic vector operation.
2302  */
2303 
2304 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2305 {
2306     intptr_t i, opr_sz = simd_oprsz(desc);
2307     int8_t *d = vd, *n = vn, *m = vm;
2308 
2309     for (i = 0; i < opr_sz; ++i) {
2310         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2311     }
2312     clear_tail(d, opr_sz, simd_maxsz(desc));
2313 }
2314 
2315 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2316 {
2317     intptr_t i, opr_sz = simd_oprsz(desc);
2318     int16_t *d = vd, *n = vn, *m = vm;
2319 
2320     for (i = 0; i < opr_sz / 2; ++i) {
2321         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2322     }
2323     clear_tail(d, opr_sz, simd_maxsz(desc));
2324 }
2325 
2326 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2327 {
2328     intptr_t i, opr_sz = simd_oprsz(desc);
2329     int32_t *d = vd, *n = vn, *m = vm;
2330 
2331     for (i = 0; i < opr_sz / 4; ++i) {
2332         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2333     }
2334     clear_tail(d, opr_sz, simd_maxsz(desc));
2335 }
2336 
2337 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2338 {
2339     intptr_t i, opr_sz = simd_oprsz(desc);
2340     uint64_t *d = vd, *n = vn, *m = vm;
2341     uint64_t discard;
2342 
2343     for (i = 0; i < opr_sz / 8; ++i) {
2344         muls64(&discard, &d[i], n[i], m[i]);
2345     }
2346     clear_tail(d, opr_sz, simd_maxsz(desc));
2347 }
2348 
2349 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2350 {
2351     intptr_t i, opr_sz = simd_oprsz(desc);
2352     uint8_t *d = vd, *n = vn, *m = vm;
2353 
2354     for (i = 0; i < opr_sz; ++i) {
2355         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2356     }
2357     clear_tail(d, opr_sz, simd_maxsz(desc));
2358 }
2359 
2360 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2361 {
2362     intptr_t i, opr_sz = simd_oprsz(desc);
2363     uint16_t *d = vd, *n = vn, *m = vm;
2364 
2365     for (i = 0; i < opr_sz / 2; ++i) {
2366         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2367     }
2368     clear_tail(d, opr_sz, simd_maxsz(desc));
2369 }
2370 
2371 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2372 {
2373     intptr_t i, opr_sz = simd_oprsz(desc);
2374     uint32_t *d = vd, *n = vn, *m = vm;
2375 
2376     for (i = 0; i < opr_sz / 4; ++i) {
2377         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2378     }
2379     clear_tail(d, opr_sz, simd_maxsz(desc));
2380 }
2381 
2382 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2383 {
2384     intptr_t i, opr_sz = simd_oprsz(desc);
2385     uint64_t *d = vd, *n = vn, *m = vm;
2386     uint64_t discard;
2387 
2388     for (i = 0; i < opr_sz / 8; ++i) {
2389         mulu64(&discard, &d[i], n[i], m[i]);
2390     }
2391     clear_tail(d, opr_sz, simd_maxsz(desc));
2392 }
2393 
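/*
 * XAR (SHA3): exclusive-or the two inputs, then rotate each 64-bit
 * element right by the immediate.
 */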
2394 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2395 {
2396     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2397     int shr = simd_data(desc);
2398     uint64_t *d = vd, *n = vn, *m = vm;
2399 
2400     for (i = 0; i < opr_sz; ++i) {
2401         d[i] = ror64(n[i] ^ m[i], shr);
2402     }
2403     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2404 }
2405 
2406 /*
2407  * Integer matrix-multiply accumulate
2408  */
2409 
2410 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2411 {
2412     int8_t *n = vn, *m = vm;
2413 
2414     for (intptr_t k = 0; k < 8; ++k) {
2415         sum += n[H1(k)] * m[H1(k)];
2416     }
2417     return sum;
2418 }
2419 
2420 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2421 {
2422     uint8_t *n = vn, *m = vm;
2423 
2424     for (intptr_t k = 0; k < 8; ++k) {
2425         sum += n[H1(k)] * m[H1(k)];
2426     }
2427     return sum;
2428 }
2429 
2430 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2431 {
2432     uint8_t *n = vn;
2433     int8_t *m = vm;
2434 
2435     for (intptr_t k = 0; k < 8; ++k) {
2436         sum += n[H1(k)] * m[H1(k)];
2437     }
2438     return sum;
2439 }
2440 
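/*
 * Each 16-byte segment forms a 2x2 matrix of int32 accumulators:
 * d[2*r + c] = a[2*r + c] + dot(row r of n, row c of m), where each
 * row is eight consecutive bytes and dot() is one of the 8-way
 * inner loops above.
 */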
2441 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2442                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2443 {
2444     intptr_t seg, opr_sz = simd_oprsz(desc);
2445 
2446     for (seg = 0; seg < opr_sz; seg += 16) {
2447         uint32_t *d = vd + seg;
2448         uint32_t *a = va + seg;
2449         uint32_t sum0, sum1, sum2, sum3;
2450 
2451         /*
2452          * Process the entire segment at once, writing back the
2453          * results only after we've consumed all of the inputs.
2454          *
2455          * Key to indices by column:
2456          *          i   j                  i             j
2457          */
2458         sum0 = a[H4(0 + 0)];
2459         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2460         sum1 = a[H4(0 + 1)];
2461         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2462         sum2 = a[H4(2 + 0)];
2463         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2464         sum3 = a[H4(2 + 1)];
2465         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2466 
2467         d[H4(0)] = sum0;
2468         d[H4(1)] = sum1;
2469         d[H4(2)] = sum2;
2470         d[H4(3)] = sum3;
2471     }
2472     clear_tail(vd, opr_sz, simd_maxsz(desc));
2473 }
2474 
2475 #define DO_MMLA_B(NAME, INNER) \
2476     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2477     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2478 
2479 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2480 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2481 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2482 
2483 /*
2484  * BFloat16 Dot Product
2485  */
2486 
2487 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2488 {
2489     /* FPCR is ignored for BFDOT and BFMMLA. */
2490     float_status bf_status = {
2491         .tininess_before_rounding = float_tininess_before_rounding,
2492         .float_rounding_mode = float_round_to_odd_inf,
2493         .flush_to_zero = true,
2494         .flush_inputs_to_zero = true,
2495         .default_nan_mode = true,
2496     };
2497     float32 t1, t2;
2498 
2499     /*
2500      * Extract each BFloat16 from the element pair, and shift
2501      * them such that they become float32.
2502      */
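     * A bfloat16 is the high half of a float32 with the same value,
     * so e << 16 widens the low element exactly and e & 0xffff0000u
     * widens the high element.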
2503     t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2504     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2505     t1 = float32_add(t1, t2, &bf_status);
2506     t1 = float32_add(sum, t1, &bf_status);
2507 
2508     return t1;
2509 }
2510 
2511 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2512 {
2513     intptr_t i, opr_sz = simd_oprsz(desc);
2514     float32 *d = vd, *a = va;
2515     uint32_t *n = vn, *m = vm;
2516 
2517     for (i = 0; i < opr_sz / 4; ++i) {
2518         d[i] = bfdotadd(a[i], n[i], m[i]);
2519     }
2520     clear_tail(d, opr_sz, simd_maxsz(desc));
2521 }
2522 
2523 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2524                             void *va, uint32_t desc)
2525 {
2526     intptr_t i, j, opr_sz = simd_oprsz(desc);
2527     intptr_t index = simd_data(desc);
2528     intptr_t elements = opr_sz / 4;
2529     intptr_t eltspersegment = MIN(16 / 4, elements);
2530     float32 *d = vd, *a = va;
2531     uint32_t *n = vn, *m = vm;
2532 
2533     for (i = 0; i < elements; i += eltspersegment) {
2534         uint32_t m_idx = m[i + H4(index)];
2535 
2536         for (j = i; j < i + eltspersegment; j++) {
2537             d[j] = bfdotadd(a[j], n[j], m_idx);
2538         }
2539     }
2540     clear_tail(d, opr_sz, simd_maxsz(desc));
2541 }
2542 
2543 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2544 {
2545     intptr_t s, opr_sz = simd_oprsz(desc);
2546     float32 *d = vd, *a = va;
2547     uint32_t *n = vn, *m = vm;
2548 
2549     for (s = 0; s < opr_sz / 4; s += 4) {
2550         float32 sum00, sum01, sum10, sum11;
2551 
2552         /*
2553          * Process the entire segment at once, writing back the
2554          * results only after we've consumed all of the inputs.
2555          *
2556          * Key to indices by column:
2557          *               i   j           i   k             j   k
2558          */
2559         sum00 = a[s + H4(0 + 0)];
2560         sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2561         sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2562 
2563         sum01 = a[s + H4(0 + 1)];
2564         sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2565         sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2566 
2567         sum10 = a[s + H4(2 + 0)];
2568         sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2569         sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2570 
2571         sum11 = a[s + H4(2 + 1)];
2572         sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2573         sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2574 
2575         d[s + H4(0 + 0)] = sum00;
2576         d[s + H4(0 + 1)] = sum01;
2577         d[s + H4(2 + 0)] = sum10;
2578         d[s + H4(2 + 1)] = sum11;
2579     }
2580     clear_tail(d, opr_sz, simd_maxsz(desc));
2581 }
2582 
2583 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2584                          void *stat, uint32_t desc)
2585 {
2586     intptr_t i, opr_sz = simd_oprsz(desc);
2587     intptr_t sel = simd_data(desc);
2588     float32 *d = vd, *a = va;
2589     bfloat16 *n = vn, *m = vm;
2590 
2591     for (i = 0; i < opr_sz / 4; ++i) {
2592         float32 nn = n[H2(i * 2 + sel)] << 16;
2593         float32 mm = m[H2(i * 2 + sel)] << 16;
2594         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2595     }
2596     clear_tail(d, opr_sz, simd_maxsz(desc));
2597 }
2598 
2599 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2600                              void *va, void *stat, uint32_t desc)
2601 {
2602     intptr_t i, j, opr_sz = simd_oprsz(desc);
2603     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2604     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2605     intptr_t elements = opr_sz / 4;
2606     intptr_t eltspersegment = MIN(16 / 4, elements);
2607     float32 *d = vd, *a = va;
2608     bfloat16 *n = vn, *m = vm;
2609 
2610     for (i = 0; i < elements; i += eltspersegment) {
2611         float32 m_idx = m[H2(2 * i + index)] << 16;
2612 
2613         for (j = i; j < i + eltspersegment; j++) {
2614             float32 n_j = n[H2(2 * j + sel)] << 16;
2615             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2616         }
2617     }
2618     clear_tail(d, opr_sz, simd_maxsz(desc));
2619 }
2620 
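/*
 * Clamp each element of a to the range [n, m] (lower, upper):
 * d = MIN(MAX(a, n), m); when n > m the result is m.
 */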
2621 #define DO_CLAMP(NAME, TYPE) \
2622 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
2623 {                                                                       \
2624     intptr_t i, opr_sz = simd_oprsz(desc);                              \
2625     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
2626         TYPE aa = *(TYPE *)(a + i);                                     \
2627         TYPE nn = *(TYPE *)(n + i);                                     \
2628         TYPE mm = *(TYPE *)(m + i);                                     \
2629         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
2630         *(TYPE *)(d + i) = dd;                                          \
2631     }                                                                   \
2632     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
2633 }
2634 
2635 DO_CLAMP(gvec_sclamp_b, int8_t)
2636 DO_CLAMP(gvec_sclamp_h, int16_t)
2637 DO_CLAMP(gvec_sclamp_s, int32_t)
2638 DO_CLAMP(gvec_sclamp_d, int64_t)
2639 
2640 DO_CLAMP(gvec_uclamp_b, uint8_t)
2641 DO_CLAMP(gvec_uclamp_h, uint16_t)
2642 DO_CLAMP(gvec_uclamp_s, uint32_t)
2643 DO_CLAMP(gvec_uclamp_d, uint64_t)
2644