/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
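
/*
 * These tables are consumed via small inline helpers (expand_pred_b()
 * and expand_pred_h() in vec_internal.h) that map one byte of an SVE
 * predicate to a 64-bit lane mask.  A minimal sketch of the intended
 * use, assuming a packed predicate byte:
 *
 *     uint64_t mask = expand_pred_b_data[pred_byte];
 *     d[i] = (n[i] & mask) | (m[i] & ~mask);
 *
 * For half-word elements only the even predicate bits are significant,
 * which is why the second table is indexed with (pred_byte & 0x55).
 */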

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
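
/*
 * Worked example of the saturating case above:
 * do_sqrdmlah_b(INT8_MIN, INT8_MIN, 0, false, true) computes
 *     ret = (-128 * -128) + (1 << 6) = 16448;  ret >>= 7  ->  128
 * which no longer fits in int8_t, so the result saturates to INT8_MAX.
 * This is the classic SQRDMULH 0x80 * 0x80 case.
 */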

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
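
/*
 * The *sat argument lets each caller decide what saturation means for
 * it: the Neon helpers below point it at env->vfp.qc[0], so that any
 * saturated element sets the sticky QC flag, while the SVE2 helpers
 * pass a discard temporary and ignore saturation entirely.
 */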

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}
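
/*
 * The 128-bit intermediate is unavoidable at this width: for
 * n = m = INT64_MIN the raw product alone is 2^126, which no 64-bit
 * type can represent.  After rounding and the shift by 63 the value
 * 2^63 still does not fit in int64_t; do_sat128_d notices that the
 * high and low halves of the Int128 disagree and returns INT64_MAX.
 */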

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8- and 16-bit dot-products.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
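
/*
 * As a concrete instance of the macro above, gvec_sdot_b computes, for
 * each 32-bit lane:
 *
 *     d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *                 + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 *
 * The (TYPED) cast on each n element widens that multiply to the
 * destination type, so the narrow products cannot overflow before
 * they are summed.
 */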

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + 4;                                                   \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
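
/*
 * In the indexed form the four m elements selected by the index are
 * reused for every dot-product within the same 128-bit segment.  For
 * example, gvec_sdot_idx_b with index 2 (on a little-endian host)
 * multiplies n[0..15] against m[8..11], four bytes per result lane,
 * for the first segment, then reloads m0..m3 from the next segment.
 */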

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
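
/*
 * Writing one iteration of the loops above out for a single complex
 * pair shows the rotation: with neg_imag set (and neg_real clear)
 * they compute
 *     d.real = n.real - m.imag
 *     d.imag = n.imag + m.real
 * which is FCADD with a 90-degree rotation; the opposite setting of
 * the two flags gives the 270-degree case.
 */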

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}
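
/*
 * The unary minus on the 0/1 softfloat result is what builds the mask:
 * e.g. float16_eq_quiet() returning 1 gives (uint16_t)-1 == 0xffff,
 * i.e. every bit of the lane set.  GE and GT are obtained by swapping
 * the operands of LE and LT, and the "absolute" forms (FACGE/FACGT)
 * simply clear both sign bits first.
 */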

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}
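
/*
 * These step functions supply one half of a Newton-Raphson iteration;
 * guest code supplies the multiply.  A sketch of how VRECPS is
 * typically used to refine an estimate x of 1/a (hypothetical helper,
 * not part of this file):
 *
 *     static float32 refine_recip(float32 a, float32 x, float_status *st)
 *     {
 *         return float32_mul(x, float32_recps_nf(a, x, st), st);
 *     }
 *
 * since x * (2 - a*x) converges quadratically towards 1/a.  The
 * infinity-times-zero special cases above keep the iteration stable
 * when the estimate is already exact (zero or infinity).
 */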

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc., which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}
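
/*
 * The fused and non-fused forms differ observably in rounding, since
 * the non-fused form rounds the product before the add.  For example,
 * in float16 with op1 = op2 = 0x3c01 (1 + 2^-10) and dest = -(1 + 2^-9):
 *   non-fused: op1*op2 rounds to 1 + 2^-9, so the sum is exactly 0
 *   fused:     the result is 2^-20 exactly (0x0010, a denormal)
 */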

#define DO_MULADD(NAME, FUNC, TYPE)                                     \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
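
/*
 * Concretely: gvec_mul_idx_h on a 32-byte SVE vector with idx = 3
 * computes d[0..7] = n[0..7] * m[3] for the first 128-bit segment and
 * d[8..15] = n[8..15] * m[11] for the second; an AdvSIMD operation
 * only ever sees the first segment.
 */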

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = n[i + j] * mm;                                      \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = a[i + j] OP n[i + j] * mm;                          \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop
1430 
1431 /*
1432  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1433  * the fused ops below, these accumulate both from and into Vd.
1434  */
1435 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1436 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1437 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1438 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1439 
1440 #undef DO_FMUL_IDX
1441 
1442 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1443 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1444                   void *stat, uint32_t desc)                               \
1445 {                                                                          \
1446     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1447     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1448     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1449     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1450     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1451     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1452     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1453         TYPE mm = m[H(i + idx)];                                           \
1454         for (j = 0; j < segment; j++) {                                    \
1455             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1456                                      mm, a[i + j], 0, stat);               \
1457         }                                                                  \
1458     }                                                                      \
1459     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1460 }
1461 
1462 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1463 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1464 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1465 
1466 #undef DO_FMLA_IDX
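
/*
 * A note on the op1_neg trick above: shifting the single descriptor bit
 * up to the sign-bit position makes the later "n[i + j] ^ op1_neg" a
 * free floating-point negation.  For float32 with the bit set,
 * op1_neg == 0x80000000, so the XOR flips only the sign of each
 * multiplicand, which is exactly what FMLS requires.
 */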
1467 
1468 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1469 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1470 {                                                                          \
1471     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1472     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1473     bool q = false;                                                        \
1474     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1475         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1476         if (dd < MIN) {                                                    \
1477             dd = MIN;                                                      \
1478             q = true;                                                      \
1479         } else if (dd > MAX) {                                             \
1480             dd = MAX;                                                      \
1481             q = true;                                                      \
1482         }                                                                  \
1483         d[i] = dd;                                                         \
1484     }                                                                      \
1485     if (q) {                                                               \
1486         uint32_t *qc = vq;                                                 \
1487         qc[0] = 1;                                                         \
1488     }                                                                      \
1489     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1490 }
1491 
1492 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1493 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1494 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1495 
1496 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1497 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1498 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1499 
1500 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1501 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1502 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1503 
1504 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1505 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1506 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1507 
1508 #undef DO_SAT
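
/*
 * A worked example with arbitrary values: gvec_uqadd_b given
 * n[i] == 0xf0 and m[i] == 0x20 computes dd == 0x110 in the wide type,
 * which exceeds UINT8_MAX, so d[i] saturates to 0xff and the sticky
 * QC flag is set via qc[0] = 1.
 */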
1509 
1510 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1511                           void *vm, uint32_t desc)
1512 {
1513     intptr_t i, oprsz = simd_oprsz(desc);
1514     uint64_t *d = vd, *n = vn, *m = vm;
1515     bool q = false;
1516 
1517     for (i = 0; i < oprsz / 8; i++) {
1518         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1519         if (dd < nn) {
1520             dd = UINT64_MAX;
1521             q = true;
1522         }
1523         d[i] = dd;
1524     }
1525     if (q) {
1526         uint32_t *qc = vq;
1527         qc[0] = 1;
1528     }
1529     clear_tail(d, oprsz, simd_maxsz(desc));
1530 }
1531 
1532 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1533                           void *vm, uint32_t desc)
1534 {
1535     intptr_t i, oprsz = simd_oprsz(desc);
1536     uint64_t *d = vd, *n = vn, *m = vm;
1537     bool q = false;
1538 
1539     for (i = 0; i < oprsz / 8; i++) {
1540         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1541         if (nn < mm) {
1542             dd = 0;
1543             q = true;
1544         }
1545         d[i] = dd;
1546     }
1547     if (q) {
1548         uint32_t *qc = vq;
1549         qc[0] = 1;
1550     }
1551     clear_tail(d, oprsz, simd_maxsz(desc));
1552 }
1553 
1554 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1555                           void *vm, uint32_t desc)
1556 {
1557     intptr_t i, oprsz = simd_oprsz(desc);
1558     int64_t *d = vd, *n = vn, *m = vm;
1559     bool q = false;
1560 
1561     for (i = 0; i < oprsz / 8; i++) {
1562         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1563         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1564             dd = (nn >> 63) ^ ~INT64_MIN;
1565             q = true;
1566         }
1567         d[i] = dd;
1568     }
1569     if (q) {
1570         uint32_t *qc = vq;
1571         qc[0] = 1;
1572     }
1573     clear_tail(d, oprsz, simd_maxsz(desc));
1574 }
1575 
1576 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1577                           void *vm, uint32_t desc)
1578 {
1579     intptr_t i, oprsz = simd_oprsz(desc);
1580     int64_t *d = vd, *n = vn, *m = vm;
1581     bool q = false;
1582 
1583     for (i = 0; i < oprsz / 8; i++) {
1584         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1585         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1586             dd = (nn >> 63) ^ ~INT64_MIN;
1587             q = true;
1588         }
1589         d[i] = dd;
1590     }
1591     if (q) {
1592         uint32_t *qc = vq;
1593         qc[0] = 1;
1594     }
1595     clear_tail(d, oprsz, simd_maxsz(desc));
1596 }
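
/*
 * The 64-bit overflow tests above are the usual sign-bit identities:
 * signed addition overflows iff the operands have the same sign and
 * the result's sign differs, hence ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN;
 * subtraction overflows iff the operand signs differ and the result's
 * sign differs from nn, hence ((dd ^ nn) & (nn ^ mm)) & INT64_MIN.
 * On overflow, (nn >> 63) ^ ~INT64_MIN selects the saturated value:
 * INT64_MAX when nn >= 0, INT64_MIN when nn < 0.
 */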
1597 
1598 
1599 #define DO_SRA(NAME, TYPE)                              \
1600 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1601 {                                                       \
1602     intptr_t i, oprsz = simd_oprsz(desc);               \
1603     int shift = simd_data(desc);                        \
1604     TYPE *d = vd, *n = vn;                              \
1605     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1606         d[i] += n[i] >> shift;                          \
1607     }                                                   \
1608     clear_tail(d, oprsz, simd_maxsz(desc));             \
1609 }
1610 
1611 DO_SRA(gvec_ssra_b, int8_t)
1612 DO_SRA(gvec_ssra_h, int16_t)
1613 DO_SRA(gvec_ssra_s, int32_t)
1614 DO_SRA(gvec_ssra_d, int64_t)
1615 
1616 DO_SRA(gvec_usra_b, uint8_t)
1617 DO_SRA(gvec_usra_h, uint16_t)
1618 DO_SRA(gvec_usra_s, uint32_t)
1619 DO_SRA(gvec_usra_d, uint64_t)
1620 
1621 #undef DO_SRA
1622 
1623 #define DO_RSHR(NAME, TYPE)                             \
1624 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1625 {                                                       \
1626     intptr_t i, oprsz = simd_oprsz(desc);               \
1627     int shift = simd_data(desc);                        \
1628     TYPE *d = vd, *n = vn;                              \
1629     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1630         TYPE tmp = n[i] >> (shift - 1);                 \
1631         d[i] = (tmp >> 1) + (tmp & 1);                  \
1632     }                                                   \
1633     clear_tail(d, oprsz, simd_maxsz(desc));             \
1634 }
1635 
1636 DO_RSHR(gvec_srshr_b, int8_t)
1637 DO_RSHR(gvec_srshr_h, int16_t)
1638 DO_RSHR(gvec_srshr_s, int32_t)
1639 DO_RSHR(gvec_srshr_d, int64_t)
1640 
1641 DO_RSHR(gvec_urshr_b, uint8_t)
1642 DO_RSHR(gvec_urshr_h, uint16_t)
1643 DO_RSHR(gvec_urshr_s, uint32_t)
1644 DO_RSHR(gvec_urshr_d, uint64_t)
1645 
1646 #undef DO_RSHR
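
/*
 * The rounding illustrated with arbitrary values: gvec_urshr_b with
 * shift == 2 and n[i] == 7 computes tmp = 7 >> 1 = 3, then
 * d[i] = (3 >> 1) + (3 & 1) = 2, i.e. (7 + 2) >> 2 with the rounding
 * addend folded in so the intermediate cannot overflow TYPE.
 */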
1647 
1648 #define DO_RSRA(NAME, TYPE)                             \
1649 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1650 {                                                       \
1651     intptr_t i, oprsz = simd_oprsz(desc);               \
1652     int shift = simd_data(desc);                        \
1653     TYPE *d = vd, *n = vn;                              \
1654     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1655         TYPE tmp = n[i] >> (shift - 1);                 \
1656         d[i] += (tmp >> 1) + (tmp & 1);                 \
1657     }                                                   \
1658     clear_tail(d, oprsz, simd_maxsz(desc));             \
1659 }
1660 
1661 DO_RSRA(gvec_srsra_b, int8_t)
1662 DO_RSRA(gvec_srsra_h, int16_t)
1663 DO_RSRA(gvec_srsra_s, int32_t)
1664 DO_RSRA(gvec_srsra_d, int64_t)
1665 
1666 DO_RSRA(gvec_ursra_b, uint8_t)
1667 DO_RSRA(gvec_ursra_h, uint16_t)
1668 DO_RSRA(gvec_ursra_s, uint32_t)
1669 DO_RSRA(gvec_ursra_d, uint64_t)
1670 
1671 #undef DO_RSRA
1672 
1673 #define DO_SRI(NAME, TYPE)                              \
1674 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1675 {                                                       \
1676     intptr_t i, oprsz = simd_oprsz(desc);               \
1677     int shift = simd_data(desc);                        \
1678     TYPE *d = vd, *n = vn;                              \
1679     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1680         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1681     }                                                   \
1682     clear_tail(d, oprsz, simd_maxsz(desc));             \
1683 }
1684 
1685 DO_SRI(gvec_sri_b, uint8_t)
1686 DO_SRI(gvec_sri_h, uint16_t)
1687 DO_SRI(gvec_sri_s, uint32_t)
1688 DO_SRI(gvec_sri_d, uint64_t)
1689 
1690 #undef DO_SRI
1691 
1692 #define DO_SLI(NAME, TYPE)                              \
1693 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1694 {                                                       \
1695     intptr_t i, oprsz = simd_oprsz(desc);               \
1696     int shift = simd_data(desc);                        \
1697     TYPE *d = vd, *n = vn;                              \
1698     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1699         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1700     }                                                   \
1701     clear_tail(d, oprsz, simd_maxsz(desc));             \
1702 }
1703 
1704 DO_SLI(gvec_sli_b, uint8_t)
1705 DO_SLI(gvec_sli_h, uint16_t)
1706 DO_SLI(gvec_sli_s, uint32_t)
1707 DO_SLI(gvec_sli_d, uint64_t)
1708 
1709 #undef DO_SLI
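
/*
 * Shift-and-insert illustrated for uint8_t with shift == 3 (values
 * chosen arbitrarily): SRI keeps the top 3 bits of d[i] and deposits
 * n[i] >> 3 into bits [4:0]; SLI keeps the bottom 3 bits of d[i] and
 * deposits n[i] into bits [7:3].  Either way, exactly 8 - shift bits
 * of the source survive.
 */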
1710 
1711 /*
1712  * Convert float16 to float32, raising no exceptions and
1713  * preserving exceptional values, including SNaN.
1714  * This is effectively an unpack+repack operation.
1715  */
1716 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1717 {
1718     const int f16_bias = 15;
1719     const int f32_bias = 127;
1720     uint32_t sign = extract32(f16, 15, 1);
1721     uint32_t exp = extract32(f16, 10, 5);
1722     uint32_t frac = extract32(f16, 0, 10);
1723 
1724     if (exp == 0x1f) {
1725         /* Inf or NaN */
1726         exp = 0xff;
1727     } else if (exp == 0) {
1728         /* Zero or denormal.  */
1729         if (frac != 0) {
1730             if (fz16) {
1731                 frac = 0;
1732             } else {
1733                 /*
1734                  * Denormal; these are all normal float32.
1735                  * Shift the fraction so that the msb is at bit 11,
1736                  * then remove bit 11 as the implicit bit of the
1737                  * normalized float32.  Note that we still go through
1738                  * the shift for normal numbers below, to put the
1739                  * float32 fraction at the right place.
1740                  */
1741                 int shift = clz32(frac) - 21;
1742                 frac = (frac << shift) & 0x3ff;
1743                 exp = f32_bias - f16_bias - shift + 1;
1744             }
1745         }
1746     } else {
1747         /* Normal number; adjust the bias.  */
1748         exp += f32_bias - f16_bias;
1749     }
1750     sign <<= 31;
1751     exp <<= 23;
1752     frac <<= 23 - 10;
1753 
1754     return sign | exp | frac;
1755 }
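
/*
 * A worked denormal case: f16 == 0x0001 (the smallest subnormal,
 * 2^-24) has exp == 0 and frac == 1.  clz32(1) == 31 gives shift == 10,
 * so frac becomes (1 << 10) & 0x3ff == 0 and exp becomes
 * 127 - 15 - 10 + 1 == 103, yielding the float32 0x33800000, which is
 * indeed 2^-24.
 */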
1756 
1757 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
1758 {
1759     /*
1760      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
1761      * Load the 2nd qword iff is_q & is_2.
1762      * Shift to the 2nd dword iff !is_q & is_2.
1763      * For !is_q & !is_2, the upper bits of the result are garbage.
1764      */
1765     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1766 }
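
/*
 * The four cases evaluate as:
 *   (is_q, is_2) == (0, 0): ptr[0] >> 0,  u32[0] in the low half
 *   (is_q, is_2) == (0, 1): ptr[0] >> 32, u32[1] moved to the low half
 *   (is_q, is_2) == (1, 0): ptr[0] >> 0,  u64[0]
 *   (is_q, is_2) == (1, 1): ptr[1] >> 0,  u64[1]
 */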
1767 
1768 /*
1769  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1770  * as there are not yet SVE versions that might use blocking.
1771  */
1772 
1773 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1774                      uint32_t desc, bool fz16)
1775 {
1776     intptr_t i, oprsz = simd_oprsz(desc);
1777     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1778     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1779     int is_q = oprsz == 16;
1780     uint64_t n_4, m_4;
1781 
1782     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1783     n_4 = load4_f16(vn, is_q, is_2);
1784     m_4 = load4_f16(vm, is_q, is_2);
1785 
1786     /* Negate all inputs for FMLSL at once.  */
1787     if (is_s) {
1788         n_4 ^= 0x8000800080008000ull;
1789     }
1790 
1791     for (i = 0; i < oprsz / 4; i++) {
1792         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1793         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1794         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1795     }
1796     clear_tail(d, oprsz, simd_maxsz(desc));
1797 }
1798 
1799 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1800                             void *venv, uint32_t desc)
1801 {
1802     CPUARMState *env = venv;
1803     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1804              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1805 }
1806 
1807 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1808                             void *venv, uint32_t desc)
1809 {
1810     CPUARMState *env = venv;
1811     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1812              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1813 }
1814 
1815 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1816                                void *venv, uint32_t desc)
1817 {
1818     intptr_t i, oprsz = simd_oprsz(desc);
1819     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1820     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1821     CPUARMState *env = venv;
1822     float_status *status = &env->vfp.fp_status;
1823     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1824 
1825     for (i = 0; i < oprsz; i += sizeof(float32)) {
1826         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1827         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1828         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1829         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1830         float32 aa = *(float32 *)(va + H1_4(i));
1831 
1832         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1833     }
1834 }
1835 
1836 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1837                          uint32_t desc, bool fz16)
1838 {
1839     intptr_t i, oprsz = simd_oprsz(desc);
1840     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1841     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1842     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1843     int is_q = oprsz == 16;
1844     uint64_t n_4;
1845     float32 m_1;
1846 
1847     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1848     n_4 = load4_f16(vn, is_q, is_2);
1849 
1850     /* Negate all inputs for FMLSL at once.  */
1851     if (is_s) {
1852         n_4 ^= 0x8000800080008000ull;
1853     }
1854 
1855     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1856 
1857     for (i = 0; i < oprsz / 4; i++) {
1858         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1859         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1860     }
1861     clear_tail(d, oprsz, simd_maxsz(desc));
1862 }
1863 
1864 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1865                                 void *venv, uint32_t desc)
1866 {
1867     CPUARMState *env = venv;
1868     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1869                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1870 }
1871 
1872 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1873                                 void *venv, uint32_t desc)
1874 {
1875     CPUARMState *env = venv;
1876     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1877                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1878 }
1879 
1880 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1881                                void *venv, uint32_t desc)
1882 {
1883     intptr_t i, j, oprsz = simd_oprsz(desc);
1884     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1885     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1886     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1887     CPUARMState *env = venv;
1888     float_status *status = &env->vfp.fp_status;
1889     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1890 
1891     for (i = 0; i < oprsz; i += 16) {
1892         float16 mm_16 = *(float16 *)(vm + i + idx);
1893         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1894 
1895         for (j = 0; j < 16; j += sizeof(float32)) {
1896             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1897             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1898             float32 aa = *(float32 *)(va + H1_4(i + j));
1899 
1900             *(float32 *)(vd + H1_4(i + j)) =
1901                 float32_muladd(nn, mm, aa, 0, status);
1902         }
1903     }
1904 }
1905 
1906 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1907 {
1908     intptr_t i, opr_sz = simd_oprsz(desc);
1909     int8_t *d = vd, *n = vn, *m = vm;
1910 
1911     for (i = 0; i < opr_sz; ++i) {
1912         int8_t mm = m[i];
1913         int8_t nn = n[i];
1914         int8_t res = 0;
1915         if (mm >= 0) {
1916             if (mm < 8) {
1917                 res = nn << mm;
1918             }
1919         } else {
1920             res = nn >> (mm > -8 ? -mm : 7);
1921         }
1922         d[i] = res;
1923     }
1924     clear_tail(d, opr_sz, simd_maxsz(desc));
1925 }
1926 
1927 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1928 {
1929     intptr_t i, opr_sz = simd_oprsz(desc);
1930     int16_t *d = vd, *n = vn, *m = vm;
1931 
1932     for (i = 0; i < opr_sz / 2; ++i) {
1933         int8_t mm = m[i];   /* only 8 bits of shift are significant */
1934         int16_t nn = n[i];
1935         int16_t res = 0;
1936         if (mm >= 0) {
1937             if (mm < 16) {
1938                 res = nn << mm;
1939             }
1940         } else {
1941             res = nn >> (mm > -16 ? -mm : 15);
1942         }
1943         d[i] = res;
1944     }
1945     clear_tail(d, opr_sz, simd_maxsz(desc));
1946 }
1947 
1948 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1949 {
1950     intptr_t i, opr_sz = simd_oprsz(desc);
1951     uint8_t *d = vd, *n = vn, *m = vm;
1952 
1953     for (i = 0; i < opr_sz; ++i) {
1954         int8_t mm = m[i];
1955         uint8_t nn = n[i];
1956         uint8_t res = 0;
1957         if (mm >= 0) {
1958             if (mm < 8) {
1959                 res = nn << mm;
1960             }
1961         } else {
1962             if (mm > -8) {
1963                 res = nn >> -mm;
1964             }
1965         }
1966         d[i] = res;
1967     }
1968     clear_tail(d, opr_sz, simd_maxsz(desc));
1969 }
1970 
1971 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1972 {
1973     intptr_t i, opr_sz = simd_oprsz(desc);
1974     uint16_t *d = vd, *n = vn, *m = vm;
1975 
1976     for (i = 0; i < opr_sz / 2; ++i) {
1977         int8_t mm = m[i];   /* only 8 bits of shift are significant */
1978         uint16_t nn = n[i];
1979         uint16_t res = 0;
1980         if (mm >= 0) {
1981             if (mm < 16) {
1982                 res = nn << mm;
1983             }
1984         } else {
1985             if (mm > -16) {
1986                 res = nn >> -mm;
1987             }
1988         }
1989         d[i] = res;
1990     }
1991     clear_tail(d, opr_sz, simd_maxsz(desc));
1992 }
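
/*
 * In all four helpers above the shift count is a signed byte from m:
 * a non-negative count shifts left, giving 0 once it reaches the
 * element width; a negative count shifts right by -mm.  The signed
 * forms clamp an out-of-range right shift to esize - 1 so the result
 * is the sign fill (e.g. (int8_t)-1 >> 7 == -1), while the unsigned
 * forms produce 0 once the count reaches the width.
 */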
1993 
1994 /*
1995  * 8x8->8 polynomial multiply.
1996  *
1997  * Polynomial multiplication is like integer multiplication except the
1998  * partial products are XORed, not added.
1999  *
2000  * TODO: expose this as a generic vector operation, as it is a common
2001  * crypto building block.
2002  */
2003 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2004 {
2005     intptr_t i, opr_sz = simd_oprsz(desc);
2006     uint64_t *d = vd, *n = vn, *m = vm;
2007 
2008     for (i = 0; i < opr_sz / 8; ++i) {
2009         d[i] = clmul_8x8_low(n[i], m[i]);
2010     }
2011     clear_tail(d, opr_sz, simd_maxsz(desc));
2012 }
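
/*
 * A small worked example with arbitrary bytes: 0x05 * 0x03 over GF(2)
 * is (x^2 + 1)(x + 1) = x^3 + x^2 + x + 1, i.e. the partial products
 * (0x05 << 0) ^ (0x05 << 1) == 0x0f, matching the integer product; but
 * 0x03 * 0x03 gives 0x03 ^ 0x06 == 0x05 rather than 0x09, since the
 * partial products are XORed instead of added.
 */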
2013 
2014 /*
2015  * 64x64->128 polynomial multiply.
2016  * Because the lanes are not accessed in strict columns,
2017  * this probably cannot be turned into a generic helper.
2018  */
2019 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2020 {
2021     intptr_t i, opr_sz = simd_oprsz(desc);
2022     intptr_t hi = simd_data(desc);
2023     uint64_t *d = vd, *n = vn, *m = vm;
2024 
2025     for (i = 0; i < opr_sz / 8; i += 2) {
2026         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2027         d[i] = int128_getlo(r);
2028         d[i + 1] = int128_gethi(r);
2029     }
2030     clear_tail(d, opr_sz, simd_maxsz(desc));
2031 }
2032 
2033 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2034 {
2035     int hi = simd_data(desc);
2036     uint64_t *d = vd, *n = vn, *m = vm;
2037     uint64_t nn = n[hi], mm = m[hi];
2038 
2039     d[0] = clmul_8x4_packed(nn, mm);
2040     nn >>= 32;
2041     mm >>= 32;
2042     d[1] = clmul_8x4_packed(nn, mm);
2043 
2044     clear_tail(d, 16, simd_maxsz(desc));
2045 }
2046 
2047 #ifdef TARGET_AARCH64
2048 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2049 {
2050     int shift = simd_data(desc) * 8;
2051     intptr_t i, opr_sz = simd_oprsz(desc);
2052     uint64_t *d = vd, *n = vn, *m = vm;
2053 
2054     for (i = 0; i < opr_sz / 8; ++i) {
2055         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2056     }
2057 }
2058 
2059 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2060 {
2061     intptr_t sel = H4(simd_data(desc));
2062     intptr_t i, opr_sz = simd_oprsz(desc);
2063     uint32_t *n = vn, *m = vm;
2064     uint64_t *d = vd;
2065 
2066     for (i = 0; i < opr_sz / 8; ++i) {
2067         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2068     }
2069 }
2070 #endif
2071 
2072 #define DO_CMP0(NAME, TYPE, OP)                         \
2073 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2074 {                                                       \
2075     intptr_t i, opr_sz = simd_oprsz(desc);              \
2076     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2077         TYPE nn = *(TYPE *)(vn + i);                    \
2078         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2079     }                                                   \
2080     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2081 }
2082 
2083 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2084 DO_CMP0(gvec_clt0_b, int8_t, <)
2085 DO_CMP0(gvec_cle0_b, int8_t, <=)
2086 DO_CMP0(gvec_cgt0_b, int8_t, >)
2087 DO_CMP0(gvec_cge0_b, int8_t, >=)
2088 
2089 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2090 DO_CMP0(gvec_clt0_h, int16_t, <)
2091 DO_CMP0(gvec_cle0_h, int16_t, <=)
2092 DO_CMP0(gvec_cgt0_h, int16_t, >)
2093 DO_CMP0(gvec_cge0_h, int16_t, >=)
2094 
2095 #undef DO_CMP0
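
/*
 * The negation widens the 0/1 comparison result to the all-zeros or
 * all-ones mask the instruction produces: for gvec_clt0_b with
 * nn == -5, -(nn < 0) == -1 == 0xff; with nn == 5 it is 0x00.
 */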
2096 
2097 #define DO_ABD(NAME, TYPE)                                      \
2098 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2099 {                                                               \
2100     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2101     TYPE *d = vd, *n = vn, *m = vm;                             \
2102                                                                 \
2103     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2104         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2105     }                                                           \
2106     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2107 }
2108 
2109 DO_ABD(gvec_sabd_b, int8_t)
2110 DO_ABD(gvec_sabd_h, int16_t)
2111 DO_ABD(gvec_sabd_s, int32_t)
2112 DO_ABD(gvec_sabd_d, int64_t)
2113 
2114 DO_ABD(gvec_uabd_b, uint8_t)
2115 DO_ABD(gvec_uabd_h, uint16_t)
2116 DO_ABD(gvec_uabd_s, uint32_t)
2117 DO_ABD(gvec_uabd_d, uint64_t)
2118 
2119 #undef DO_ABD
2120 
2121 #define DO_ABA(NAME, TYPE)                                      \
2122 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2123 {                                                               \
2124     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2125     TYPE *d = vd, *n = vn, *m = vm;                             \
2126                                                                 \
2127     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2128         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2129     }                                                           \
2130     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2131 }
2132 
2133 DO_ABA(gvec_saba_b, int8_t)
2134 DO_ABA(gvec_saba_h, int16_t)
2135 DO_ABA(gvec_saba_s, int32_t)
2136 DO_ABA(gvec_saba_d, int64_t)
2137 
2138 DO_ABA(gvec_uaba_b, uint8_t)
2139 DO_ABA(gvec_uaba_h, uint16_t)
2140 DO_ABA(gvec_uaba_s, uint32_t)
2141 DO_ABA(gvec_uaba_d, uint64_t)
2142 
2143 #undef DO_ABA
2144 
2145 #define DO_NEON_PAIRWISE(NAME, OP)                                      \
2146     void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
2147                          void *stat, uint32_t oprsz)                    \
2148     {                                                                   \
2149         float_status *fpst = stat;                                      \
2150         float32 *d = vd;                                                \
2151         float32 *n = vn;                                                \
2152         float32 *m = vm;                                                \
2153         float32 r0, r1;                                                 \
2154                                                                         \
2155         /* Read all inputs before writing outputs in case vm == vd */   \
2156         r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
2157         r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
2158                                                                         \
2159         d[H4(0)] = r0;                                                  \
2160         d[H4(1)] = r1;                                                  \
2161     }                                                                   \
2162                                                                         \
2163     void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
2164                          void *stat, uint32_t oprsz)                    \
2165     {                                                                   \
2166         float_status *fpst = stat;                                      \
2167         float16 *d = vd;                                                \
2168         float16 *n = vn;                                                \
2169         float16 *m = vm;                                                \
2170         float16 r0, r1, r2, r3;                                         \
2171                                                                         \
2172         /* Read all inputs before writing outputs in case vm == vd */   \
2173         r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
2174         r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
2175         r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
2176         r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
2177                                                                         \
2178         d[H2(0)] = r0;                                                  \
2179         d[H2(1)] = r1;                                                  \
2180         d[H2(2)] = r2;                                                  \
2181         d[H2(3)] = r3;                                                  \
2182     }
2183 
2184 DO_NEON_PAIRWISE(neon_padd, add)
2185 DO_NEON_PAIRWISE(neon_pmax, max)
2186 DO_NEON_PAIRWISE(neon_pmin, min)
2187 
2188 #undef DO_NEON_PAIRWISE
2189 
2190 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2191     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2192     {                                                                   \
2193         intptr_t i, oprsz = simd_oprsz(desc);                           \
2194         int shift = simd_data(desc);                                    \
2195         TYPE *d = vd, *n = vn;                                          \
2196         float_status *fpst = stat;                                      \
2197         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2198             d[i] = FUNC(n[i], shift, fpst);                             \
2199         }                                                               \
2200         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2201     }
2202 
2203 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2204 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2205 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2206 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2207 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2208 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2209 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2210 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2211 
2212 #undef DO_VCVT_FIXED
2213 
2214 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2215     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2216     {                                                                   \
2217         float_status *fpst = stat;                                      \
2218         intptr_t i, oprsz = simd_oprsz(desc);                           \
2219         uint32_t rmode = simd_data(desc);                               \
2220         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2221         TYPE *d = vd, *n = vn;                                          \
2222         set_float_rounding_mode(rmode, fpst);                           \
2223         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2224             d[i] = FUNC(n[i], 0, fpst);                                 \
2225         }                                                               \
2226         set_float_rounding_mode(prev_rmode, fpst);                      \
2227         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2228     }
2229 
2230 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2231 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2232 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2233 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2234 
2235 #undef DO_VCVT_RMODE
2236 
2237 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2238     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2239     {                                                                   \
2240         float_status *fpst = stat;                                      \
2241         intptr_t i, oprsz = simd_oprsz(desc);                           \
2242         uint32_t rmode = simd_data(desc);                               \
2243         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2244         TYPE *d = vd, *n = vn;                                          \
2245         set_float_rounding_mode(rmode, fpst);                           \
2246         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2247             d[i] = FUNC(n[i], fpst);                                    \
2248         }                                                               \
2249         set_float_rounding_mode(prev_rmode, fpst);                      \
2250         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2251     }
2252 
2253 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2254 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2255 
2256 #undef DO_VRINT_RMODE
2257 
2258 #ifdef TARGET_AARCH64
2259 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2260 {
2261     const uint8_t *indices = vm;
2262     CPUARMState *env = venv;
2263     size_t oprsz = simd_oprsz(desc);
2264     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2265     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2266     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2267     union {
2268         uint8_t b[16];
2269         uint64_t d[2];
2270     } result;
2271 
2272     /*
2273      * We must construct the final result in a temp, lest the output
2274      * overlaps the input table.  For TBL, begin with zero; for TBX,
2275      * begin with the original register contents.  Note that we always
2276      * copy 16 bytes here to avoid an extra branch; clearing the high
2277      * bits of the register for oprsz == 8 is handled below.
2278      */
2279     if (is_tbx) {
2280         memcpy(&result, vd, 16);
2281     } else {
2282         memset(&result, 0, 16);
2283     }
2284 
2285     for (size_t i = 0; i < oprsz; ++i) {
2286         uint32_t index = indices[H1(i)];
2287 
2288         if (index < table_len) {
2289             /*
2290              * Convert index (a byte offset into the virtual table
2291              * which is a series of 128-bit vectors concatenated)
2292              * into the correct register element, bearing in mind
2293              * that the table can wrap around from V31 to V0.
2294              */
2295             const uint8_t *table = (const uint8_t *)
2296                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2297             result.b[H1(i)] = table[H1(index % 16)];
2298         }
2299     }
2300 
2301     memcpy(vd, &result, 16);
2302     clear_tail(vd, oprsz, simd_maxsz(desc));
2303 }
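
/*
 * Index decode illustrated with arbitrary values: for rn == 30,
 * table_len == 48 (a three-register table) and index == 37, the byte
 * comes from register (30 + (37 >> 4)) % 32 == 0, i.e. V0 after
 * wrapping, at byte 37 % 16 == 5.  An out-of-range index leaves the
 * TBX destination byte unchanged and zeroes the TBL one.
 */
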
2304 #endif
2305 
2306 /*
2307  * NxN -> N highpart multiply
2308  *
2309  * TODO: expose this as a generic vector operation.
2310  */
2311 
2312 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2313 {
2314     intptr_t i, opr_sz = simd_oprsz(desc);
2315     int8_t *d = vd, *n = vn, *m = vm;
2316 
2317     for (i = 0; i < opr_sz; ++i) {
2318         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2319     }
2320     clear_tail(d, opr_sz, simd_maxsz(desc));
2321 }
2322 
2323 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2324 {
2325     intptr_t i, opr_sz = simd_oprsz(desc);
2326     int16_t *d = vd, *n = vn, *m = vm;
2327 
2328     for (i = 0; i < opr_sz / 2; ++i) {
2329         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2330     }
2331     clear_tail(d, opr_sz, simd_maxsz(desc));
2332 }
2333 
2334 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2335 {
2336     intptr_t i, opr_sz = simd_oprsz(desc);
2337     int32_t *d = vd, *n = vn, *m = vm;
2338 
2339     for (i = 0; i < opr_sz / 4; ++i) {
2340         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2341     }
2342     clear_tail(d, opr_sz, simd_maxsz(desc));
2343 }
2344 
2345 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2346 {
2347     intptr_t i, opr_sz = simd_oprsz(desc);
2348     uint64_t *d = vd, *n = vn, *m = vm;
2349     uint64_t discard;
2350 
2351     for (i = 0; i < opr_sz / 8; ++i) {
2352         muls64(&discard, &d[i], n[i], m[i]);
2353     }
2354     clear_tail(d, opr_sz, simd_maxsz(desc));
2355 }
2356 
2357 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2358 {
2359     intptr_t i, opr_sz = simd_oprsz(desc);
2360     uint8_t *d = vd, *n = vn, *m = vm;
2361 
2362     for (i = 0; i < opr_sz; ++i) {
2363         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2364     }
2365     clear_tail(d, opr_sz, simd_maxsz(desc));
2366 }
2367 
2368 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2369 {
2370     intptr_t i, opr_sz = simd_oprsz(desc);
2371     uint16_t *d = vd, *n = vn, *m = vm;
2372 
2373     for (i = 0; i < opr_sz / 2; ++i) {
2374         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2375     }
2376     clear_tail(d, opr_sz, simd_maxsz(desc));
2377 }
2378 
2379 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2380 {
2381     intptr_t i, opr_sz = simd_oprsz(desc);
2382     uint32_t *d = vd, *n = vn, *m = vm;
2383 
2384     for (i = 0; i < opr_sz / 4; ++i) {
2385         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2386     }
2387     clear_tail(d, opr_sz, simd_maxsz(desc));
2388 }
2389 
2390 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2391 {
2392     intptr_t i, opr_sz = simd_oprsz(desc);
2393     uint64_t *d = vd, *n = vn, *m = vm;
2394     uint64_t discard;
2395 
2396     for (i = 0; i < opr_sz / 8; ++i) {
2397         mulu64(&discard, &d[i], n[i], m[i]);
2398     }
2399     clear_tail(d, opr_sz, simd_maxsz(desc));
2400 }
2401 
2402 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2403 {
2404     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2405     int shr = simd_data(desc);
2406     uint64_t *d = vd, *n = vn, *m = vm;
2407 
2408     for (i = 0; i < opr_sz; ++i) {
2409         d[i] = ror64(n[i] ^ m[i], shr);
2410     }
2411     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2412 }
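
/*
 * XAR illustrated: with shr == 8 and n[i] ^ m[i] == 0xff, the rotate
 * yields 0xff00000000000000.  This is the xor-and-rotate step of the
 * XAR instruction from the SHA3 extension.
 */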
2413 
2414 /*
2415  * Integer matrix-multiply accumulate
2416  */
2417 
2418 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2419 {
2420     int8_t *n = vn, *m = vm;
2421 
2422     for (intptr_t k = 0; k < 8; ++k) {
2423         sum += n[H1(k)] * m[H1(k)];
2424     }
2425     return sum;
2426 }
2427 
2428 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2429 {
2430     uint8_t *n = vn, *m = vm;
2431 
2432     for (intptr_t k = 0; k < 8; ++k) {
2433         sum += n[H1(k)] * m[H1(k)];
2434     }
2435     return sum;
2436 }
2437 
2438 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2439 {
2440     uint8_t *n = vn;
2441     int8_t *m = vm;
2442 
2443     for (intptr_t k = 0; k < 8; ++k) {
2444         sum += n[H1(k)] * m[H1(k)];
2445     }
2446     return sum;
2447 }
2448 
2449 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2450                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2451 {
2452     intptr_t seg, opr_sz = simd_oprsz(desc);
2453 
2454     for (seg = 0; seg < opr_sz; seg += 16) {
2455         uint32_t *d = vd + seg;
2456         uint32_t *a = va + seg;
2457         uint32_t sum0, sum1, sum2, sum3;
2458 
2459         /*
2460          * Process the entire segment at once, writing back the
2461          * results only after we've consumed all of the inputs.
2462          * The constants decompose as 2*i + j for the d and a indices
2463          * and 8*i / 8*j for the n and m byte offsets, so sum(i,j)
2464          * accumulates row i of n against row j of m.
2465          */
2466         sum0 = a[H4(0 + 0)];
2467         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2468         sum1 = a[H4(0 + 1)];
2469         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2470         sum2 = a[H4(2 + 0)];
2471         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2472         sum3 = a[H4(2 + 1)];
2473         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2474 
2475         d[H4(0)] = sum0;
2476         d[H4(1)] = sum1;
2477         d[H4(2)] = sum2;
2478         d[H4(3)] = sum3;
2479     }
2480     clear_tail(vd, opr_sz, simd_maxsz(desc));
2481 }
2482 
2483 #define DO_MMLA_B(NAME, INNER) \
2484     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2485     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2486 
2487 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2488 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2489 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2490 
2491 /*
2492  * BFloat16 Dot Product
2493  */
2494 
2495 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2496 {
2497     /* FPCR is ignored for BFDOT and BFMMLA. */
2498     float_status bf_status = {
2499         .tininess_before_rounding = float_tininess_before_rounding,
2500         .float_rounding_mode = float_round_to_odd_inf,
2501         .flush_to_zero = true,
2502         .flush_inputs_to_zero = true,
2503         .default_nan_mode = true,
2504     };
2505     float32 t1, t2;
2506 
2507     /*
2508      * Extract each BFloat16 from the element pair, and shift
2509      * them such that they become float32.
2510      */
2511     t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2512     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2513     t1 = float32_add(t1, t2, &bf_status);
2514     t1 = float32_add(sum, t1, &bf_status);
2515 
2516     return t1;
2517 }
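
/*
 * Each uint32_t operand packs two bfloat16 values; since bfloat16 is
 * exactly the high 16 bits of a float32, "e1 << 16" turns the low
 * element into a float32 and "e1 & 0xffff0000u" the high one, without
 * any rounding.  E.g. a low half of 0x3f80 (bfloat16 1.0) becomes the
 * float32 0x3f800000 (1.0f).
 */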
2518 
2519 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2520 {
2521     intptr_t i, opr_sz = simd_oprsz(desc);
2522     float32 *d = vd, *a = va;
2523     uint32_t *n = vn, *m = vm;
2524 
2525     for (i = 0; i < opr_sz / 4; ++i) {
2526         d[i] = bfdotadd(a[i], n[i], m[i]);
2527     }
2528     clear_tail(d, opr_sz, simd_maxsz(desc));
2529 }
2530 
2531 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2532                             void *va, uint32_t desc)
2533 {
2534     intptr_t i, j, opr_sz = simd_oprsz(desc);
2535     intptr_t index = simd_data(desc);
2536     intptr_t elements = opr_sz / 4;
2537     intptr_t eltspersegment = MIN(16 / 4, elements);
2538     float32 *d = vd, *a = va;
2539     uint32_t *n = vn, *m = vm;
2540 
2541     for (i = 0; i < elements; i += eltspersegment) {
2542         uint32_t m_idx = m[i + H4(index)];
2543 
2544         for (j = i; j < i + eltspersegment; j++) {
2545             d[j] = bfdotadd(a[j], n[j], m_idx);
2546         }
2547     }
2548     clear_tail(d, opr_sz, simd_maxsz(desc));
2549 }
2550 
2551 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2552 {
2553     intptr_t s, opr_sz = simd_oprsz(desc);
2554     float32 *d = vd, *a = va;
2555     uint32_t *n = vn, *m = vm;
2556 
2557     for (s = 0; s < opr_sz / 4; s += 4) {
2558         float32 sum00, sum01, sum10, sum11;
2559 
2560         /*
2561          * Process the entire segment at once, writing back the
2562          * results only after we've consumed all of the inputs.
2563          * The constants decompose as 2*i + j for the d and a indices,
2564          * 2*i + k for n and 2*j + k for m, so sum(i,j) accumulates
2565          * the two BF16-pair dot products over k.
2566          */
2567         sum00 = a[s + H4(0 + 0)];
2568         sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2569         sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2570 
2571         sum01 = a[s + H4(0 + 1)];
2572         sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2573         sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2574 
2575         sum10 = a[s + H4(2 + 0)];
2576         sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2577         sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2578 
2579         sum11 = a[s + H4(2 + 1)];
2580         sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2581         sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2582 
2583         d[s + H4(0 + 0)] = sum00;
2584         d[s + H4(0 + 1)] = sum01;
2585         d[s + H4(2 + 0)] = sum10;
2586         d[s + H4(2 + 1)] = sum11;
2587     }
2588     clear_tail(d, opr_sz, simd_maxsz(desc));
2589 }
2590 
2591 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2592                          void *stat, uint32_t desc)
2593 {
2594     intptr_t i, opr_sz = simd_oprsz(desc);
2595     intptr_t sel = simd_data(desc);
2596     float32 *d = vd, *a = va;
2597     bfloat16 *n = vn, *m = vm;
2598 
2599     for (i = 0; i < opr_sz / 4; ++i) {
2600         float32 nn = n[H2(i * 2 + sel)] << 16;
2601         float32 mm = m[H2(i * 2 + sel)] << 16;
2602         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2603     }
2604     clear_tail(d, opr_sz, simd_maxsz(desc));
2605 }
2606 
2607 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2608                              void *va, void *stat, uint32_t desc)
2609 {
2610     intptr_t i, j, opr_sz = simd_oprsz(desc);
2611     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2612     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2613     intptr_t elements = opr_sz / 4;
2614     intptr_t eltspersegment = MIN(16 / 4, elements);
2615     float32 *d = vd, *a = va;
2616     bfloat16 *n = vn, *m = vm;
2617 
2618     for (i = 0; i < elements; i += eltspersegment) {
2619         float32 m_idx = m[H2(2 * i + index)] << 16;
2620 
2621         for (j = i; j < i + eltspersegment; j++) {
2622             float32 n_j = n[H2(2 * j + sel)] << 16;
2623             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2624         }
2625     }
2626     clear_tail(d, opr_sz, simd_maxsz(desc));
2627 }
2628 
2629 #define DO_CLAMP(NAME, TYPE) \
2630 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
2631 {                                                                       \
2632     intptr_t i, opr_sz = simd_oprsz(desc);                              \
2633     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
2634         TYPE aa = *(TYPE *)(a + i);                                     \
2635         TYPE nn = *(TYPE *)(n + i);                                     \
2636         TYPE mm = *(TYPE *)(m + i);                                     \
2637         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
2638         *(TYPE *)(d + i) = dd;                                          \
2639     }                                                                   \
2640     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
2641 }
2642 
2643 DO_CLAMP(gvec_sclamp_b, int8_t)
2644 DO_CLAMP(gvec_sclamp_h, int16_t)
2645 DO_CLAMP(gvec_sclamp_s, int32_t)
2646 DO_CLAMP(gvec_sclamp_d, int64_t)
2647 
2648 DO_CLAMP(gvec_uclamp_b, uint8_t)
2649 DO_CLAMP(gvec_uclamp_h, uint16_t)
2650 DO_CLAMP(gvec_uclamp_s, uint32_t)
2651 DO_CLAMP(gvec_uclamp_d, uint64_t)
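
/*
 * CLAMP illustrated with arbitrary values: gvec_sclamp_b with
 * aa == 100, nn == -10, mm == 20 computes MIN(MAX(100, -10), 20) == 20,
 * i.e. each element of va is clamped into the range [nn, mm].
 */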
2652