xref: /openbmc/qemu/target/arm/tcg/vec_helper.c (revision bb509d94)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "vec_internal.h"
27 
/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 * Entry i has byte j set to 0xff exactly when bit j of i is set, i.e. it
 * holds the same values the following program would print:
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 *
 * The table is built at compile time from integer constant expressions.
 */
#define PRED_B_LANE(i, j) \
    ((((i) >> (j)) & 1) ? 0xffull << ((j) * 8) : 0ull)
#define PRED_B_1(i) \
    (PRED_B_LANE(i, 0) | PRED_B_LANE(i, 1) | \
     PRED_B_LANE(i, 2) | PRED_B_LANE(i, 3) | \
     PRED_B_LANE(i, 4) | PRED_B_LANE(i, 5) | \
     PRED_B_LANE(i, 6) | PRED_B_LANE(i, 7))
#define PRED_B_4(i) \
    PRED_B_1(i), PRED_B_1((i) + 1), PRED_B_1((i) + 2), PRED_B_1((i) + 3)
#define PRED_B_16(i) \
    PRED_B_4(i), PRED_B_4((i) + 4), PRED_B_4((i) + 8), PRED_B_4((i) + 12)
#define PRED_B_64(i) \
    PRED_B_16(i), PRED_B_16((i) + 16), PRED_B_16((i) + 32), PRED_B_16((i) + 48)

const uint64_t expand_pred_b_data[256] = {
    PRED_B_64(0), PRED_B_64(64), PRED_B_64(128), PRED_B_64(192),
};

#undef PRED_B_64
#undef PRED_B_16
#undef PRED_B_4
#undef PRED_B_1
#undef PRED_B_LANE
129 
/*
 * Similarly for half-word elements.  Only indices with no bit of 0xaa set
 * are populated; every other slot is left zero-initialized.  The populated
 * entries match the output of:
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
#define PRED_H_LANE(i, k) \
    ((((i) >> (2 * (k))) & 1) ? 0xffffull << (16 * (k)) : 0ull)
#define PRED_H(i) \
    [i] = (PRED_H_LANE(i, 0) | PRED_H_LANE(i, 1) | \
           PRED_H_LANE(i, 2) | PRED_H_LANE(i, 3))

const uint64_t expand_pred_h_data[0x55 + 1] = {
    PRED_H(0x01), PRED_H(0x04), PRED_H(0x05), PRED_H(0x10),
    PRED_H(0x11), PRED_H(0x14), PRED_H(0x15), PRED_H(0x40),
    PRED_H(0x41), PRED_H(0x44), PRED_H(0x45), PRED_H(0x50),
    PRED_H(0x51), PRED_H(0x54), PRED_H(0x55),
};

#undef PRED_H
#undef PRED_H_LANE
155 
/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * The architectural form
     *   ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * is evaluated with every term halved:
     *   ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t acc = (int32_t)src1 * src2;

    acc = neg ? -acc : acc;
    acc += (int32_t)src3 * (1 << 7) + (round ? 1 << 6 : 0);
    acc >>= 7;

    /* Clamp to the int8_t range. */
    if (acc > INT8_MAX) {
        return INT8_MAX;
    }
    if (acc < INT8_MIN) {
        return INT8_MIN;
    }
    return acc;
}
177 
178 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
179                              void *va, uint32_t desc)
180 {
181     intptr_t i, opr_sz = simd_oprsz(desc);
182     int8_t *d = vd, *n = vn, *m = vm, *a = va;
183 
184     for (i = 0; i < opr_sz; ++i) {
185         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
186     }
187 }
188 
189 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
190                              void *va, uint32_t desc)
191 {
192     intptr_t i, opr_sz = simd_oprsz(desc);
193     int8_t *d = vd, *n = vn, *m = vm, *a = va;
194 
195     for (i = 0; i < opr_sz; ++i) {
196         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
197     }
198 }
199 
200 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
201 {
202     intptr_t i, opr_sz = simd_oprsz(desc);
203     int8_t *d = vd, *n = vn, *m = vm;
204 
205     for (i = 0; i < opr_sz; ++i) {
206         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
207     }
208 }
209 
210 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
211 {
212     intptr_t i, opr_sz = simd_oprsz(desc);
213     int8_t *d = vd, *n = vn, *m = vm;
214 
215     for (i = 0; i < opr_sz; ++i) {
216         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
217     }
218 }
219 
/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Same halved-term evaluation as do_sqrdmlah_b, with a shift of 15. */
    int32_t acc = (int32_t)src1 * src2;

    acc = neg ? -acc : acc;
    acc += (int32_t)src3 * (1 << 15) + (round ? 1 << 14 : 0);
    acc >>= 15;

    /* On saturation *sat is set to 1; it is never cleared here. */
    if (acc > INT16_MAX) {
        *sat = 1;
        return INT16_MAX;
    }
    if (acc < INT16_MIN) {
        *sat = 1;
        return INT16_MIN;
    }
    return acc;
}
238 
239 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
240                                   uint32_t src2, uint32_t src3)
241 {
242     uint32_t *sat = &env->vfp.qc[0];
243     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
244     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
245                                 false, true, sat);
246     return deposit32(e1, 16, 16, e2);
247 }
248 
249 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
250                               void *vq, uint32_t desc)
251 {
252     uintptr_t opr_sz = simd_oprsz(desc);
253     int16_t *d = vd;
254     int16_t *n = vn;
255     int16_t *m = vm;
256     uintptr_t i;
257 
258     for (i = 0; i < opr_sz / 2; ++i) {
259         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
260     }
261     clear_tail(d, opr_sz, simd_maxsz(desc));
262 }
263 
264 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
265                                   uint32_t src2, uint32_t src3)
266 {
267     uint32_t *sat = &env->vfp.qc[0];
268     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
269     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
270                                 true, true, sat);
271     return deposit32(e1, 16, 16, e2);
272 }
273 
274 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
275                               void *vq, uint32_t desc)
276 {
277     uintptr_t opr_sz = simd_oprsz(desc);
278     int16_t *d = vd;
279     int16_t *n = vn;
280     int16_t *m = vm;
281     uintptr_t i;
282 
283     for (i = 0; i < opr_sz / 2; ++i) {
284         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
285     }
286     clear_tail(d, opr_sz, simd_maxsz(desc));
287 }
288 
289 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
290                             void *vq, uint32_t desc)
291 {
292     intptr_t i, opr_sz = simd_oprsz(desc);
293     int16_t *d = vd, *n = vn, *m = vm;
294 
295     for (i = 0; i < opr_sz / 2; ++i) {
296         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
297     }
298     clear_tail(d, opr_sz, simd_maxsz(desc));
299 }
300 
301 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
302                              void *vq, uint32_t desc)
303 {
304     intptr_t i, opr_sz = simd_oprsz(desc);
305     int16_t *d = vd, *n = vn, *m = vm;
306 
307     for (i = 0; i < opr_sz / 2; ++i) {
308         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
309     }
310     clear_tail(d, opr_sz, simd_maxsz(desc));
311 }
312 
313 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
314                              void *va, uint32_t desc)
315 {
316     intptr_t i, opr_sz = simd_oprsz(desc);
317     int16_t *d = vd, *n = vn, *m = vm, *a = va;
318     uint32_t discard;
319 
320     for (i = 0; i < opr_sz / 2; ++i) {
321         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
322     }
323 }
324 
325 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
326                              void *va, uint32_t desc)
327 {
328     intptr_t i, opr_sz = simd_oprsz(desc);
329     int16_t *d = vd, *n = vn, *m = vm, *a = va;
330     uint32_t discard;
331 
332     for (i = 0; i < opr_sz / 2; ++i) {
333         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
334     }
335 }
336 
337 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
338 {
339     intptr_t i, opr_sz = simd_oprsz(desc);
340     int16_t *d = vd, *n = vn, *m = vm;
341     uint32_t discard;
342 
343     for (i = 0; i < opr_sz / 2; ++i) {
344         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
345     }
346 }
347 
348 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
349 {
350     intptr_t i, opr_sz = simd_oprsz(desc);
351     int16_t *d = vd, *n = vn, *m = vm;
352     uint32_t discard;
353 
354     for (i = 0; i < opr_sz / 2; ++i) {
355         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
356     }
357 }
358 
359 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
360 {
361     intptr_t i, j, opr_sz = simd_oprsz(desc);
362     int idx = simd_data(desc);
363     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
364     uint32_t discard;
365 
366     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
367         int16_t mm = m[i];
368         for (j = 0; j < 16 / 2; ++j) {
369             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
370         }
371     }
372 }
373 
374 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
375 {
376     intptr_t i, j, opr_sz = simd_oprsz(desc);
377     int idx = simd_data(desc);
378     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
379     uint32_t discard;
380 
381     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
382         int16_t mm = m[i];
383         for (j = 0; j < 16 / 2; ++j) {
384             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
385         }
386     }
387 }
388 
/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Same halved-term evaluation as do_sqrdmlah_b, with a shift of 31. */
    int64_t acc = (int64_t)src1 * src2;

    acc = neg ? -acc : acc;
    acc += (int64_t)src3 * ((int64_t)1 << 31) + (round ? 1 << 30 : 0);
    acc >>= 31;

    /* On saturation *sat is set to 1; it is never cleared here. */
    if (acc > INT32_MAX) {
        *sat = 1;
        return INT32_MAX;
    }
    if (acc < INT32_MIN) {
        *sat = 1;
        return INT32_MIN;
    }
    return acc;
}
407 
408 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
409                                   int32_t src2, int32_t src3)
410 {
411     uint32_t *sat = &env->vfp.qc[0];
412     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
413 }
414 
415 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
416                               void *vq, uint32_t desc)
417 {
418     uintptr_t opr_sz = simd_oprsz(desc);
419     int32_t *d = vd;
420     int32_t *n = vn;
421     int32_t *m = vm;
422     uintptr_t i;
423 
424     for (i = 0; i < opr_sz / 4; ++i) {
425         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
426     }
427     clear_tail(d, opr_sz, simd_maxsz(desc));
428 }
429 
430 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
431                                   int32_t src2, int32_t src3)
432 {
433     uint32_t *sat = &env->vfp.qc[0];
434     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
435 }
436 
437 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
438                               void *vq, uint32_t desc)
439 {
440     uintptr_t opr_sz = simd_oprsz(desc);
441     int32_t *d = vd;
442     int32_t *n = vn;
443     int32_t *m = vm;
444     uintptr_t i;
445 
446     for (i = 0; i < opr_sz / 4; ++i) {
447         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
448     }
449     clear_tail(d, opr_sz, simd_maxsz(desc));
450 }
451 
452 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
453                             void *vq, uint32_t desc)
454 {
455     intptr_t i, opr_sz = simd_oprsz(desc);
456     int32_t *d = vd, *n = vn, *m = vm;
457 
458     for (i = 0; i < opr_sz / 4; ++i) {
459         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
460     }
461     clear_tail(d, opr_sz, simd_maxsz(desc));
462 }
463 
464 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
465                              void *vq, uint32_t desc)
466 {
467     intptr_t i, opr_sz = simd_oprsz(desc);
468     int32_t *d = vd, *n = vn, *m = vm;
469 
470     for (i = 0; i < opr_sz / 4; ++i) {
471         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
472     }
473     clear_tail(d, opr_sz, simd_maxsz(desc));
474 }
475 
476 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
477                              void *va, uint32_t desc)
478 {
479     intptr_t i, opr_sz = simd_oprsz(desc);
480     int32_t *d = vd, *n = vn, *m = vm, *a = va;
481     uint32_t discard;
482 
483     for (i = 0; i < opr_sz / 4; ++i) {
484         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
485     }
486 }
487 
488 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
489                              void *va, uint32_t desc)
490 {
491     intptr_t i, opr_sz = simd_oprsz(desc);
492     int32_t *d = vd, *n = vn, *m = vm, *a = va;
493     uint32_t discard;
494 
495     for (i = 0; i < opr_sz / 4; ++i) {
496         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
497     }
498 }
499 
500 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
501 {
502     intptr_t i, opr_sz = simd_oprsz(desc);
503     int32_t *d = vd, *n = vn, *m = vm;
504     uint32_t discard;
505 
506     for (i = 0; i < opr_sz / 4; ++i) {
507         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
508     }
509 }
510 
511 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
512 {
513     intptr_t i, opr_sz = simd_oprsz(desc);
514     int32_t *d = vd, *n = vn, *m = vm;
515     uint32_t discard;
516 
517     for (i = 0; i < opr_sz / 4; ++i) {
518         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
519     }
520 }
521 
522 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
523 {
524     intptr_t i, j, opr_sz = simd_oprsz(desc);
525     int idx = simd_data(desc);
526     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
527     uint32_t discard;
528 
529     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
530         int32_t mm = m[i];
531         for (j = 0; j < 16 / 4; ++j) {
532             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
533         }
534     }
535 }
536 
537 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
538 {
539     intptr_t i, j, opr_sz = simd_oprsz(desc);
540     int idx = simd_data(desc);
541     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
542     uint32_t discard;
543 
544     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
545         int32_t mm = m[i];
546         for (j = 0; j < 16 / 4; ++j) {
547             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
548         }
549     }
550 }
551 
552 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
553 static int64_t do_sat128_d(Int128 r)
554 {
555     int64_t ls = int128_getlo(r);
556     int64_t hs = int128_gethi(r);
557 
558     if (unlikely(hs != (ls >> 63))) {
559         return hs < 0 ? INT64_MIN : INT64_MAX;
560     }
561     return ls;
562 }
563 
564 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
565 {
566     uint64_t l, h;
567     Int128 r, t;
568 
569     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
570     muls64(&l, &h, m, n);
571     r = int128_make128(l, h);
572     if (neg) {
573         r = int128_neg(r);
574     }
575     if (a) {
576         t = int128_exts64(a);
577         t = int128_lshift(t, 63);
578         r = int128_add(r, t);
579     }
580     if (round) {
581         t = int128_exts64(1ll << 62);
582         r = int128_add(r, t);
583     }
584     r = int128_rshift(r, 63);
585 
586     return do_sat128_d(r);
587 }
588 
589 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
590                              void *va, uint32_t desc)
591 {
592     intptr_t i, opr_sz = simd_oprsz(desc);
593     int64_t *d = vd, *n = vn, *m = vm, *a = va;
594 
595     for (i = 0; i < opr_sz / 8; ++i) {
596         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
597     }
598 }
599 
600 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
601                              void *va, uint32_t desc)
602 {
603     intptr_t i, opr_sz = simd_oprsz(desc);
604     int64_t *d = vd, *n = vn, *m = vm, *a = va;
605 
606     for (i = 0; i < opr_sz / 8; ++i) {
607         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
608     }
609 }
610 
611 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
612 {
613     intptr_t i, opr_sz = simd_oprsz(desc);
614     int64_t *d = vd, *n = vn, *m = vm;
615 
616     for (i = 0; i < opr_sz / 8; ++i) {
617         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
618     }
619 }
620 
621 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
622 {
623     intptr_t i, opr_sz = simd_oprsz(desc);
624     int64_t *d = vd, *n = vn, *m = vm;
625 
626     for (i = 0; i < opr_sz / 8; ++i) {
627         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
628     }
629 }
630 
631 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
632 {
633     intptr_t i, j, opr_sz = simd_oprsz(desc);
634     int idx = simd_data(desc);
635     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
636 
637     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
638         int64_t mm = m[i];
639         for (j = 0; j < 16 / 8; ++j) {
640             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
641         }
642     }
643 }
644 
645 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, j, opr_sz = simd_oprsz(desc);
648     int idx = simd_data(desc);
649     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
650 
651     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
652         int64_t mm = m[i];
653         for (j = 0; j < 16 / 8; ++j) {
654             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
655         }
656     }
657 }
658 
659 /* Integer 8 and 16-bit dot-product.
660  *
661  * Note that for the loops herein, host endianness does not matter
662  * with respect to the ordering of data within the quad-width lanes.
663  * All elements are treated equally, no matter where they are.
664  */
665 
666 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
667 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
668 {                                                                         \
669     intptr_t i, opr_sz = simd_oprsz(desc);                                \
670     TYPED *d = vd, *a = va;                                               \
671     TYPEN *n = vn;                                                        \
672     TYPEM *m = vm;                                                        \
673     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
674         d[i] = (a[i] +                                                    \
675                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
676                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
677                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
678                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
679     }                                                                     \
680     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
681 }
682 
683 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
684 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
685 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
686 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
687 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
688 
689 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
690 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
691 {                                                                         \
692     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
693     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
694     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
695     intptr_t index = simd_data(desc);                                     \
696     TYPED *d = vd, *a = va;                                               \
697     TYPEN *n = vn;                                                        \
698     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
699     do {                                                                  \
700         TYPED m0 = m_indexed[i * 4 + 0];                                  \
701         TYPED m1 = m_indexed[i * 4 + 1];                                  \
702         TYPED m2 = m_indexed[i * 4 + 2];                                  \
703         TYPED m3 = m_indexed[i * 4 + 3];                                  \
704         do {                                                              \
705             d[i] = (a[i] +                                                \
706                     n[i * 4 + 0] * m0 +                                   \
707                     n[i * 4 + 1] * m1 +                                   \
708                     n[i * 4 + 2] * m2 +                                   \
709                     n[i * 4 + 3] * m3);                                   \
710         } while (++i < segend);                                           \
711         segend = i + 4;                                                   \
712     } while (i < opr_sz_n);                                               \
713     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
714 }
715 
716 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
717 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
718 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
719 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
720 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
721 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
722 
723 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
724                          void *vfpst, uint32_t desc)
725 {
726     uintptr_t opr_sz = simd_oprsz(desc);
727     float16 *d = vd;
728     float16 *n = vn;
729     float16 *m = vm;
730     float_status *fpst = vfpst;
731     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
732     uint32_t neg_imag = neg_real ^ 1;
733     uintptr_t i;
734 
735     /* Shift boolean to the sign bit so we can xor to negate.  */
736     neg_real <<= 15;
737     neg_imag <<= 15;
738 
739     for (i = 0; i < opr_sz / 2; i += 2) {
740         float16 e0 = n[H2(i)];
741         float16 e1 = m[H2(i + 1)] ^ neg_imag;
742         float16 e2 = n[H2(i + 1)];
743         float16 e3 = m[H2(i)] ^ neg_real;
744 
745         d[H2(i)] = float16_add(e0, e1, fpst);
746         d[H2(i + 1)] = float16_add(e2, e3, fpst);
747     }
748     clear_tail(d, opr_sz, simd_maxsz(desc));
749 }
750 
751 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
752                          void *vfpst, uint32_t desc)
753 {
754     uintptr_t opr_sz = simd_oprsz(desc);
755     float32 *d = vd;
756     float32 *n = vn;
757     float32 *m = vm;
758     float_status *fpst = vfpst;
759     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
760     uint32_t neg_imag = neg_real ^ 1;
761     uintptr_t i;
762 
763     /* Shift boolean to the sign bit so we can xor to negate.  */
764     neg_real <<= 31;
765     neg_imag <<= 31;
766 
767     for (i = 0; i < opr_sz / 4; i += 2) {
768         float32 e0 = n[H4(i)];
769         float32 e1 = m[H4(i + 1)] ^ neg_imag;
770         float32 e2 = n[H4(i + 1)];
771         float32 e3 = m[H4(i)] ^ neg_real;
772 
773         d[H4(i)] = float32_add(e0, e1, fpst);
774         d[H4(i + 1)] = float32_add(e2, e3, fpst);
775     }
776     clear_tail(d, opr_sz, simd_maxsz(desc));
777 }
778 
779 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
780                          void *vfpst, uint32_t desc)
781 {
782     uintptr_t opr_sz = simd_oprsz(desc);
783     float64 *d = vd;
784     float64 *n = vn;
785     float64 *m = vm;
786     float_status *fpst = vfpst;
787     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
788     uint64_t neg_imag = neg_real ^ 1;
789     uintptr_t i;
790 
791     /* Shift boolean to the sign bit so we can xor to negate.  */
792     neg_real <<= 63;
793     neg_imag <<= 63;
794 
795     for (i = 0; i < opr_sz / 8; i += 2) {
796         float64 e0 = n[i];
797         float64 e1 = m[i + 1] ^ neg_imag;
798         float64 e2 = n[i + 1];
799         float64 e3 = m[i] ^ neg_real;
800 
801         d[i] = float64_add(e0, e1, fpst);
802         d[i + 1] = float64_add(e2, e3, fpst);
803     }
804     clear_tail(d, opr_sz, simd_maxsz(desc));
805 }
806 
807 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
808                          void *vfpst, uint32_t desc)
809 {
810     uintptr_t opr_sz = simd_oprsz(desc);
811     float16 *d = vd, *n = vn, *m = vm, *a = va;
812     float_status *fpst = vfpst;
813     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
814     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
815     uint32_t neg_real = flip ^ neg_imag;
816     uintptr_t i;
817 
818     /* Shift boolean to the sign bit so we can xor to negate.  */
819     neg_real <<= 15;
820     neg_imag <<= 15;
821 
822     for (i = 0; i < opr_sz / 2; i += 2) {
823         float16 e2 = n[H2(i + flip)];
824         float16 e1 = m[H2(i + flip)] ^ neg_real;
825         float16 e4 = e2;
826         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
827 
828         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
829         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
830     }
831     clear_tail(d, opr_sz, simd_maxsz(desc));
832 }
833 
834 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
835                              void *vfpst, uint32_t desc)
836 {
837     uintptr_t opr_sz = simd_oprsz(desc);
838     float16 *d = vd, *n = vn, *m = vm, *a = va;
839     float_status *fpst = vfpst;
840     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
841     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
842     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
843     uint32_t neg_real = flip ^ neg_imag;
844     intptr_t elements = opr_sz / sizeof(float16);
845     intptr_t eltspersegment = 16 / sizeof(float16);
846     intptr_t i, j;
847 
848     /* Shift boolean to the sign bit so we can xor to negate.  */
849     neg_real <<= 15;
850     neg_imag <<= 15;
851 
852     for (i = 0; i < elements; i += eltspersegment) {
853         float16 mr = m[H2(i + 2 * index + 0)];
854         float16 mi = m[H2(i + 2 * index + 1)];
855         float16 e1 = neg_real ^ (flip ? mi : mr);
856         float16 e3 = neg_imag ^ (flip ? mr : mi);
857 
858         for (j = i; j < i + eltspersegment; j += 2) {
859             float16 e2 = n[H2(j + flip)];
860             float16 e4 = e2;
861 
862             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
863             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
864         }
865     }
866     clear_tail(d, opr_sz, simd_maxsz(desc));
867 }
868 
869 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
870                          void *vfpst, uint32_t desc)
871 {
872     uintptr_t opr_sz = simd_oprsz(desc);
873     float32 *d = vd, *n = vn, *m = vm, *a = va;
874     float_status *fpst = vfpst;
875     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
876     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
877     uint32_t neg_real = flip ^ neg_imag;
878     uintptr_t i;
879 
880     /* Shift boolean to the sign bit so we can xor to negate.  */
881     neg_real <<= 31;
882     neg_imag <<= 31;
883 
884     for (i = 0; i < opr_sz / 4; i += 2) {
885         float32 e2 = n[H4(i + flip)];
886         float32 e1 = m[H4(i + flip)] ^ neg_real;
887         float32 e4 = e2;
888         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
889 
890         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
891         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
892     }
893     clear_tail(d, opr_sz, simd_maxsz(desc));
894 }
895 
896 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
897                              void *vfpst, uint32_t desc)
898 {
899     uintptr_t opr_sz = simd_oprsz(desc);
900     float32 *d = vd, *n = vn, *m = vm, *a = va;
901     float_status *fpst = vfpst;
902     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
903     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
904     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
905     uint32_t neg_real = flip ^ neg_imag;
906     intptr_t elements = opr_sz / sizeof(float32);
907     intptr_t eltspersegment = 16 / sizeof(float32);
908     intptr_t i, j;
909 
910     /* Shift boolean to the sign bit so we can xor to negate.  */
911     neg_real <<= 31;
912     neg_imag <<= 31;
913 
914     for (i = 0; i < elements; i += eltspersegment) {
915         float32 mr = m[H4(i + 2 * index + 0)];
916         float32 mi = m[H4(i + 2 * index + 1)];
917         float32 e1 = neg_real ^ (flip ? mi : mr);
918         float32 e3 = neg_imag ^ (flip ? mr : mi);
919 
920         for (j = i; j < i + eltspersegment; j += 2) {
921             float32 e2 = n[H4(j + flip)];
922             float32 e4 = e2;
923 
924             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
925             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
926         }
927     }
928     clear_tail(d, opr_sz, simd_maxsz(desc));
929 }
930 
931 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
932                          void *vfpst, uint32_t desc)
933 {
934     uintptr_t opr_sz = simd_oprsz(desc);
935     float64 *d = vd, *n = vn, *m = vm, *a = va;
936     float_status *fpst = vfpst;
937     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
938     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
939     uint64_t neg_real = flip ^ neg_imag;
940     uintptr_t i;
941 
942     /* Shift boolean to the sign bit so we can xor to negate.  */
943     neg_real <<= 63;
944     neg_imag <<= 63;
945 
946     for (i = 0; i < opr_sz / 8; i += 2) {
947         float64 e2 = n[i + flip];
948         float64 e1 = m[i + flip] ^ neg_real;
949         float64 e4 = e2;
950         float64 e3 = m[i + 1 - flip] ^ neg_imag;
951 
952         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
953         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
954     }
955     clear_tail(d, opr_sz, simd_maxsz(desc));
956 }
957 
958 /*
959  * Floating point comparisons producing an integer result (all 1s or all 0s).
960  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
961  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
962  */
963 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
964 {
965     return -float16_eq_quiet(op1, op2, stat);
966 }
967 
968 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
969 {
970     return -float32_eq_quiet(op1, op2, stat);
971 }
972 
973 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
974 {
975     return -float16_le(op2, op1, stat);
976 }
977 
978 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
979 {
980     return -float32_le(op2, op1, stat);
981 }
982 
983 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
984 {
985     return -float16_lt(op2, op1, stat);
986 }
987 
988 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
989 {
990     return -float32_lt(op2, op1, stat);
991 }
992 
993 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
994 {
995     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
996 }
997 
998 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
999 {
1000     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1001 }
1002 
1003 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1004 {
1005     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1006 }
1007 
1008 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1009 {
1010     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1011 }
1012 
1013 static int16_t vfp_tosszh(float16 x, void *fpstp)
1014 {
1015     float_status *fpst = fpstp;
1016     if (float16_is_any_nan(x)) {
1017         float_raise(float_flag_invalid, fpst);
1018         return 0;
1019     }
1020     return float16_to_int16_round_to_zero(x, fpst);
1021 }
1022 
1023 static uint16_t vfp_touszh(float16 x, void *fpstp)
1024 {
1025     float_status *fpst = fpstp;
1026     if (float16_is_any_nan(x)) {
1027         float_raise(float_flag_invalid, fpst);
1028         return 0;
1029     }
1030     return float16_to_uint16_round_to_zero(x, fpst);
1031 }
1032 
1033 #define DO_2OP(NAME, FUNC, TYPE) \
1034 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1035 {                                                                 \
1036     intptr_t i, oprsz = simd_oprsz(desc);                         \
1037     TYPE *d = vd, *n = vn;                                        \
1038     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1039         d[i] = FUNC(n[i], stat);                                  \
1040     }                                                             \
1041     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1042 }
1043 
1044 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1045 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1046 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1047 
1048 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1049 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1050 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1051 
1052 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1053 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1054 
1055 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1056 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1057 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1058 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1059 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1060 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1061 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1062 DO_2OP(gvec_touszh, vfp_touszh, float16)
1063 
1064 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1065     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1066     {                                                           \
1067         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1068     }
1069 
1070 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1071     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
1072     {                                                           \
1073         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1074     }
1075 
1076 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1077     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1078     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1079     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1080     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1081 
1082 DO_2OP_CMP0(cgt, cgt, FWD)
1083 DO_2OP_CMP0(cge, cge, FWD)
1084 DO_2OP_CMP0(ceq, ceq, FWD)
1085 DO_2OP_CMP0(clt, cgt, REV)
1086 DO_2OP_CMP0(cle, cge, REV)
1087 
1088 #undef DO_2OP
1089 #undef DO_2OP_CMP0
1090 
1091 /* Floating-point trigonometric starting value.
1092  * See the ARM ARM pseudocode function FPTrigSMul.
1093  */
1094 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1095 {
1096     float16 result = float16_mul(op1, op1, stat);
1097     if (!float16_is_any_nan(result)) {
1098         result = float16_set_sign(result, op2 & 1);
1099     }
1100     return result;
1101 }
1102 
1103 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1104 {
1105     float32 result = float32_mul(op1, op1, stat);
1106     if (!float32_is_any_nan(result)) {
1107         result = float32_set_sign(result, op2 & 1);
1108     }
1109     return result;
1110 }
1111 
1112 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1113 {
1114     float64 result = float64_mul(op1, op1, stat);
1115     if (!float64_is_any_nan(result)) {
1116         result = float64_set_sign(result, op2 & 1);
1117     }
1118     return result;
1119 }
1120 
1121 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1122 {
1123     return float16_abs(float16_sub(op1, op2, stat));
1124 }
1125 
1126 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1127 {
1128     return float32_abs(float32_sub(op1, op2, stat));
1129 }
1130 
1131 /*
1132  * Reciprocal step. These are the AArch32 version which uses a
1133  * non-fused multiply-and-subtract.
1134  */
1135 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1136 {
1137     op1 = float16_squash_input_denormal(op1, stat);
1138     op2 = float16_squash_input_denormal(op2, stat);
1139 
1140     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1141         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1142         return float16_two;
1143     }
1144     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1145 }
1146 
1147 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1148 {
1149     op1 = float32_squash_input_denormal(op1, stat);
1150     op2 = float32_squash_input_denormal(op2, stat);
1151 
1152     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1153         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1154         return float32_two;
1155     }
1156     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1157 }
1158 
1159 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1160 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1161 {
1162     op1 = float16_squash_input_denormal(op1, stat);
1163     op2 = float16_squash_input_denormal(op2, stat);
1164 
1165     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1166         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1167         return float16_one_point_five;
1168     }
1169     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1170     return float16_div(op1, float16_two, stat);
1171 }
1172 
1173 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1174 {
1175     op1 = float32_squash_input_denormal(op1, stat);
1176     op2 = float32_squash_input_denormal(op2, stat);
1177 
1178     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1179         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1180         return float32_one_point_five;
1181     }
1182     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1183     return float32_div(op1, float32_two, stat);
1184 }
1185 
1186 #define DO_3OP(NAME, FUNC, TYPE) \
1187 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1188 {                                                                          \
1189     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1190     TYPE *d = vd, *n = vn, *m = vm;                                        \
1191     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1192         d[i] = FUNC(n[i], m[i], stat);                                     \
1193     }                                                                      \
1194     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1195 }
1196 
1197 DO_3OP(gvec_fadd_h, float16_add, float16)
1198 DO_3OP(gvec_fadd_s, float32_add, float32)
1199 DO_3OP(gvec_fadd_d, float64_add, float64)
1200 
1201 DO_3OP(gvec_fsub_h, float16_sub, float16)
1202 DO_3OP(gvec_fsub_s, float32_sub, float32)
1203 DO_3OP(gvec_fsub_d, float64_sub, float64)
1204 
1205 DO_3OP(gvec_fmul_h, float16_mul, float16)
1206 DO_3OP(gvec_fmul_s, float32_mul, float32)
1207 DO_3OP(gvec_fmul_d, float64_mul, float64)
1208 
1209 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1210 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1211 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1212 
1213 DO_3OP(gvec_fabd_h, float16_abd, float16)
1214 DO_3OP(gvec_fabd_s, float32_abd, float32)
1215 
1216 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1217 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1218 
1219 DO_3OP(gvec_fcge_h, float16_cge, float16)
1220 DO_3OP(gvec_fcge_s, float32_cge, float32)
1221 
1222 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1223 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1224 
1225 DO_3OP(gvec_facge_h, float16_acge, float16)
1226 DO_3OP(gvec_facge_s, float32_acge, float32)
1227 
1228 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1229 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1230 
1231 DO_3OP(gvec_fmax_h, float16_max, float16)
1232 DO_3OP(gvec_fmax_s, float32_max, float32)
1233 
1234 DO_3OP(gvec_fmin_h, float16_min, float16)
1235 DO_3OP(gvec_fmin_s, float32_min, float32)
1236 
1237 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1238 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1239 
1240 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1241 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1242 
1243 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1244 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1245 
1246 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1247 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1248 
1249 #ifdef TARGET_AARCH64
1250 
1251 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1252 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1253 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1254 
1255 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1256 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1257 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1258 
1259 #endif
1260 #undef DO_3OP
1261 
1262 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1263 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1264                                  float_status *stat)
1265 {
1266     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1267 }
1268 
1269 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1270                                  float_status *stat)
1271 {
1272     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1273 }
1274 
1275 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1276                                  float_status *stat)
1277 {
1278     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1279 }
1280 
1281 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1282                                  float_status *stat)
1283 {
1284     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1285 }
1286 
1287 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1288 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1289                                 float_status *stat)
1290 {
1291     return float16_muladd(op1, op2, dest, 0, stat);
1292 }
1293 
1294 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1295                                  float_status *stat)
1296 {
1297     return float32_muladd(op1, op2, dest, 0, stat);
1298 }
1299 
1300 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1301                                  float_status *stat)
1302 {
1303     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1304 }
1305 
1306 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1307                                  float_status *stat)
1308 {
1309     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1310 }
1311 
1312 #define DO_MULADD(NAME, FUNC, TYPE)                                     \
1313 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1314 {                                                                          \
1315     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1316     TYPE *d = vd, *n = vn, *m = vm;                                        \
1317     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1318         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1319     }                                                                      \
1320     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1321 }
1322 
1323 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1324 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1325 
1326 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1327 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1328 
1329 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1330 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1331 
1332 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1333 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1334 
1335 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1336  * For AdvSIMD, there is of course only one such vector segment.
1337  */
1338 
1339 #define DO_MUL_IDX(NAME, TYPE, H) \
1340 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1341 {                                                                          \
1342     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1343     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1344     intptr_t idx = simd_data(desc);                                        \
1345     TYPE *d = vd, *n = vn, *m = vm;                                        \
1346     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1347         TYPE mm = m[H(i + idx)];                                           \
1348         for (j = 0; j < segment; j++) {                                    \
1349             d[i + j] = n[i + j] * mm;                                      \
1350         }                                                                  \
1351     }                                                                      \
1352     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1353 }
1354 
1355 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1356 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1357 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1358 
1359 #undef DO_MUL_IDX
1360 
1361 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1362 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1363 {                                                                          \
1364     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1365     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1366     intptr_t idx = simd_data(desc);                                        \
1367     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1368     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1369         TYPE mm = m[H(i + idx)];                                           \
1370         for (j = 0; j < segment; j++) {                                    \
1371             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1372         }                                                                  \
1373     }                                                                      \
1374     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1375 }
1376 
1377 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1378 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1379 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1380 
1381 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1382 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1383 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1384 
1385 #undef DO_MLA_IDX
1386 
1387 #define DO_FMUL_IDX(NAME, ADD, TYPE, H)                                    \
1388 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1389 {                                                                          \
1390     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1391     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1392     intptr_t idx = simd_data(desc);                                        \
1393     TYPE *d = vd, *n = vn, *m = vm;                                        \
1394     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1395         TYPE mm = m[H(i + idx)];                                           \
1396         for (j = 0; j < segment; j++) {                                    \
1397             d[i + j] = TYPE##_##ADD(d[i + j],                              \
1398                                     TYPE##_mul(n[i + j], mm, stat), stat); \
1399         }                                                                  \
1400     }                                                                      \
1401     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1402 }
1403 
1404 #define float16_nop(N, M, S) (M)
1405 #define float32_nop(N, M, S) (M)
1406 #define float64_nop(N, M, S) (M)
1407 
1408 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
1409 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
1410 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
1411 
1412 /*
1413  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1414  * the fused ops below they assume accumulate both from and into Vd.
1415  */
1416 DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
1417 DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
1418 DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
1419 DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
1420 
1421 #undef float16_nop
1422 #undef float32_nop
1423 #undef float64_nop
1424 #undef DO_FMUL_IDX
1425 
1426 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1427 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1428                   void *stat, uint32_t desc)                               \
1429 {                                                                          \
1430     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1431     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1432     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1433     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1434     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1435     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1436     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1437         TYPE mm = m[H(i + idx)];                                           \
1438         for (j = 0; j < segment; j++) {                                    \
1439             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1440                                      mm, a[i + j], 0, stat);               \
1441         }                                                                  \
1442     }                                                                      \
1443     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1444 }
1445 
1446 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1447 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1448 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1449 
1450 #undef DO_FMLA_IDX
1451 
1452 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1453 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1454 {                                                                          \
1455     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1456     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1457     bool q = false;                                                        \
1458     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1459         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1460         if (dd < MIN) {                                                    \
1461             dd = MIN;                                                      \
1462             q = true;                                                      \
1463         } else if (dd > MAX) {                                             \
1464             dd = MAX;                                                      \
1465             q = true;                                                      \
1466         }                                                                  \
1467         d[i] = dd;                                                         \
1468     }                                                                      \
1469     if (q) {                                                               \
1470         uint32_t *qc = vq;                                                 \
1471         qc[0] = 1;                                                         \
1472     }                                                                      \
1473     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1474 }
1475 
1476 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1477 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1478 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1479 
1480 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1481 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1482 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1483 
1484 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1485 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1486 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1487 
1488 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1489 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1490 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1491 
1492 #undef DO_SAT
1493 
1494 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1495                           void *vm, uint32_t desc)
1496 {
1497     intptr_t i, oprsz = simd_oprsz(desc);
1498     uint64_t *d = vd, *n = vn, *m = vm;
1499     bool q = false;
1500 
1501     for (i = 0; i < oprsz / 8; i++) {
1502         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1503         if (dd < nn) {
1504             dd = UINT64_MAX;
1505             q = true;
1506         }
1507         d[i] = dd;
1508     }
1509     if (q) {
1510         uint32_t *qc = vq;
1511         qc[0] = 1;
1512     }
1513     clear_tail(d, oprsz, simd_maxsz(desc));
1514 }
1515 
1516 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1517                           void *vm, uint32_t desc)
1518 {
1519     intptr_t i, oprsz = simd_oprsz(desc);
1520     uint64_t *d = vd, *n = vn, *m = vm;
1521     bool q = false;
1522 
1523     for (i = 0; i < oprsz / 8; i++) {
1524         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1525         if (nn < mm) {
1526             dd = 0;
1527             q = true;
1528         }
1529         d[i] = dd;
1530     }
1531     if (q) {
1532         uint32_t *qc = vq;
1533         qc[0] = 1;
1534     }
1535     clear_tail(d, oprsz, simd_maxsz(desc));
1536 }
1537 
1538 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1539                           void *vm, uint32_t desc)
1540 {
1541     intptr_t i, oprsz = simd_oprsz(desc);
1542     int64_t *d = vd, *n = vn, *m = vm;
1543     bool q = false;
1544 
1545     for (i = 0; i < oprsz / 8; i++) {
1546         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1547         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1548             dd = (nn >> 63) ^ ~INT64_MIN;
1549             q = true;
1550         }
1551         d[i] = dd;
1552     }
1553     if (q) {
1554         uint32_t *qc = vq;
1555         qc[0] = 1;
1556     }
1557     clear_tail(d, oprsz, simd_maxsz(desc));
1558 }
1559 
1560 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1561                           void *vm, uint32_t desc)
1562 {
1563     intptr_t i, oprsz = simd_oprsz(desc);
1564     int64_t *d = vd, *n = vn, *m = vm;
1565     bool q = false;
1566 
1567     for (i = 0; i < oprsz / 8; i++) {
1568         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1569         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1570             dd = (nn >> 63) ^ ~INT64_MIN;
1571             q = true;
1572         }
1573         d[i] = dd;
1574     }
1575     if (q) {
1576         uint32_t *qc = vq;
1577         qc[0] = 1;
1578     }
1579     clear_tail(d, oprsz, simd_maxsz(desc));
1580 }
1581 
1582 
1583 #define DO_SRA(NAME, TYPE)                              \
1584 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1585 {                                                       \
1586     intptr_t i, oprsz = simd_oprsz(desc);               \
1587     int shift = simd_data(desc);                        \
1588     TYPE *d = vd, *n = vn;                              \
1589     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1590         d[i] += n[i] >> shift;                          \
1591     }                                                   \
1592     clear_tail(d, oprsz, simd_maxsz(desc));             \
1593 }
1594 
1595 DO_SRA(gvec_ssra_b, int8_t)
1596 DO_SRA(gvec_ssra_h, int16_t)
1597 DO_SRA(gvec_ssra_s, int32_t)
1598 DO_SRA(gvec_ssra_d, int64_t)
1599 
1600 DO_SRA(gvec_usra_b, uint8_t)
1601 DO_SRA(gvec_usra_h, uint16_t)
1602 DO_SRA(gvec_usra_s, uint32_t)
1603 DO_SRA(gvec_usra_d, uint64_t)
1604 
1605 #undef DO_SRA
1606 
1607 #define DO_RSHR(NAME, TYPE)                             \
1608 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1609 {                                                       \
1610     intptr_t i, oprsz = simd_oprsz(desc);               \
1611     int shift = simd_data(desc);                        \
1612     TYPE *d = vd, *n = vn;                              \
1613     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1614         TYPE tmp = n[i] >> (shift - 1);                 \
1615         d[i] = (tmp >> 1) + (tmp & 1);                  \
1616     }                                                   \
1617     clear_tail(d, oprsz, simd_maxsz(desc));             \
1618 }
1619 
1620 DO_RSHR(gvec_srshr_b, int8_t)
1621 DO_RSHR(gvec_srshr_h, int16_t)
1622 DO_RSHR(gvec_srshr_s, int32_t)
1623 DO_RSHR(gvec_srshr_d, int64_t)
1624 
1625 DO_RSHR(gvec_urshr_b, uint8_t)
1626 DO_RSHR(gvec_urshr_h, uint16_t)
1627 DO_RSHR(gvec_urshr_s, uint32_t)
1628 DO_RSHR(gvec_urshr_d, uint64_t)
1629 
1630 #undef DO_RSHR
1631 
1632 #define DO_RSRA(NAME, TYPE)                             \
1633 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1634 {                                                       \
1635     intptr_t i, oprsz = simd_oprsz(desc);               \
1636     int shift = simd_data(desc);                        \
1637     TYPE *d = vd, *n = vn;                              \
1638     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1639         TYPE tmp = n[i] >> (shift - 1);                 \
1640         d[i] += (tmp >> 1) + (tmp & 1);                 \
1641     }                                                   \
1642     clear_tail(d, oprsz, simd_maxsz(desc));             \
1643 }
1644 
1645 DO_RSRA(gvec_srsra_b, int8_t)
1646 DO_RSRA(gvec_srsra_h, int16_t)
1647 DO_RSRA(gvec_srsra_s, int32_t)
1648 DO_RSRA(gvec_srsra_d, int64_t)
1649 
1650 DO_RSRA(gvec_ursra_b, uint8_t)
1651 DO_RSRA(gvec_ursra_h, uint16_t)
1652 DO_RSRA(gvec_ursra_s, uint32_t)
1653 DO_RSRA(gvec_ursra_d, uint64_t)
1654 
1655 #undef DO_RSRA
1656 
1657 #define DO_SRI(NAME, TYPE)                              \
1658 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1659 {                                                       \
1660     intptr_t i, oprsz = simd_oprsz(desc);               \
1661     int shift = simd_data(desc);                        \
1662     TYPE *d = vd, *n = vn;                              \
1663     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1664         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1665     }                                                   \
1666     clear_tail(d, oprsz, simd_maxsz(desc));             \
1667 }
1668 
1669 DO_SRI(gvec_sri_b, uint8_t)
1670 DO_SRI(gvec_sri_h, uint16_t)
1671 DO_SRI(gvec_sri_s, uint32_t)
1672 DO_SRI(gvec_sri_d, uint64_t)
1673 
1674 #undef DO_SRI
1675 
1676 #define DO_SLI(NAME, TYPE)                              \
1677 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1678 {                                                       \
1679     intptr_t i, oprsz = simd_oprsz(desc);               \
1680     int shift = simd_data(desc);                        \
1681     TYPE *d = vd, *n = vn;                              \
1682     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1683         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1684     }                                                   \
1685     clear_tail(d, oprsz, simd_maxsz(desc));             \
1686 }
1687 
1688 DO_SLI(gvec_sli_b, uint8_t)
1689 DO_SLI(gvec_sli_h, uint16_t)
1690 DO_SLI(gvec_sli_s, uint32_t)
1691 DO_SLI(gvec_sli_d, uint64_t)
1692 
1693 #undef DO_SLI
1694 
1695 /*
1696  * Convert float16 to float32, raising no exceptions and
1697  * preserving exceptional values, including SNaN.
1698  * This is effectively an unpack+repack operation.
1699  */
1700 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1701 {
1702     const int f16_bias = 15;
1703     const int f32_bias = 127;
1704     uint32_t sign = extract32(f16, 15, 1);
1705     uint32_t exp = extract32(f16, 10, 5);
1706     uint32_t frac = extract32(f16, 0, 10);
1707 
1708     if (exp == 0x1f) {
1709         /* Inf or NaN */
1710         exp = 0xff;
1711     } else if (exp == 0) {
1712         /* Zero or denormal.  */
1713         if (frac != 0) {
1714             if (fz16) {
1715                 frac = 0;
1716             } else {
1717                 /*
1718                  * Denormal; these are all normal float32.
1719                  * Shift the fraction so that the msb is at bit 11,
1720                  * then remove bit 11 as the implicit bit of the
1721                  * normalized float32.  Note that we still go through
1722                  * the shift for normal numbers below, to put the
1723                  * float32 fraction at the right place.
1724                  */
1725                 int shift = clz32(frac) - 21;
1726                 frac = (frac << shift) & 0x3ff;
1727                 exp = f32_bias - f16_bias - shift + 1;
1728             }
1729         }
1730     } else {
1731         /* Normal number; adjust the bias.  */
1732         exp += f32_bias - f16_bias;
1733     }
1734     sign <<= 31;
1735     exp <<= 23;
1736     frac <<= 23 - 10;
1737 
1738     return sign | exp | frac;
1739 }
1740 
/*
 * Branch-free fetch of four packed float16 values from a vector
 * register image.  With is_q and is_2 each 0 or 1:
 *   - the second 64-bit word is read only when both are set;
 *   - the word is shifted down by 32 only when is_2 is set and is_q
 *     is clear, bringing the second 32-bit word into the low bits.
 * When neither is set the upper 32 bits of the result are not
 * meaningful to the caller.
 */
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    int word = is_q & is_2;
    int rshift = (is_2 & ~is_q) << 5;

    return ptr[word] >> rshift;
}
1751 
1752 /*
1753  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1754  * as there is not yet SVE versions that might use blocking.
1755  */
1756 
1757 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1758                      uint32_t desc, bool fz16)
1759 {
1760     intptr_t i, oprsz = simd_oprsz(desc);
1761     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1762     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1763     int is_q = oprsz == 16;
1764     uint64_t n_4, m_4;
1765 
1766     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1767     n_4 = load4_f16(vn, is_q, is_2);
1768     m_4 = load4_f16(vm, is_q, is_2);
1769 
1770     /* Negate all inputs for FMLSL at once.  */
1771     if (is_s) {
1772         n_4 ^= 0x8000800080008000ull;
1773     }
1774 
1775     for (i = 0; i < oprsz / 4; i++) {
1776         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1777         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1778         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1779     }
1780     clear_tail(d, oprsz, simd_maxsz(desc));
1781 }
1782 
1783 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1784                             void *venv, uint32_t desc)
1785 {
1786     CPUARMState *env = venv;
1787     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1788              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1789 }
1790 
1791 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1792                             void *venv, uint32_t desc)
1793 {
1794     CPUARMState *env = venv;
1795     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1796              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1797 }
1798 
1799 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1800                                void *venv, uint32_t desc)
1801 {
1802     intptr_t i, oprsz = simd_oprsz(desc);
1803     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1804     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1805     CPUARMState *env = venv;
1806     float_status *status = &env->vfp.fp_status;
1807     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1808 
1809     for (i = 0; i < oprsz; i += sizeof(float32)) {
1810         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1811         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1812         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1813         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1814         float32 aa = *(float32 *)(va + H1_4(i));
1815 
1816         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1817     }
1818 }
1819 
1820 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1821                          uint32_t desc, bool fz16)
1822 {
1823     intptr_t i, oprsz = simd_oprsz(desc);
1824     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1825     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1826     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1827     int is_q = oprsz == 16;
1828     uint64_t n_4;
1829     float32 m_1;
1830 
1831     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1832     n_4 = load4_f16(vn, is_q, is_2);
1833 
1834     /* Negate all inputs for FMLSL at once.  */
1835     if (is_s) {
1836         n_4 ^= 0x8000800080008000ull;
1837     }
1838 
1839     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1840 
1841     for (i = 0; i < oprsz / 4; i++) {
1842         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1843         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1844     }
1845     clear_tail(d, oprsz, simd_maxsz(desc));
1846 }
1847 
1848 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1849                                 void *venv, uint32_t desc)
1850 {
1851     CPUARMState *env = venv;
1852     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1853                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1854 }
1855 
1856 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1857                                 void *venv, uint32_t desc)
1858 {
1859     CPUARMState *env = venv;
1860     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1861                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1862 }
1863 
1864 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1865                                void *venv, uint32_t desc)
1866 {
1867     intptr_t i, j, oprsz = simd_oprsz(desc);
1868     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1869     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1870     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1871     CPUARMState *env = venv;
1872     float_status *status = &env->vfp.fp_status;
1873     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1874 
1875     for (i = 0; i < oprsz; i += 16) {
1876         float16 mm_16 = *(float16 *)(vm + i + idx);
1877         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1878 
1879         for (j = 0; j < 16; j += sizeof(float32)) {
1880             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1881             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1882             float32 aa = *(float32 *)(va + H1_4(i + j));
1883 
1884             *(float32 *)(vd + H1_4(i + j)) =
1885                 float32_muladd(nn, mm, aa, 0, status);
1886         }
1887     }
1888 }
1889 
1890 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1891 {
1892     intptr_t i, opr_sz = simd_oprsz(desc);
1893     int8_t *d = vd, *n = vn, *m = vm;
1894 
1895     for (i = 0; i < opr_sz; ++i) {
1896         int8_t mm = m[i];
1897         int8_t nn = n[i];
1898         int8_t res = 0;
1899         if (mm >= 0) {
1900             if (mm < 8) {
1901                 res = nn << mm;
1902             }
1903         } else {
1904             res = nn >> (mm > -8 ? -mm : 7);
1905         }
1906         d[i] = res;
1907     }
1908     clear_tail(d, opr_sz, simd_maxsz(desc));
1909 }
1910 
1911 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1912 {
1913     intptr_t i, opr_sz = simd_oprsz(desc);
1914     int16_t *d = vd, *n = vn, *m = vm;
1915 
1916     for (i = 0; i < opr_sz / 2; ++i) {
1917         int8_t mm = m[i];   /* only 8 bits of shift are significant */
1918         int16_t nn = n[i];
1919         int16_t res = 0;
1920         if (mm >= 0) {
1921             if (mm < 16) {
1922                 res = nn << mm;
1923             }
1924         } else {
1925             res = nn >> (mm > -16 ? -mm : 15);
1926         }
1927         d[i] = res;
1928     }
1929     clear_tail(d, opr_sz, simd_maxsz(desc));
1930 }
1931 
1932 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1933 {
1934     intptr_t i, opr_sz = simd_oprsz(desc);
1935     uint8_t *d = vd, *n = vn, *m = vm;
1936 
1937     for (i = 0; i < opr_sz; ++i) {
1938         int8_t mm = m[i];
1939         uint8_t nn = n[i];
1940         uint8_t res = 0;
1941         if (mm >= 0) {
1942             if (mm < 8) {
1943                 res = nn << mm;
1944             }
1945         } else {
1946             if (mm > -8) {
1947                 res = nn >> -mm;
1948             }
1949         }
1950         d[i] = res;
1951     }
1952     clear_tail(d, opr_sz, simd_maxsz(desc));
1953 }
1954 
1955 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1956 {
1957     intptr_t i, opr_sz = simd_oprsz(desc);
1958     uint16_t *d = vd, *n = vn, *m = vm;
1959 
1960     for (i = 0; i < opr_sz / 2; ++i) {
1961         int8_t mm = m[i];   /* only 8 bits of shift are significant */
1962         uint16_t nn = n[i];
1963         uint16_t res = 0;
1964         if (mm >= 0) {
1965             if (mm < 16) {
1966                 res = nn << mm;
1967             }
1968         } else {
1969             if (mm > -16) {
1970                 res = nn >> -mm;
1971             }
1972         }
1973         d[i] = res;
1974     }
1975     clear_tail(d, opr_sz, simd_maxsz(desc));
1976 }
1977 
1978 /*
1979  * 8x8->8 polynomial multiply.
1980  *
1981  * Polynomial multiplication is like integer multiplication except the
1982  * partial products are XORed, not added.
1983  *
1984  * TODO: expose this as a generic vector operation, as it is a common
1985  * crypto building block.
1986  */
1987 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
1988 {
1989     intptr_t i, j, opr_sz = simd_oprsz(desc);
1990     uint64_t *d = vd, *n = vn, *m = vm;
1991 
1992     for (i = 0; i < opr_sz / 8; ++i) {
1993         uint64_t nn = n[i];
1994         uint64_t mm = m[i];
1995         uint64_t rr = 0;
1996 
1997         for (j = 0; j < 8; ++j) {
1998             uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
1999             rr ^= mm & mask;
2000             mm = (mm << 1) & 0xfefefefefefefefeull;
2001             nn >>= 1;
2002         }
2003         d[i] = rr;
2004     }
2005     clear_tail(d, opr_sz, simd_maxsz(desc));
2006 }
2007 
2008 /*
2009  * 64x64->128 polynomial multiply.
2010  * Because of the lanes are not accessed in strict columns,
2011  * this probably cannot be turned into a generic helper.
2012  */
2013 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2014 {
2015     intptr_t i, j, opr_sz = simd_oprsz(desc);
2016     intptr_t hi = simd_data(desc);
2017     uint64_t *d = vd, *n = vn, *m = vm;
2018 
2019     for (i = 0; i < opr_sz / 8; i += 2) {
2020         uint64_t nn = n[i + hi];
2021         uint64_t mm = m[i + hi];
2022         uint64_t rhi = 0;
2023         uint64_t rlo = 0;
2024 
2025         /* Bit 0 can only influence the low 64-bit result.  */
2026         if (nn & 1) {
2027             rlo = mm;
2028         }
2029 
2030         for (j = 1; j < 64; ++j) {
2031             uint64_t mask = -((nn >> j) & 1);
2032             rlo ^= (mm << j) & mask;
2033             rhi ^= (mm >> (64 - j)) & mask;
2034         }
2035         d[i] = rlo;
2036         d[i + 1] = rhi;
2037     }
2038     clear_tail(d, opr_sz, simd_maxsz(desc));
2039 }
2040 
/*
 * 8x8->16 polynomial multiply.
 *
 * The byte inputs are expanded to (or extracted from) half-words.
 * Note that neon and sve2 get the inputs from different positions.
 * This allows 4 bytes to be processed in parallel with uint64_t.
 */

/*
 * Spread the four low bytes of x into the low byte of each of the four
 * 16-bit lanes of the result; bits [63:32] of x are ignored.
 */
static uint64_t expand_byte_to_half(uint64_t x)
{
    uint64_t spread = 0;
    int k;

    for (k = 0; k < 4; k++) {
        uint64_t byte = (x >> (8 * k)) & 0xff;

        spread |= byte << (16 * k);
    }
    return spread;
}
2056 
/*
 * 16x16->32 polynomial multiply on two 32-bit lanes at once.
 * Bits 0..15 of each lane of op1 are used as the multiplier; for each
 * one set, op2 shifted left by that bit position is XORed into the
 * lane.  The shift is applied to the whole 64-bit op2, so the operands
 * are expected to be 16-bit values held in 32-bit lanes.
 */
uint64_t pmull_w(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;
    int bit;

    for (bit = 0; bit < 16; ++bit) {
        /* All-ones in each 32-bit lane whose multiplier bit is set. */
        uint64_t lane_sel = ((op1 >> bit) & 0x0000000100000001ull)
                            * 0xffffffff;

        acc ^= (op2 << bit) & lane_sel;
    }
    return acc;
}
2069 
/*
 * 8x8->16 polynomial multiply on four 16-bit lanes at once.
 * Bits 0..7 of each lane of op1 are used as the multiplier; for each
 * one set, op2 shifted left by that bit position is XORed into the
 * lane.  The shift is applied to the whole 64-bit op2, so the operands
 * are expected to be 8-bit values held in 16-bit lanes.
 */
uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;
    int bit;

    for (bit = 0; bit < 8; ++bit) {
        /* All-ones in each 16-bit lane whose multiplier bit is set. */
        uint64_t lane_sel = ((op1 >> bit) & 0x0001000100010001ull)
                            * 0xffff;

        acc ^= (op2 << bit) & lane_sel;
    }
    return acc;
}
2082 
2083 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2084 {
2085     int hi = simd_data(desc);
2086     uint64_t *d = vd, *n = vn, *m = vm;
2087     uint64_t nn = n[hi], mm = m[hi];
2088 
2089     d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2090     nn >>= 32;
2091     mm >>= 32;
2092     d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2093 
2094     clear_tail(d, 16, simd_maxsz(desc));
2095 }
2096 
2097 #ifdef TARGET_AARCH64
2098 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2099 {
2100     int shift = simd_data(desc) * 8;
2101     intptr_t i, opr_sz = simd_oprsz(desc);
2102     uint64_t *d = vd, *n = vn, *m = vm;
2103 
2104     for (i = 0; i < opr_sz / 8; ++i) {
2105         uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
2106         uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
2107 
2108         d[i] = pmull_h(nn, mm);
2109     }
2110 }
2111 
/*
 * 32x32->64 polynomial multiply.  Only bits 0..31 of op1 act as the
 * multiplier; for each one set, op2 shifted left by that bit position
 * is XORed into the result.
 */
static uint64_t pmull_d(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;
    int bit;

    for (bit = 0; bit < 32; ++bit) {
        if ((op1 >> bit) & 1) {
            acc ^= op2 << bit;
        }
    }
    return acc;
}
2123 
2124 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2125 {
2126     intptr_t sel = H4(simd_data(desc));
2127     intptr_t i, opr_sz = simd_oprsz(desc);
2128     uint32_t *n = vn, *m = vm;
2129     uint64_t *d = vd;
2130 
2131     for (i = 0; i < opr_sz / 8; ++i) {
2132         d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
2133     }
2134 }
2135 #endif
2136 
2137 #define DO_CMP0(NAME, TYPE, OP)                         \
2138 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2139 {                                                       \
2140     intptr_t i, opr_sz = simd_oprsz(desc);              \
2141     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2142         TYPE nn = *(TYPE *)(vn + i);                    \
2143         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2144     }                                                   \
2145     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2146 }
2147 
2148 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2149 DO_CMP0(gvec_clt0_b, int8_t, <)
2150 DO_CMP0(gvec_cle0_b, int8_t, <=)
2151 DO_CMP0(gvec_cgt0_b, int8_t, >)
2152 DO_CMP0(gvec_cge0_b, int8_t, >=)
2153 
2154 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2155 DO_CMP0(gvec_clt0_h, int16_t, <)
2156 DO_CMP0(gvec_cle0_h, int16_t, <=)
2157 DO_CMP0(gvec_cgt0_h, int16_t, >)
2158 DO_CMP0(gvec_cge0_h, int16_t, >=)
2159 
2160 #undef DO_CMP0
2161 
2162 #define DO_ABD(NAME, TYPE)                                      \
2163 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2164 {                                                               \
2165     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2166     TYPE *d = vd, *n = vn, *m = vm;                             \
2167                                                                 \
2168     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2169         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2170     }                                                           \
2171     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2172 }
2173 
2174 DO_ABD(gvec_sabd_b, int8_t)
2175 DO_ABD(gvec_sabd_h, int16_t)
2176 DO_ABD(gvec_sabd_s, int32_t)
2177 DO_ABD(gvec_sabd_d, int64_t)
2178 
2179 DO_ABD(gvec_uabd_b, uint8_t)
2180 DO_ABD(gvec_uabd_h, uint16_t)
2181 DO_ABD(gvec_uabd_s, uint32_t)
2182 DO_ABD(gvec_uabd_d, uint64_t)
2183 
2184 #undef DO_ABD
2185 
2186 #define DO_ABA(NAME, TYPE)                                      \
2187 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2188 {                                                               \
2189     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2190     TYPE *d = vd, *n = vn, *m = vm;                             \
2191                                                                 \
2192     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2193         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2194     }                                                           \
2195     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2196 }
2197 
2198 DO_ABA(gvec_saba_b, int8_t)
2199 DO_ABA(gvec_saba_h, int16_t)
2200 DO_ABA(gvec_saba_s, int32_t)
2201 DO_ABA(gvec_saba_d, int64_t)
2202 
2203 DO_ABA(gvec_uaba_b, uint8_t)
2204 DO_ABA(gvec_uaba_h, uint16_t)
2205 DO_ABA(gvec_uaba_s, uint32_t)
2206 DO_ABA(gvec_uaba_d, uint64_t)
2207 
2208 #undef DO_ABA
2209 
2210 #define DO_NEON_PAIRWISE(NAME, OP)                                      \
2211     void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
2212                          void *stat, uint32_t oprsz)                    \
2213     {                                                                   \
2214         float_status *fpst = stat;                                      \
2215         float32 *d = vd;                                                \
2216         float32 *n = vn;                                                \
2217         float32 *m = vm;                                                \
2218         float32 r0, r1;                                                 \
2219                                                                         \
2220         /* Read all inputs before writing outputs in case vm == vd */   \
2221         r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
2222         r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
2223                                                                         \
2224         d[H4(0)] = r0;                                                  \
2225         d[H4(1)] = r1;                                                  \
2226     }                                                                   \
2227                                                                         \
2228     void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
2229                          void *stat, uint32_t oprsz)                    \
2230     {                                                                   \
2231         float_status *fpst = stat;                                      \
2232         float16 *d = vd;                                                \
2233         float16 *n = vn;                                                \
2234         float16 *m = vm;                                                \
2235         float16 r0, r1, r2, r3;                                         \
2236                                                                         \
2237         /* Read all inputs before writing outputs in case vm == vd */   \
2238         r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
2239         r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
2240         r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
2241         r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
2242                                                                         \
2243         d[H2(0)] = r0;                                                  \
2244         d[H2(1)] = r1;                                                  \
2245         d[H2(2)] = r2;                                                  \
2246         d[H2(3)] = r3;                                                  \
2247     }
2248 
2249 DO_NEON_PAIRWISE(neon_padd, add)
2250 DO_NEON_PAIRWISE(neon_pmax, max)
2251 DO_NEON_PAIRWISE(neon_pmin, min)
2252 
2253 #undef DO_NEON_PAIRWISE
2254 
2255 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2256     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2257     {                                                                   \
2258         intptr_t i, oprsz = simd_oprsz(desc);                           \
2259         int shift = simd_data(desc);                                    \
2260         TYPE *d = vd, *n = vn;                                          \
2261         float_status *fpst = stat;                                      \
2262         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2263             d[i] = FUNC(n[i], shift, fpst);                             \
2264         }                                                               \
2265         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2266     }
2267 
2268 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2269 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2270 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2271 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2272 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2273 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2274 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2275 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2276 
2277 #undef DO_VCVT_FIXED
2278 
2279 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2280     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2281     {                                                                   \
2282         float_status *fpst = stat;                                      \
2283         intptr_t i, oprsz = simd_oprsz(desc);                           \
2284         uint32_t rmode = simd_data(desc);                               \
2285         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2286         TYPE *d = vd, *n = vn;                                          \
2287         set_float_rounding_mode(rmode, fpst);                           \
2288         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2289             d[i] = FUNC(n[i], 0, fpst);                                 \
2290         }                                                               \
2291         set_float_rounding_mode(prev_rmode, fpst);                      \
2292         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2293     }
2294 
2295 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2296 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2297 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2298 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2299 
2300 #undef DO_VCVT_RMODE
2301 
2302 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2303     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2304     {                                                                   \
2305         float_status *fpst = stat;                                      \
2306         intptr_t i, oprsz = simd_oprsz(desc);                           \
2307         uint32_t rmode = simd_data(desc);                               \
2308         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2309         TYPE *d = vd, *n = vn;                                          \
2310         set_float_rounding_mode(rmode, fpst);                           \
2311         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2312             d[i] = FUNC(n[i], fpst);                                    \
2313         }                                                               \
2314         set_float_rounding_mode(prev_rmode, fpst);                      \
2315         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2316     }
2317 
2318 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2319 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2320 
2321 #undef DO_VRINT_RMODE
2322 
2323 #ifdef TARGET_AARCH64
2324 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2325 {
2326     const uint8_t *indices = vm;
2327     CPUARMState *env = venv;
2328     size_t oprsz = simd_oprsz(desc);
2329     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2330     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2331     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2332     union {
2333         uint8_t b[16];
2334         uint64_t d[2];
2335     } result;
2336 
2337     /*
2338      * We must construct the final result in a temp, lest the output
2339      * overlaps the input table.  For TBL, begin with zero; for TBX,
2340      * begin with the original register contents.  Note that we always
2341      * copy 16 bytes here to avoid an extra branch; clearing the high
2342      * bits of the register for oprsz == 8 is handled below.
2343      */
2344     if (is_tbx) {
2345         memcpy(&result, vd, 16);
2346     } else {
2347         memset(&result, 0, 16);
2348     }
2349 
2350     for (size_t i = 0; i < oprsz; ++i) {
2351         uint32_t index = indices[H1(i)];
2352 
2353         if (index < table_len) {
2354             /*
2355              * Convert index (a byte offset into the virtual table
2356              * which is a series of 128-bit vectors concatenated)
2357              * into the correct register element, bearing in mind
2358              * that the table can wrap around from V31 to V0.
2359              */
2360             const uint8_t *table = (const uint8_t *)
2361                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2362             result.b[H1(i)] = table[H1(index % 16)];
2363         }
2364     }
2365 
2366     memcpy(vd, &result, 16);
2367     clear_tail(vd, oprsz, simd_maxsz(desc));
2368 }
2369 #endif
2370 
2371 /*
2372  * NxN -> N highpart multiply
2373  *
2374  * TODO: expose this as a generic vector operation.
2375  */
2376 
2377 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2378 {
2379     intptr_t i, opr_sz = simd_oprsz(desc);
2380     int8_t *d = vd, *n = vn, *m = vm;
2381 
2382     for (i = 0; i < opr_sz; ++i) {
2383         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2384     }
2385     clear_tail(d, opr_sz, simd_maxsz(desc));
2386 }
2387 
2388 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2389 {
2390     intptr_t i, opr_sz = simd_oprsz(desc);
2391     int16_t *d = vd, *n = vn, *m = vm;
2392 
2393     for (i = 0; i < opr_sz / 2; ++i) {
2394         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2395     }
2396     clear_tail(d, opr_sz, simd_maxsz(desc));
2397 }
2398 
2399 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2400 {
2401     intptr_t i, opr_sz = simd_oprsz(desc);
2402     int32_t *d = vd, *n = vn, *m = vm;
2403 
2404     for (i = 0; i < opr_sz / 4; ++i) {
2405         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2406     }
2407     clear_tail(d, opr_sz, simd_maxsz(desc));
2408 }
2409 
2410 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2411 {
2412     intptr_t i, opr_sz = simd_oprsz(desc);
2413     uint64_t *d = vd, *n = vn, *m = vm;
2414     uint64_t discard;
2415 
2416     for (i = 0; i < opr_sz / 8; ++i) {
2417         muls64(&discard, &d[i], n[i], m[i]);
2418     }
2419     clear_tail(d, opr_sz, simd_maxsz(desc));
2420 }
2421 
2422 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2423 {
2424     intptr_t i, opr_sz = simd_oprsz(desc);
2425     uint8_t *d = vd, *n = vn, *m = vm;
2426 
2427     for (i = 0; i < opr_sz; ++i) {
2428         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2429     }
2430     clear_tail(d, opr_sz, simd_maxsz(desc));
2431 }
2432 
2433 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2434 {
2435     intptr_t i, opr_sz = simd_oprsz(desc);
2436     uint16_t *d = vd, *n = vn, *m = vm;
2437 
2438     for (i = 0; i < opr_sz / 2; ++i) {
2439         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2440     }
2441     clear_tail(d, opr_sz, simd_maxsz(desc));
2442 }
2443 
2444 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2445 {
2446     intptr_t i, opr_sz = simd_oprsz(desc);
2447     uint32_t *d = vd, *n = vn, *m = vm;
2448 
2449     for (i = 0; i < opr_sz / 4; ++i) {
2450         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2451     }
2452     clear_tail(d, opr_sz, simd_maxsz(desc));
2453 }
2454 
2455 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2456 {
2457     intptr_t i, opr_sz = simd_oprsz(desc);
2458     uint64_t *d = vd, *n = vn, *m = vm;
2459     uint64_t discard;
2460 
2461     for (i = 0; i < opr_sz / 8; ++i) {
2462         mulu64(&discard, &d[i], n[i], m[i]);
2463     }
2464     clear_tail(d, opr_sz, simd_maxsz(desc));
2465 }
2466 
2467 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2468 {
2469     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2470     int shr = simd_data(desc);
2471     uint64_t *d = vd, *n = vn, *m = vm;
2472 
2473     for (i = 0; i < opr_sz; ++i) {
2474         d[i] = ror64(n[i] ^ m[i], shr);
2475     }
2476     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2477 }
2478 
2479 /*
2480  * Integer matrix-multiply accumulate
2481  */
2482 
/*
 * Add to 'sum' the dot product of eight signed bytes at vn with
 * eight signed bytes at vm, and return the updated sum.
 */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    const int8_t *row = vn;
    const int8_t *col = vm;
    intptr_t k = 0;

    while (k < 8) {
        sum += row[H1(k)] * col[H1(k)];
        k++;
    }
    return sum;
}
2492 
/*
 * Add to 'sum' the dot product of eight unsigned bytes at vn with
 * eight unsigned bytes at vm, and return the updated sum.
 */
static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    const uint8_t *row = vn;
    const uint8_t *col = vm;
    intptr_t k = 0;

    while (k < 8) {
        sum += row[H1(k)] * col[H1(k)];
        k++;
    }
    return sum;
}
2502 
/*
 * Add to 'sum' the dot product of eight unsigned bytes at vn with
 * eight signed bytes at vm, and return the updated sum.
 */
static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    const uint8_t *row = vn;
    const int8_t *col = vm;
    intptr_t k = 0;

    while (k < 8) {
        sum += row[H1(k)] * col[H1(k)];
        k++;
    }
    return sum;
}
2513 
/*
 * Common driver for the byte integer matrix-multiply-accumulate helpers.
 *
 * Each 16-byte segment of va/vd is treated as four 32-bit accumulators,
 * and each 16-byte segment of vn/vm as two groups of eight bytes.
 * Accumulator (2 * i + j) is combined, via inner_loop, with group i of
 * vn and group j of vm.
 *
 * inner_loop - callback that folds one 8-byte by 8-byte dot product
 *              into a running 32-bit sum.
 */
static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    const intptr_t nbytes = simd_oprsz(desc);
    intptr_t off;

    for (off = 0; off < nbytes; off += 16) {
        uint32_t *acc = va + off;
        uint32_t *dst = vd + off;
        void *n_lo = vn + off + 0;
        void *n_hi = vn + off + 8;
        void *m_lo = vm + off + 0;
        void *m_hi = vm + off + 8;
        uint32_t r00, r01, r10, r11;

        /*
         * Compute all four results before storing any of them, so
         * that every input of the segment is consumed first.
         */
        r00 = inner_loop(acc[H4(0 + 0)], n_lo, m_lo);
        r01 = inner_loop(acc[H4(0 + 1)], n_lo, m_hi);
        r10 = inner_loop(acc[H4(2 + 0)], n_hi, m_lo);
        r11 = inner_loop(acc[H4(2 + 1)], n_hi, m_hi);

        dst[H4(0)] = r00;
        dst[H4(1)] = r01;
        dst[H4(2)] = r10;
        dst[H4(3)] = r11;
    }
    clear_tail(vd, nbytes, simd_maxsz(desc));
}
2547 
2548 #define DO_MMLA_B(NAME, INNER) \
2549     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2550     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2551 
2552 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2553 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2554 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2555 
2556 /*
2557  * BFloat16 Dot Product
2558  */
2559 
2560 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2561 {
2562     /* FPCR is ignored for BFDOT and BFMMLA. */
2563     float_status bf_status = {
2564         .tininess_before_rounding = float_tininess_before_rounding,
2565         .float_rounding_mode = float_round_to_odd_inf,
2566         .flush_to_zero = true,
2567         .flush_inputs_to_zero = true,
2568         .default_nan_mode = true,
2569     };
2570     float32 t1, t2;
2571 
2572     /*
2573      * Extract each BFloat16 from the element pair, and shift
2574      * them such that they become float32.
2575      */
2576     t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2577     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2578     t1 = float32_add(t1, t2, &bf_status);
2579     t1 = float32_add(sum, t1, &bf_status);
2580 
2581     return t1;
2582 }
2583 
2584 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2585 {
2586     intptr_t i, opr_sz = simd_oprsz(desc);
2587     float32 *d = vd, *a = va;
2588     uint32_t *n = vn, *m = vm;
2589 
2590     for (i = 0; i < opr_sz / 4; ++i) {
2591         d[i] = bfdotadd(a[i], n[i], m[i]);
2592     }
2593     clear_tail(d, opr_sz, simd_maxsz(desc));
2594 }
2595 
2596 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2597                             void *va, uint32_t desc)
2598 {
2599     intptr_t i, j, opr_sz = simd_oprsz(desc);
2600     intptr_t index = simd_data(desc);
2601     intptr_t elements = opr_sz / 4;
2602     intptr_t eltspersegment = MIN(16 / 4, elements);
2603     float32 *d = vd, *a = va;
2604     uint32_t *n = vn, *m = vm;
2605 
2606     for (i = 0; i < elements; i += eltspersegment) {
2607         uint32_t m_idx = m[i + H4(index)];
2608 
2609         for (j = i; j < i + eltspersegment; j++) {
2610             d[j] = bfdotadd(a[j], n[j], m_idx);
2611         }
2612     }
2613     clear_tail(d, opr_sz, simd_maxsz(desc));
2614 }
2615 
2616 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2617 {
2618     intptr_t s, opr_sz = simd_oprsz(desc);
2619     float32 *d = vd, *a = va;
2620     uint32_t *n = vn, *m = vm;
2621 
2622     for (s = 0; s < opr_sz / 4; s += 4) {
2623         float32 sum00, sum01, sum10, sum11;
2624 
2625         /*
2626          * Process the entire segment at once, writing back the
2627          * results only after we've consumed all of the inputs.
2628          *
2629          * Key to indicies by column:
2630          *               i   j           i   k             j   k
2631          */
2632         sum00 = a[s + H4(0 + 0)];
2633         sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2634         sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2635 
2636         sum01 = a[s + H4(0 + 1)];
2637         sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2638         sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2639 
2640         sum10 = a[s + H4(2 + 0)];
2641         sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2642         sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2643 
2644         sum11 = a[s + H4(2 + 1)];
2645         sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2646         sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2647 
2648         d[s + H4(0 + 0)] = sum00;
2649         d[s + H4(0 + 1)] = sum01;
2650         d[s + H4(2 + 0)] = sum10;
2651         d[s + H4(2 + 1)] = sum11;
2652     }
2653     clear_tail(d, opr_sz, simd_maxsz(desc));
2654 }
2655 
2656 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2657                          void *stat, uint32_t desc)
2658 {
2659     intptr_t i, opr_sz = simd_oprsz(desc);
2660     intptr_t sel = simd_data(desc);
2661     float32 *d = vd, *a = va;
2662     bfloat16 *n = vn, *m = vm;
2663 
2664     for (i = 0; i < opr_sz / 4; ++i) {
2665         float32 nn = n[H2(i * 2 + sel)] << 16;
2666         float32 mm = m[H2(i * 2 + sel)] << 16;
2667         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2668     }
2669     clear_tail(d, opr_sz, simd_maxsz(desc));
2670 }
2671 
2672 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2673                              void *va, void *stat, uint32_t desc)
2674 {
2675     intptr_t i, j, opr_sz = simd_oprsz(desc);
2676     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2677     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2678     intptr_t elements = opr_sz / 4;
2679     intptr_t eltspersegment = MIN(16 / 4, elements);
2680     float32 *d = vd, *a = va;
2681     bfloat16 *n = vn, *m = vm;
2682 
2683     for (i = 0; i < elements; i += eltspersegment) {
2684         float32 m_idx = m[H2(2 * i + index)] << 16;
2685 
2686         for (j = i; j < i + eltspersegment; j++) {
2687             float32 n_j = n[H2(2 * j + sel)] << 16;
2688             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2689         }
2690     }
2691     clear_tail(d, opr_sz, simd_maxsz(desc));
2692 }
2693 
2694 #define DO_CLAMP(NAME, TYPE) \
2695 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
2696 {                                                                       \
2697     intptr_t i, opr_sz = simd_oprsz(desc);                              \
2698     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
2699         TYPE aa = *(TYPE *)(a + i);                                     \
2700         TYPE nn = *(TYPE *)(n + i);                                     \
2701         TYPE mm = *(TYPE *)(m + i);                                     \
2702         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
2703         *(TYPE *)(d + i) = dd;                                          \
2704     }                                                                   \
2705     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
2706 }
2707 
2708 DO_CLAMP(gvec_sclamp_b, int8_t)
2709 DO_CLAMP(gvec_sclamp_h, int16_t)
2710 DO_CLAMP(gvec_sclamp_s, int32_t)
2711 DO_CLAMP(gvec_sclamp_d, int64_t)
2712 
2713 DO_CLAMP(gvec_uclamp_b, uint8_t)
2714 DO_CLAMP(gvec_uclamp_h, uint16_t)
2715 DO_CLAMP(gvec_uclamp_s, uint32_t)
2716 DO_CLAMP(gvec_uclamp_d, uint64_t)
2717