/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
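/*
 * Worked example: index 0x05 has predicate bits 0 and 2 set, so bytes
 * 0 and 2 of the expansion are 0xff, giving
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff.
 */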
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
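/*
 * Only predicate bits 0, 2, 4 and 6 are significant for half-word
 * elements; callers are expected to have masked away the odd-numbered
 * bits (the 0xaa positions), so the table needs no entries beyond [0x55].
 */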
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
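
/*
 * Worked example, reading the bytes as Q7 fixed point: for
 * src1 = src2 = 0x40 (0.5) and src3 = 0, ret = 64 * 64 = 4096,
 * rounding adds 64, and 4160 >> 7 = 32, i.e. 0.25 in Q7.
 */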

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}
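
/*
 * The 64-bit form cannot widen to a native type: n * m alone needs up
 * to 127 bits, so the sum is accumulated in 128-bit arithmetic and
 * do_sat128_d saturates when the result of the final >>63 no longer
 * fits in int64_t.
 */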

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
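
/*
 * E.g. gvec_sdot_b computes, for each 32-bit lane i,
 *   d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *               + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 * with each byte product sign-extended before the additions.
 */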

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        /* Segments after the first are always full 16-byte segments,    \
         * i.e. 16 / sizeof(TYPED) lanes, not a fixed 4.                  \
         */                                                               \
        segend = i + 16 / sizeof(TYPED);                                  \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}
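
/*
 * For these indexed forms, the index selects one group of four TYPEM
 * elements within each 16-byte segment of m; that group is loaded once
 * and reused for every TYPED lane of the segment.  The MIN() above
 * allows the first segment to be only 8 bytes, for the AdvSIMD
 * versions of these insns.
 */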

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

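/*
 * FCADD: complex addition with the second operand rotated by 90 or 270
 * degrees.  For each (real, imag) pair, one rotation computes
 * (n.real - m.imag, n.imag + m.real) and the other
 * (n.real + m.imag, n.imag - m.real); the desc bit selects which of
 * m's two elements has its sign flipped.
 */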
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

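/*
 * FCMLA: each op accumulates one half of a complex multiply into
 * (d.real, d.imag).  The 'flip' bit selects which element of n feeds
 * both lanes, and neg_real/neg_imag supply the operand signs, so a
 * pair of FCMLA ops with suitable rotations performs a full complex
 * multiply-accumulate.
 */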
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
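/*
 * E.g. when the operands compare equal, float32_ceq returns
 * (uint32_t)-1 == 0xffffffff, the all-ones lane mask; the "absolute"
 * compares (FACGE/FACGT) compare |op1| against |op2|.
 */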
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
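/*
 * This is one Newton-Raphson step for a reciprocal: with e an estimate
 * of 1/x, e * (2 - x * e) is a better estimate; the helper returns the
 * (2 - x * e) correction factor.  The inf * 0 special case returns
 * exactly 2.0, per the ARM pseudocode.
 */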
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
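/*
 * Likewise one Newton-Raphson step for 1/sqrt(x): the correction
 * factor here is (3 - op1 * op2) / 2.
 */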
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE)                                     \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
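/*
 * E.g. for gvec_mul_idx_s with a 256-bit SVE vector, idx selects one
 * 32-bit element in each 16-byte segment: the first four lanes are
 * multiplied by m[idx] and the next four by m[4 + idx].
 */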
1400 
1401 #define DO_MUL_IDX(NAME, TYPE, H) \
1402 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1403 {                                                                          \
1404     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1405     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1406     intptr_t idx = simd_data(desc);                                        \
1407     TYPE *d = vd, *n = vn, *m = vm;                                        \
1408     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1409         TYPE mm = m[H(i + idx)];                                           \
1410         for (j = 0; j < segment; j++) {                                    \
1411             d[i + j] = n[i + j] * mm;                                      \
1412         }                                                                  \
1413     }                                                                      \
1414     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1415 }
1416 
1417 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1418 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1419 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1420 
1421 #undef DO_MUL_IDX
1422 
1423 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1424 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1425 {                                                                          \
1426     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1427     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1428     intptr_t idx = simd_data(desc);                                        \
1429     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1430     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1431         TYPE mm = m[H(i + idx)];                                           \
1432         for (j = 0; j < segment; j++) {                                    \
1433             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1434         }                                                                  \
1435     }                                                                      \
1436     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1437 }
1438 
1439 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1440 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1441 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1442 
1443 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1444 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1445 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1446 
1447 #undef DO_MLA_IDX
1448 
1449 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1450 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1451 {                                                                          \
1452     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1453     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1454     intptr_t idx = simd_data(desc);                                        \
1455     TYPE *d = vd, *n = vn, *m = vm;                                        \
1456     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1457         TYPE mm = m[H(i + idx)];                                           \
1458         for (j = 0; j < segment; j++) {                                    \
1459             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1460         }                                                                  \
1461     }                                                                      \
1462     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1463 }
1464 
1465 #define nop(N, M, S) (M)
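/*
 * Passing "nop" as the ADD operand discards the original destination
 * element and the float_status argument, reducing DO_FMUL_IDX to a
 * plain indexed multiply: d[i + j] = MUL(n[i + j], mm, stat).
 */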
1466 
1467 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1468 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1469 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1470 
1471 #ifdef TARGET_AARCH64
1472 
1473 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1474 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1475 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1476 
1477 #endif
1478 
1479 #undef nop
1480 
1481 /*
1482  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1483  * the fused ops below, these accumulate both from and into Vd.
1484  */
1485 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1486 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1487 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1488 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1489 
1490 #undef DO_FMUL_IDX
1491 
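/*
 * For the fused indexed helpers below, bit 0 of simd_data selects the
 * FMLS form: op1_neg becomes the sign-bit mask of TYPE (e.g. 0x8000
 * for float16), and XORing it into op1 negates the product input
 * without disturbing NaN payloads.
 */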
1492 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1493 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1494                   void *stat, uint32_t desc)                               \
1495 {                                                                          \
1496     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1497     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1498     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1499     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1500     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1501     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1502     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1503         TYPE mm = m[H(i + idx)];                                           \
1504         for (j = 0; j < segment; j++) {                                    \
1505             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1506                                      mm, a[i + j], 0, stat);               \
1507         }                                                                  \
1508     }                                                                      \
1509     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1510 }
1511 
1512 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1513 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1514 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1515 
1516 #undef DO_FMLA_IDX
1517 
1518 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1519 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1520 {                                                                          \
1521     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1522     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1523     bool q = false;                                                        \
1524     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1525         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1526         if (dd < MIN) {                                                    \
1527             dd = MIN;                                                      \
1528             q = true;                                                      \
1529         } else if (dd > MAX) {                                             \
1530             dd = MAX;                                                      \
1531             q = true;                                                      \
1532         }                                                                  \
1533         d[i] = dd;                                                         \
1534     }                                                                      \
1535     if (q) {                                                               \
1536         uint32_t *qc = vq;                                                 \
1537         qc[0] = 1;                                                         \
1538     }                                                                      \
1539     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1540 }
1541 
1542 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1543 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1544 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1545 
1546 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1547 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1548 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1549 
1550 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1551 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1552 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1553 
1554 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1555 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1556 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1557 
1558 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1559 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1560 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1561 
1562 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1563 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1564 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1565 
1566 #undef DO_SAT
1567 
1568 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1569                           void *vm, uint32_t desc)
1570 {
1571     intptr_t i, oprsz = simd_oprsz(desc);
1572     uint64_t *d = vd, *n = vn, *m = vm;
1573     bool q = false;
1574 
1575     for (i = 0; i < oprsz / 8; i++) {
1576         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1577         if (dd < nn) {
1578             dd = UINT64_MAX;
1579             q = true;
1580         }
1581         d[i] = dd;
1582     }
1583     if (q) {
1584         uint32_t *qc = vq;
1585         qc[0] = 1;
1586     }
1587     clear_tail(d, oprsz, simd_maxsz(desc));
1588 }
1589 
1590 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1591                           void *vm, uint32_t desc)
1592 {
1593     intptr_t i, oprsz = simd_oprsz(desc);
1594     uint64_t *d = vd, *n = vn, *m = vm;
1595     bool q = false;
1596 
1597     for (i = 0; i < oprsz / 8; i++) {
1598         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1599         if (nn < mm) {
1600             dd = 0;
1601             q = true;
1602         }
1603         d[i] = dd;
1604     }
1605     if (q) {
1606         uint32_t *qc = vq;
1607         qc[0] = 1;
1608     }
1609     clear_tail(d, oprsz, simd_maxsz(desc));
1610 }
1611 
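/*
 * The 64-bit signed saturation below uses the classic bit trick:
 * addition overflows iff the operands have the same sign and the
 * result differs in sign from them, i.e. (dd ^ nn) & ~(nn ^ mm) has
 * the top bit set.  On overflow, (nn >> 63) ^ ~INT64_MIN yields
 * INT64_MAX when nn was non-negative and INT64_MIN when nn was
 * negative.
 */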
1612 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1613                           void *vm, uint32_t desc)
1614 {
1615     intptr_t i, oprsz = simd_oprsz(desc);
1616     int64_t *d = vd, *n = vn, *m = vm;
1617     bool q = false;
1618 
1619     for (i = 0; i < oprsz / 8; i++) {
1620         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1621         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1622             dd = (nn >> 63) ^ ~INT64_MIN;
1623             q = true;
1624         }
1625         d[i] = dd;
1626     }
1627     if (q) {
1628         uint32_t *qc = vq;
1629         qc[0] = 1;
1630     }
1631     clear_tail(d, oprsz, simd_maxsz(desc));
1632 }
1633 
1634 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1635                           void *vm, uint32_t desc)
1636 {
1637     intptr_t i, oprsz = simd_oprsz(desc);
1638     int64_t *d = vd, *n = vn, *m = vm;
1639     bool q = false;
1640 
1641     for (i = 0; i < oprsz / 8; i++) {
1642         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1643         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1644             dd = (nn >> 63) ^ ~INT64_MIN;
1645             q = true;
1646         }
1647         d[i] = dd;
1648     }
1649     if (q) {
1650         uint32_t *qc = vq;
1651         qc[0] = 1;
1652     }
1653     clear_tail(d, oprsz, simd_maxsz(desc));
1654 }
1655 
1656 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1657                            void *vm, uint32_t desc)
1658 {
1659     intptr_t i, oprsz = simd_oprsz(desc);
1660     uint64_t *d = vd, *n = vn, *m = vm;
1661     bool q = false;
1662 
1663     for (i = 0; i < oprsz / 8; i++) {
1664         uint64_t nn = n[i];
1665         int64_t mm = m[i];
1666         uint64_t dd = nn + mm;
1667 
1668         if (mm < 0) {
1669             if (nn < (uint64_t)-mm) {
1670                 dd = 0;
1671                 q = true;
1672             }
1673         } else {
1674             if (dd < nn) {
1675                 dd = UINT64_MAX;
1676                 q = true;
1677             }
1678         }
1679         d[i] = dd;
1680     }
1681     if (q) {
1682         uint32_t *qc = vq;
1683         qc[0] = 1;
1684     }
1685     clear_tail(d, oprsz, simd_maxsz(desc));
1686 }
1687 
1688 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1689                            void *vm, uint32_t desc)
1690 {
1691     intptr_t i, oprsz = simd_oprsz(desc);
1692     uint64_t *d = vd, *n = vn, *m = vm;
1693     bool q = false;
1694 
1695     for (i = 0; i < oprsz / 8; i++) {
1696         int64_t nn = n[i];
1697         uint64_t mm = m[i];
1698         int64_t dd = nn + mm;
1699 
1700         if (mm > (uint64_t)(INT64_MAX - nn)) {
1701             dd = INT64_MAX;
1702             q = true;
1703         }
1704         d[i] = dd;
1705     }
1706     if (q) {
1707         uint32_t *qc = vq;
1708         qc[0] = 1;
1709     }
1710     clear_tail(d, oprsz, simd_maxsz(desc));
1711 }
1712 
1713 #define DO_SRA(NAME, TYPE)                              \
1714 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1715 {                                                       \
1716     intptr_t i, oprsz = simd_oprsz(desc);               \
1717     int shift = simd_data(desc);                        \
1718     TYPE *d = vd, *n = vn;                              \
1719     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1720         d[i] += n[i] >> shift;                          \
1721     }                                                   \
1722     clear_tail(d, oprsz, simd_maxsz(desc));             \
1723 }
1724 
1725 DO_SRA(gvec_ssra_b, int8_t)
1726 DO_SRA(gvec_ssra_h, int16_t)
1727 DO_SRA(gvec_ssra_s, int32_t)
1728 DO_SRA(gvec_ssra_d, int64_t)
1729 
1730 DO_SRA(gvec_usra_b, uint8_t)
1731 DO_SRA(gvec_usra_h, uint16_t)
1732 DO_SRA(gvec_usra_s, uint32_t)
1733 DO_SRA(gvec_usra_d, uint64_t)
1734 
1735 #undef DO_SRA
1736 
1737 #define DO_RSHR(NAME, TYPE)                             \
1738 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1739 {                                                       \
1740     intptr_t i, oprsz = simd_oprsz(desc);               \
1741     int shift = simd_data(desc);                        \
1742     TYPE *d = vd, *n = vn;                              \
1743     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1744         TYPE tmp = n[i] >> (shift - 1);                 \
1745         d[i] = (tmp >> 1) + (tmp & 1);                  \
1746     }                                                   \
1747     clear_tail(d, oprsz, simd_maxsz(desc));             \
1748 }
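/*
 * The rounding above is equivalent to adding 1 << (shift - 1) before
 * shifting, but cannot overflow the element type.  E.g. for an 8-bit
 * URSHR #2 of 0x07: tmp = 0x07 >> 1 = 3; result = (3 >> 1) + (3 & 1)
 * = 2, matching round(7 / 4).
 */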
1749 
1750 DO_RSHR(gvec_srshr_b, int8_t)
1751 DO_RSHR(gvec_srshr_h, int16_t)
1752 DO_RSHR(gvec_srshr_s, int32_t)
1753 DO_RSHR(gvec_srshr_d, int64_t)
1754 
1755 DO_RSHR(gvec_urshr_b, uint8_t)
1756 DO_RSHR(gvec_urshr_h, uint16_t)
1757 DO_RSHR(gvec_urshr_s, uint32_t)
1758 DO_RSHR(gvec_urshr_d, uint64_t)
1759 
1760 #undef DO_RSHR
1761 
1762 #define DO_RSRA(NAME, TYPE)                             \
1763 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1764 {                                                       \
1765     intptr_t i, oprsz = simd_oprsz(desc);               \
1766     int shift = simd_data(desc);                        \
1767     TYPE *d = vd, *n = vn;                              \
1768     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1769         TYPE tmp = n[i] >> (shift - 1);                 \
1770         d[i] += (tmp >> 1) + (tmp & 1);                 \
1771     }                                                   \
1772     clear_tail(d, oprsz, simd_maxsz(desc));             \
1773 }
1774 
1775 DO_RSRA(gvec_srsra_b, int8_t)
1776 DO_RSRA(gvec_srsra_h, int16_t)
1777 DO_RSRA(gvec_srsra_s, int32_t)
1778 DO_RSRA(gvec_srsra_d, int64_t)
1779 
1780 DO_RSRA(gvec_ursra_b, uint8_t)
1781 DO_RSRA(gvec_ursra_h, uint16_t)
1782 DO_RSRA(gvec_ursra_s, uint32_t)
1783 DO_RSRA(gvec_ursra_d, uint64_t)
1784 
1785 #undef DO_RSRA
1786 
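/*
 * SRI (shift right and insert) replaces only the low
 * 8 * sizeof(TYPE) - shift bits of each destination element, keeping
 * the top shift bits of d.  deposit64(d, 0, len, val) does exactly
 * that: it overwrites the len-bit field at bit 0 of d with val.
 * SLI below is the mirror image, depositing at bit position shift.
 */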
1787 #define DO_SRI(NAME, TYPE)                              \
1788 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1789 {                                                       \
1790     intptr_t i, oprsz = simd_oprsz(desc);               \
1791     int shift = simd_data(desc);                        \
1792     TYPE *d = vd, *n = vn;                              \
1793     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1794         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1795     }                                                   \
1796     clear_tail(d, oprsz, simd_maxsz(desc));             \
1797 }
1798 
1799 DO_SRI(gvec_sri_b, uint8_t)
1800 DO_SRI(gvec_sri_h, uint16_t)
1801 DO_SRI(gvec_sri_s, uint32_t)
1802 DO_SRI(gvec_sri_d, uint64_t)
1803 
1804 #undef DO_SRI
1805 
1806 #define DO_SLI(NAME, TYPE)                              \
1807 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1808 {                                                       \
1809     intptr_t i, oprsz = simd_oprsz(desc);               \
1810     int shift = simd_data(desc);                        \
1811     TYPE *d = vd, *n = vn;                              \
1812     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1813         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1814     }                                                   \
1815     clear_tail(d, oprsz, simd_maxsz(desc));             \
1816 }
1817 
1818 DO_SLI(gvec_sli_b, uint8_t)
1819 DO_SLI(gvec_sli_h, uint16_t)
1820 DO_SLI(gvec_sli_s, uint32_t)
1821 DO_SLI(gvec_sli_d, uint64_t)
1822 
1823 #undef DO_SLI
1824 
1825 /*
1826  * Convert float16 to float32, raising no exceptions and
1827  * preserving exceptional values, including SNaN.
1828  * This is effectively an unpack+repack operation.
1829  */
1830 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1831 {
1832     const int f16_bias = 15;
1833     const int f32_bias = 127;
1834     uint32_t sign = extract32(f16, 15, 1);
1835     uint32_t exp = extract32(f16, 10, 5);
1836     uint32_t frac = extract32(f16, 0, 10);
1837 
1838     if (exp == 0x1f) {
1839         /* Inf or NaN */
1840         exp = 0xff;
1841     } else if (exp == 0) {
1842         /* Zero or denormal.  */
1843         if (frac != 0) {
1844             if (fz16) {
1845                 frac = 0;
1846             } else {
1847                 /*
1848                  * Denormal; these are all normal float32.
1849                  * Shift the fraction so that the msb is at bit 11,
1850                  * then remove bit 11 as the implicit bit of the
1851                  * normalized float32.  Note that we still go through
1852                  * the shift for normal numbers below, to put the
1853                  * float32 fraction at the right place.
1854                  */
1855                 int shift = clz32(frac) - 21;
1856                 frac = (frac << shift) & 0x3ff;
1857                 exp = f32_bias - f16_bias - shift + 1;
1858             }
1859         }
1860     } else {
1861         /* Normal number; adjust the bias.  */
1862         exp += f32_bias - f16_bias;
1863     }
1864     sign <<= 31;
1865     exp <<= 23;
1866     frac <<= 23 - 10;
1867 
1868     return sign | exp | frac;
1869 }
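/*
 * Worked example (illustrative): the smallest f16 denormal 0x0001
 * has frac = 1, so shift = clz32(1) - 21 = 10, frac becomes
 * (1 << 10) & 0x3ff = 0, and exp = 127 - 15 - 10 + 1 = 103, giving
 * the float32 0x33800000 == 2^-24, as expected.
 */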
1870 
1871 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
1872 {
1873     /*
1874      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
1875      * Load the 2nd qword iff is_q & is_2.
1876      * Shift to the 2nd dword iff !is_q & is_2.
1877      * For !is_q & !is_2, the upper bits of the result are garbage.
1878      */
1879     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1880 }
1881 
1882 /*
1883  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1884  * as there is not yet SVE versions that might use blocking.
1885  */
1886 
1887 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1888                      uint32_t desc, bool fz16)
1889 {
1890     intptr_t i, oprsz = simd_oprsz(desc);
1891     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1892     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1893     int is_q = oprsz == 16;
1894     uint64_t n_4, m_4;
1895 
1896     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1897     n_4 = load4_f16(vn, is_q, is_2);
1898     m_4 = load4_f16(vm, is_q, is_2);
1899 
1900     /* Negate all inputs for FMLSL at once.  */
1901     if (is_s) {
1902         n_4 ^= 0x8000800080008000ull;
1903     }
1904 
1905     for (i = 0; i < oprsz / 4; i++) {
1906         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1907         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1908         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1909     }
1910     clear_tail(d, oprsz, simd_maxsz(desc));
1911 }
1912 
1913 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1914                             void *venv, uint32_t desc)
1915 {
1916     CPUARMState *env = venv;
1917     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1918              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1919 }
1920 
1921 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1922                             void *venv, uint32_t desc)
1923 {
1924     CPUARMState *env = venv;
1925     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1926              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1927 }
1928 
1929 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1930                                void *venv, uint32_t desc)
1931 {
1932     intptr_t i, oprsz = simd_oprsz(desc);
1933     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1934     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1935     CPUARMState *env = venv;
1936     float_status *status = &env->vfp.fp_status;
1937     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1938 
1939     for (i = 0; i < oprsz; i += sizeof(float32)) {
1940         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1941         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1942         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1943         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1944         float32 aa = *(float32 *)(va + H1_4(i));
1945 
1946         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1947     }
1948 }
1949 
1950 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1951                          uint32_t desc, bool fz16)
1952 {
1953     intptr_t i, oprsz = simd_oprsz(desc);
1954     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1955     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1956     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1957     int is_q = oprsz == 16;
1958     uint64_t n_4;
1959     float32 m_1;
1960 
1961     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1962     n_4 = load4_f16(vn, is_q, is_2);
1963 
1964     /* Negate all inputs for FMLSL at once.  */
1965     if (is_s) {
1966         n_4 ^= 0x8000800080008000ull;
1967     }
1968 
1969     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1970 
1971     for (i = 0; i < oprsz / 4; i++) {
1972         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1973         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1974     }
1975     clear_tail(d, oprsz, simd_maxsz(desc));
1976 }
1977 
1978 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1979                                 void *venv, uint32_t desc)
1980 {
1981     CPUARMState *env = venv;
1982     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1983                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1984 }
1985 
1986 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1987                                 void *venv, uint32_t desc)
1988 {
1989     CPUARMState *env = venv;
1990     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1991                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1992 }
1993 
1994 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1995                                void *venv, uint32_t desc)
1996 {
1997     intptr_t i, j, oprsz = simd_oprsz(desc);
1998     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1999     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2000     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2001     CPUARMState *env = venv;
2002     float_status *status = &env->vfp.fp_status;
2003     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2004 
2005     for (i = 0; i < oprsz; i += 16) {
2006         float16 mm_16 = *(float16 *)(vm + i + idx);
2007         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2008 
2009         for (j = 0; j < 16; j += sizeof(float32)) {
2010             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2011             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2012             float32 aa = *(float32 *)(va + H1_4(i + j));
2013 
2014             *(float32 *)(vd + H1_4(i + j)) =
2015                 float32_muladd(nn, mm, aa, 0, status);
2016         }
2017     }
2018 }
2019 
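/*
 * Variable vector shifts, for Neon [US]SHL.  The shift count is the
 * signed low byte of each element of Vm: non-negative counts shift
 * left (yielding zero once the count reaches the element width),
 * negative counts shift right, with the signed forms clamping the
 * count so that e.g. -8..-128 all produce the sign-fill result for
 * byte elements.
 */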
2020 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2021 {
2022     intptr_t i, opr_sz = simd_oprsz(desc);
2023     int8_t *d = vd, *n = vn, *m = vm;
2024 
2025     for (i = 0; i < opr_sz; ++i) {
2026         int8_t mm = m[i];
2027         int8_t nn = n[i];
2028         int8_t res = 0;
2029         if (mm >= 0) {
2030             if (mm < 8) {
2031                 res = nn << mm;
2032             }
2033         } else {
2034             res = nn >> (mm > -8 ? -mm : 7);
2035         }
2036         d[i] = res;
2037     }
2038     clear_tail(d, opr_sz, simd_maxsz(desc));
2039 }
2040 
2041 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2042 {
2043     intptr_t i, opr_sz = simd_oprsz(desc);
2044     int16_t *d = vd, *n = vn, *m = vm;
2045 
2046     for (i = 0; i < opr_sz / 2; ++i) {
2047         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2048         int16_t nn = n[i];
2049         int16_t res = 0;
2050         if (mm >= 0) {
2051             if (mm < 16) {
2052                 res = nn << mm;
2053             }
2054         } else {
2055             res = nn >> (mm > -16 ? -mm : 15);
2056         }
2057         d[i] = res;
2058     }
2059     clear_tail(d, opr_sz, simd_maxsz(desc));
2060 }
2061 
2062 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2063 {
2064     intptr_t i, opr_sz = simd_oprsz(desc);
2065     uint8_t *d = vd, *n = vn, *m = vm;
2066 
2067     for (i = 0; i < opr_sz; ++i) {
2068         int8_t mm = m[i];
2069         uint8_t nn = n[i];
2070         uint8_t res = 0;
2071         if (mm >= 0) {
2072             if (mm < 8) {
2073                 res = nn << mm;
2074             }
2075         } else {
2076             if (mm > -8) {
2077                 res = nn >> -mm;
2078             }
2079         }
2080         d[i] = res;
2081     }
2082     clear_tail(d, opr_sz, simd_maxsz(desc));
2083 }
2084 
2085 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2086 {
2087     intptr_t i, opr_sz = simd_oprsz(desc);
2088     uint16_t *d = vd, *n = vn, *m = vm;
2089 
2090     for (i = 0; i < opr_sz / 2; ++i) {
2091         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2092         uint16_t nn = n[i];
2093         uint16_t res = 0;
2094         if (mm >= 0) {
2095             if (mm < 16) {
2096                 res = nn << mm;
2097             }
2098         } else {
2099             if (mm > -16) {
2100                 res = nn >> -mm;
2101             }
2102         }
2103         d[i] = res;
2104     }
2105     clear_tail(d, opr_sz, simd_maxsz(desc));
2106 }
2107 
2108 /*
2109  * 8x8->8 polynomial multiply.
2110  *
2111  * Polynomial multiplication is like integer multiplication except the
2112  * partial products are XORed, not added.
2113  *
2114  * TODO: expose this as a generic vector operation, as it is a common
2115  * crypto building block.
2116  */
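/*
 * For example, over GF(2): 0b11 * 0b11 = (x + 1)^2 = x^2 + 1 = 0b101,
 * whereas integer multiplication of the same values gives 9 (0b1001);
 * the cross terms x + x cancel under XOR instead of carrying.
 */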
2117 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2118 {
2119     intptr_t i, opr_sz = simd_oprsz(desc);
2120     uint64_t *d = vd, *n = vn, *m = vm;
2121 
2122     for (i = 0; i < opr_sz / 8; ++i) {
2123         d[i] = clmul_8x8_low(n[i], m[i]);
2124     }
2125     clear_tail(d, opr_sz, simd_maxsz(desc));
2126 }
2127 
2128 /*
2129  * 64x64->128 polynomial multiply.
2130  * Because the lanes are not accessed in strict columns,
2131  * this probably cannot be turned into a generic helper.
2132  */
2133 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2134 {
2135     intptr_t i, opr_sz = simd_oprsz(desc);
2136     intptr_t hi = simd_data(desc);
2137     uint64_t *d = vd, *n = vn, *m = vm;
2138 
2139     for (i = 0; i < opr_sz / 8; i += 2) {
2140         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2141         d[i] = int128_getlo(r);
2142         d[i + 1] = int128_gethi(r);
2143     }
2144     clear_tail(d, opr_sz, simd_maxsz(desc));
2145 }
2146 
2147 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2148 {
2149     int hi = simd_data(desc);
2150     uint64_t *d = vd, *n = vn, *m = vm;
2151     uint64_t nn = n[hi], mm = m[hi];
2152 
2153     d[0] = clmul_8x4_packed(nn, mm);
2154     nn >>= 32;
2155     mm >>= 32;
2156     d[1] = clmul_8x4_packed(nn, mm);
2157 
2158     clear_tail(d, 16, simd_maxsz(desc));
2159 }
2160 
2161 #ifdef TARGET_AARCH64
2162 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2163 {
2164     int shift = simd_data(desc) * 8;
2165     intptr_t i, opr_sz = simd_oprsz(desc);
2166     uint64_t *d = vd, *n = vn, *m = vm;
2167 
2168     for (i = 0; i < opr_sz / 8; ++i) {
2169         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2170     }
2171 }
2172 
2173 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2174 {
2175     intptr_t sel = H4(simd_data(desc));
2176     intptr_t i, opr_sz = simd_oprsz(desc);
2177     uint32_t *n = vn, *m = vm;
2178     uint64_t *d = vd;
2179 
2180     for (i = 0; i < opr_sz / 8; ++i) {
2181         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2182     }
2183 }
2184 #endif
2185 
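/*
 * Compare-against-zero: the C comparison yields 0 or 1, and negating
 * that produces the all-zeros or all-ones mask the instruction
 * requires, e.g. -(nn > 0) is 0xff for a true int8_t result.
 */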
2186 #define DO_CMP0(NAME, TYPE, OP)                         \
2187 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2188 {                                                       \
2189     intptr_t i, opr_sz = simd_oprsz(desc);              \
2190     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2191         TYPE nn = *(TYPE *)(vn + i);                    \
2192         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2193     }                                                   \
2194     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2195 }
2196 
2197 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2198 DO_CMP0(gvec_clt0_b, int8_t, <)
2199 DO_CMP0(gvec_cle0_b, int8_t, <=)
2200 DO_CMP0(gvec_cgt0_b, int8_t, >)
2201 DO_CMP0(gvec_cge0_b, int8_t, >=)
2202 
2203 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2204 DO_CMP0(gvec_clt0_h, int16_t, <)
2205 DO_CMP0(gvec_cle0_h, int16_t, <=)
2206 DO_CMP0(gvec_cgt0_h, int16_t, >)
2207 DO_CMP0(gvec_cge0_h, int16_t, >=)
2208 
2209 #undef DO_CMP0
2210 
2211 #define DO_ABD(NAME, TYPE)                                      \
2212 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2213 {                                                               \
2214     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2215     TYPE *d = vd, *n = vn, *m = vm;                             \
2216                                                                 \
2217     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2218         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2219     }                                                           \
2220     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2221 }
2222 
2223 DO_ABD(gvec_sabd_b, int8_t)
2224 DO_ABD(gvec_sabd_h, int16_t)
2225 DO_ABD(gvec_sabd_s, int32_t)
2226 DO_ABD(gvec_sabd_d, int64_t)
2227 
2228 DO_ABD(gvec_uabd_b, uint8_t)
2229 DO_ABD(gvec_uabd_h, uint16_t)
2230 DO_ABD(gvec_uabd_s, uint32_t)
2231 DO_ABD(gvec_uabd_d, uint64_t)
2232 
2233 #undef DO_ABD
2234 
2235 #define DO_ABA(NAME, TYPE)                                      \
2236 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2237 {                                                               \
2238     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2239     TYPE *d = vd, *n = vn, *m = vm;                             \
2240                                                                 \
2241     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2242         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2243     }                                                           \
2244     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2245 }
2246 
2247 DO_ABA(gvec_saba_b, int8_t)
2248 DO_ABA(gvec_saba_h, int16_t)
2249 DO_ABA(gvec_saba_s, int32_t)
2250 DO_ABA(gvec_saba_d, int64_t)
2251 
2252 DO_ABA(gvec_uaba_b, uint8_t)
2253 DO_ABA(gvec_uaba_h, uint16_t)
2254 DO_ABA(gvec_uaba_s, uint32_t)
2255 DO_ABA(gvec_uaba_d, uint64_t)
2256 
2257 #undef DO_ABA
2258 
2259 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2260 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
2261 {                                                                          \
2262     ARMVectorReg scratch;                                                  \
2263     intptr_t oprsz = simd_oprsz(desc);                                     \
2264     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2265     TYPE *d = vd, *n = vn, *m = vm;                                        \
2266     if (unlikely(d == m)) {                                                \
2267         m = memcpy(&scratch, m, oprsz);                                    \
2268     }                                                                      \
2269     for (intptr_t i = 0; i < half; ++i) {                                  \
2270         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2271     }                                                                      \
2272     for (intptr_t i = 0; i < half; ++i) {                                  \
2273         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2274     }                                                                      \
2275     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2276 }
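/*
 * Only the d == m overlap needs the scratch copy above: in the first
 * loop the write to d trails the reads from n (element i is written
 * only after elements 2i and 2i + 1 have been read), so d == n is
 * safe, while the second loop reads all of m after the first half of
 * d has already been overwritten.
 */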
2277 
2278 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2279 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2280 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2281 
2282 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2283 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2284 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2285 
2286 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2287 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2288 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2289 
2290 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2291 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2292 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2293 
2294 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2295 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2296 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2297 
2298 #undef DO_3OP_PAIR
2299 
2300 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2301 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2302 {                                                               \
2303     ARMVectorReg scratch;                                       \
2304     intptr_t oprsz = simd_oprsz(desc);                          \
2305     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2306     TYPE *d = vd, *n = vn, *m = vm;                             \
2307     if (unlikely(d == m)) {                                     \
2308         m = memcpy(&scratch, m, oprsz);                         \
2309     }                                                           \
2310     for (intptr_t i = 0; i < half; ++i) {                       \
2311         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2312     }                                                           \
2313     for (intptr_t i = 0; i < half; ++i) {                       \
2314         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2315     }                                                           \
2316     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2317 }
2318 
2319 #define ADD(A, B) (A + B)
2320 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2321 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2322 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2323 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2324 #undef  ADD
2325 
2326 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2327 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2328 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2329 
2330 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2331 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2332 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2333 
2334 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2335 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2336 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2337 
2338 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2339 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2340 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2341 
2342 #undef DO_3OP_PAIR
2343 
2344 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2345     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2346     {                                                                   \
2347         intptr_t i, oprsz = simd_oprsz(desc);                           \
2348         int shift = simd_data(desc);                                    \
2349         TYPE *d = vd, *n = vn;                                          \
2350         float_status *fpst = stat;                                      \
2351         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2352             d[i] = FUNC(n[i], shift, fpst);                             \
2353         }                                                               \
2354         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2355     }
2356 
2357 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2358 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2359 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2360 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2361 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2362 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2363 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2364 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2365 
2366 #undef DO_VCVT_FIXED
2367 
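/*
 * The _RMODE variants temporarily install the rounding mode carried
 * in simd_data(desc) into the float_status, run the conversion with
 * a zero fixed-point shift, then restore the caller's rounding mode
 * so the shared status is left unchanged.
 */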
2368 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2369     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2370     {                                                                   \
2371         float_status *fpst = stat;                                      \
2372         intptr_t i, oprsz = simd_oprsz(desc);                           \
2373         uint32_t rmode = simd_data(desc);                               \
2374         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2375         TYPE *d = vd, *n = vn;                                          \
2376         set_float_rounding_mode(rmode, fpst);                           \
2377         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2378             d[i] = FUNC(n[i], 0, fpst);                                 \
2379         }                                                               \
2380         set_float_rounding_mode(prev_rmode, fpst);                      \
2381         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2382     }
2383 
2384 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2385 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2386 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2387 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2388 
2389 #undef DO_VCVT_RMODE
2390 
2391 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2392     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2393     {                                                                   \
2394         float_status *fpst = stat;                                      \
2395         intptr_t i, oprsz = simd_oprsz(desc);                           \
2396         uint32_t rmode = simd_data(desc);                               \
2397         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2398         TYPE *d = vd, *n = vn;                                          \
2399         set_float_rounding_mode(rmode, fpst);                           \
2400         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2401             d[i] = FUNC(n[i], fpst);                                    \
2402         }                                                               \
2403         set_float_rounding_mode(prev_rmode, fpst);                      \
2404         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2405     }
2406 
2407 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2408 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2409 
2410 #undef DO_VRINT_RMODE
2411 
2412 #ifdef TARGET_AARCH64
2413 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2414 {
2415     const uint8_t *indices = vm;
2416     CPUARMState *env = venv;
2417     size_t oprsz = simd_oprsz(desc);
2418     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2419     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2420     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2421     union {
2422         uint8_t b[16];
2423         uint64_t d[2];
2424     } result;
2425 
2426     /*
2427      * We must construct the final result in a temp, lest the output
2428      * overlaps the input table.  For TBL, begin with zero; for TBX,
2429      * begin with the original register contents.  Note that we always
2430      * copy 16 bytes here to avoid an extra branch; clearing the high
2431      * bits of the register for oprsz == 8 is handled below.
2432      */
2433     if (is_tbx) {
2434         memcpy(&result, vd, 16);
2435     } else {
2436         memset(&result, 0, 16);
2437     }
2438 
2439     for (size_t i = 0; i < oprsz; ++i) {
2440         uint32_t index = indices[H1(i)];
2441 
2442         if (index < table_len) {
2443             /*
2444              * Convert index (a byte offset into the virtual table
2445              * which is a series of 128-bit vectors concatenated)
2446              * into the correct register element, bearing in mind
2447              * that the table can wrap around from V31 to V0.
2448              */
2449             const uint8_t *table = (const uint8_t *)
2450                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2451             result.b[H1(i)] = table[H1(index % 16)];
2452         }
2453     }
2454 
2455     memcpy(vd, &result, 16);
2456     clear_tail(vd, oprsz, simd_maxsz(desc));
2457 }
2458 #endif
2459 
2460 /*
2461  * NxN -> N highpart multiply
2462  *
2463  * TODO: expose this as a generic vector operation.
2464  */
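/*
 * E.g. for bytes: (int32_t)-128 * -128 = 16384 (0x4000); the high
 * half, 0x40, is the result, which a plain 8-bit multiply would lose.
 */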
2465 
2466 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2467 {
2468     intptr_t i, opr_sz = simd_oprsz(desc);
2469     int8_t *d = vd, *n = vn, *m = vm;
2470 
2471     for (i = 0; i < opr_sz; ++i) {
2472         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2473     }
2474     clear_tail(d, opr_sz, simd_maxsz(desc));
2475 }
2476 
2477 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2478 {
2479     intptr_t i, opr_sz = simd_oprsz(desc);
2480     int16_t *d = vd, *n = vn, *m = vm;
2481 
2482     for (i = 0; i < opr_sz / 2; ++i) {
2483         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2484     }
2485     clear_tail(d, opr_sz, simd_maxsz(desc));
2486 }
2487 
2488 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2489 {
2490     intptr_t i, opr_sz = simd_oprsz(desc);
2491     int32_t *d = vd, *n = vn, *m = vm;
2492 
2493     for (i = 0; i < opr_sz / 4; ++i) {
2494         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2495     }
2496     clear_tail(d, opr_sz, simd_maxsz(desc));
2497 }
2498 
2499 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2500 {
2501     intptr_t i, opr_sz = simd_oprsz(desc);
2502     uint64_t *d = vd, *n = vn, *m = vm;
2503     uint64_t discard;
2504 
2505     for (i = 0; i < opr_sz / 8; ++i) {
2506         muls64(&discard, &d[i], n[i], m[i]);
2507     }
2508     clear_tail(d, opr_sz, simd_maxsz(desc));
2509 }
2510 
2511 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2512 {
2513     intptr_t i, opr_sz = simd_oprsz(desc);
2514     uint8_t *d = vd, *n = vn, *m = vm;
2515 
2516     for (i = 0; i < opr_sz; ++i) {
2517         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2518     }
2519     clear_tail(d, opr_sz, simd_maxsz(desc));
2520 }
2521 
2522 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2523 {
2524     intptr_t i, opr_sz = simd_oprsz(desc);
2525     uint16_t *d = vd, *n = vn, *m = vm;
2526 
2527     for (i = 0; i < opr_sz / 2; ++i) {
2528         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2529     }
2530     clear_tail(d, opr_sz, simd_maxsz(desc));
2531 }
2532 
2533 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2534 {
2535     intptr_t i, opr_sz = simd_oprsz(desc);
2536     uint32_t *d = vd, *n = vn, *m = vm;
2537 
2538     for (i = 0; i < opr_sz / 4; ++i) {
2539         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2540     }
2541     clear_tail(d, opr_sz, simd_maxsz(desc));
2542 }
2543 
2544 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2545 {
2546     intptr_t i, opr_sz = simd_oprsz(desc);
2547     uint64_t *d = vd, *n = vn, *m = vm;
2548     uint64_t discard;
2549 
2550     for (i = 0; i < opr_sz / 8; ++i) {
2551         mulu64(&discard, &d[i], n[i], m[i]);
2552     }
2553     clear_tail(d, opr_sz, simd_maxsz(desc));
2554 }
2555 
2556 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2557 {
2558     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2559     int shr = simd_data(desc);
2560     uint64_t *d = vd, *n = vn, *m = vm;
2561 
2562     for (i = 0; i < opr_sz; ++i) {
2563         d[i] = ror64(n[i] ^ m[i], shr);
2564     }
2565     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2566 }
2567 
2568 /*
2569  * Integer matrix-multiply accumulate
2570  */
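/*
 * Each 128-bit segment is treated as a 2x2 matrix of int32 results:
 * the rows of the first operand and of the second are consecutive
 * 8-byte vectors, and each result element is an 8-way dot product
 * summed into the accumulator, i.e. roughly:
 *
 *  d[i][j] = a[i][j] + sum(k = 0..7, n[i][k] * m[j][k])
 */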
2571 
2572 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2573 {
2574     int8_t *n = vn, *m = vm;
2575 
2576     for (intptr_t k = 0; k < 8; ++k) {
2577         sum += n[H1(k)] * m[H1(k)];
2578     }
2579     return sum;
2580 }
2581 
2582 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2583 {
2584     uint8_t *n = vn, *m = vm;
2585 
2586     for (intptr_t k = 0; k < 8; ++k) {
2587         sum += n[H1(k)] * m[H1(k)];
2588     }
2589     return sum;
2590 }
2591 
2592 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2593 {
2594     uint8_t *n = vn;
2595     int8_t *m = vm;
2596 
2597     for (intptr_t k = 0; k < 8; ++k) {
2598         sum += n[H1(k)] * m[H1(k)];
2599     }
2600     return sum;
2601 }
2602 
2603 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2604                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2605 {
2606     intptr_t seg, opr_sz = simd_oprsz(desc);
2607 
2608     for (seg = 0; seg < opr_sz; seg += 16) {
2609         uint32_t *d = vd + seg;
2610         uint32_t *a = va + seg;
2611         uint32_t sum0, sum1, sum2, sum3;
2612 
2613         /*
2614          * Process the entire segment at once, writing back the
2615          * results only after we've consumed all of the inputs.
2616          *
2617          * Key to indices by column:
2618          *          i   j                  i             j
2619          */
2620         sum0 = a[H4(0 + 0)];
2621         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2622         sum1 = a[H4(0 + 1)];
2623         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2624         sum2 = a[H4(2 + 0)];
2625         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2626         sum3 = a[H4(2 + 1)];
2627         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2628 
2629         d[H4(0)] = sum0;
2630         d[H4(1)] = sum1;
2631         d[H4(2)] = sum2;
2632         d[H4(3)] = sum3;
2633     }
2634     clear_tail(vd, opr_sz, simd_maxsz(desc));
2635 }
2636 
2637 #define DO_MMLA_B(NAME, INNER) \
2638     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2639     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2640 
2641 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2642 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2643 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2644 
2645 /*
2646  * BFloat16 Dot Product
2647  */
2648 
2649 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2650 {
2651     /* FPCR is ignored for BFDOT and BFMMLA. */
2652     float_status bf_status = {
2653         .tininess_before_rounding = float_tininess_before_rounding,
2654         .float_rounding_mode = float_round_to_odd_inf,
2655         .flush_to_zero = true,
2656         .flush_inputs_to_zero = true,
2657         .default_nan_mode = true,
2658     };
2659     float32 t1, t2;
2660 
2661     /*
2662      * Extract each BFloat16 from the element pair, and shift
2663      * them such that they become float32.
2664      */
2665     t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2666     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2667     t1 = float32_add(t1, t2, &bf_status);
2668     t1 = float32_add(sum, t1, &bf_status);
2669 
2670     return t1;
2671 }
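/*
 * A bfloat16 value is exactly the high 16 bits of the corresponding
 * float32, so "e << 16" and "e & 0xffff0000u" above widen the even
 * and odd halves of the element pair to float32 for free, without
 * any conversion routine.
 */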
2672 
2673 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2674 {
2675     intptr_t i, opr_sz = simd_oprsz(desc);
2676     float32 *d = vd, *a = va;
2677     uint32_t *n = vn, *m = vm;
2678 
2679     for (i = 0; i < opr_sz / 4; ++i) {
2680         d[i] = bfdotadd(a[i], n[i], m[i]);
2681     }
2682     clear_tail(d, opr_sz, simd_maxsz(desc));
2683 }
2684 
2685 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2686                             void *va, uint32_t desc)
2687 {
2688     intptr_t i, j, opr_sz = simd_oprsz(desc);
2689     intptr_t index = simd_data(desc);
2690     intptr_t elements = opr_sz / 4;
2691     intptr_t eltspersegment = MIN(16 / 4, elements);
2692     float32 *d = vd, *a = va;
2693     uint32_t *n = vn, *m = vm;
2694 
2695     for (i = 0; i < elements; i += eltspersegment) {
2696         uint32_t m_idx = m[i + H4(index)];
2697 
2698         for (j = i; j < i + eltspersegment; j++) {
2699             d[j] = bfdotadd(a[j], n[j], m_idx);
2700         }
2701     }
2702     clear_tail(d, opr_sz, simd_maxsz(desc));
2703 }
2704 
2705 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2706 {
2707     intptr_t s, opr_sz = simd_oprsz(desc);
2708     float32 *d = vd, *a = va;
2709     uint32_t *n = vn, *m = vm;
2710 
2711     for (s = 0; s < opr_sz / 4; s += 4) {
2712         float32 sum00, sum01, sum10, sum11;
2713 
2714         /*
2715          * Process the entire segment at once, writing back the
2716          * results only after we've consumed all of the inputs.
2717          *
2718          * Key to indices by column:
2719          *               i   j           i   k             j   k
2720          */
2721         sum00 = a[s + H4(0 + 0)];
2722         sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2723         sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2724 
2725         sum01 = a[s + H4(0 + 1)];
2726         sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2727         sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2728 
2729         sum10 = a[s + H4(2 + 0)];
2730         sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2731         sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2732 
2733         sum11 = a[s + H4(2 + 1)];
2734         sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2735         sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2736 
2737         d[s + H4(0 + 0)] = sum00;
2738         d[s + H4(0 + 1)] = sum01;
2739         d[s + H4(2 + 0)] = sum10;
2740         d[s + H4(2 + 1)] = sum11;
2741     }
2742     clear_tail(d, opr_sz, simd_maxsz(desc));
2743 }
2744 
2745 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2746                          void *stat, uint32_t desc)
2747 {
2748     intptr_t i, opr_sz = simd_oprsz(desc);
2749     intptr_t sel = simd_data(desc);
2750     float32 *d = vd, *a = va;
2751     bfloat16 *n = vn, *m = vm;
2752 
2753     for (i = 0; i < opr_sz / 4; ++i) {
2754         float32 nn = n[H2(i * 2 + sel)] << 16;
2755         float32 mm = m[H2(i * 2 + sel)] << 16;
2756         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2757     }
2758     clear_tail(d, opr_sz, simd_maxsz(desc));
2759 }
2760 
2761 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2762                              void *va, void *stat, uint32_t desc)
2763 {
2764     intptr_t i, j, opr_sz = simd_oprsz(desc);
2765     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2766     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2767     intptr_t elements = opr_sz / 4;
2768     intptr_t eltspersegment = MIN(16 / 4, elements);
2769     float32 *d = vd, *a = va;
2770     bfloat16 *n = vn, *m = vm;
2771 
2772     for (i = 0; i < elements; i += eltspersegment) {
2773         float32 m_idx = m[H2(2 * i + index)] << 16;
2774 
2775         for (j = i; j < i + eltspersegment; j++) {
2776             float32 n_j = n[H2(2 * j + sel)] << 16;
2777             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2778         }
2779     }
2780     clear_tail(d, opr_sz, simd_maxsz(desc));
2781 }
2782 
2783 #define DO_CLAMP(NAME, TYPE) \
2784 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
2785 {                                                                       \
2786     intptr_t i, opr_sz = simd_oprsz(desc);                              \
2787     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
2788         TYPE aa = *(TYPE *)(a + i);                                     \
2789         TYPE nn = *(TYPE *)(n + i);                                     \
2790         TYPE mm = *(TYPE *)(m + i);                                     \
2791         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
2792         *(TYPE *)(d + i) = dd;                                          \
2793     }                                                                   \
2794     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
2795 }
2796 
2797 DO_CLAMP(gvec_sclamp_b, int8_t)
2798 DO_CLAMP(gvec_sclamp_h, int16_t)
2799 DO_CLAMP(gvec_sclamp_s, int32_t)
2800 DO_CLAMP(gvec_sclamp_d, int64_t)
2801 
2802 DO_CLAMP(gvec_uclamp_b, uint8_t)
2803 DO_CLAMP(gvec_uclamp_h, uint16_t)
2804 DO_CLAMP(gvec_uclamp_s, uint32_t)
2805 DO_CLAMP(gvec_uclamp_d, uint64_t)
2806