xref: /openbmc/qemu/target/arm/tcg/vec_helper.c (revision a13f9fb5)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 * Entry i has byte j equal to 0xff exactly when bit j of i is set, and
 * 0x00 otherwise.
 *
 * The table can be regenerated with:
 *
 *  for (i = 0; i < 256; ++i) {
 *      uint64_t m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= (uint64_t)0xff << (j << 3);
 *          }
 *      }
 *      printf("0x%016" PRIx64 ",\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
130 
/*
 * Similarly for half-word elements: only the even-numbered bits of the
 * index select a lane, so indexes with any bit of 0xaa set are skipped
 * by the generator, and the table stops at 0x55.  Slots that are not
 * listed below (including [0x00]) are zero-initialized.
 *
 *  for (i = 0; i < 256; ++i) {
 *      uint64_t m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= (uint64_t)0xffff << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016" PRIx64 ",\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
156 
/*
 * Signed saturating rounding doubling multiply-accumulate high half, 8-bit.
 *
 * Computes
 *     ((src3 << 8) + ((src1 * src2) << 1) + (round << 7)) >> 8
 * in the reduced form
 *     ((src3 << 7) + (src1 * src2) + (round << 6)) >> 7
 * with the product negated first when @neg is set, and clamps the
 * result to [INT8_MIN, INT8_MAX].
 */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    int32_t product = (int32_t)src1 * src2;
    int32_t acc = ((int32_t)src3 << 7) + (round << 6);

    acc += neg ? -product : product;
    acc >>= 7;

    /* Saturate anything that does not survive a round trip via int8_t. */
    if (acc > INT8_MAX) {
        return INT8_MAX;
    }
    if (acc < INT8_MIN) {
        return INT8_MIN;
    }
    return acc;
}
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
/*
 * Signed saturating rounding doubling multiply-accumulate high half, 16-bit.
 *
 * Same reduction as the 8-bit variant:
 *     ((src3 << 15) + (src1 * src2) + (round << 14)) >> 15
 * with the product negated first when @neg is set.  If the result does
 * not fit in int16_t it is clamped and *@sat is set to 1; *@sat is left
 * untouched otherwise.
 */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    int32_t product = (int32_t)src1 * src2;
    int32_t acc = ((int32_t)src3 << 15) + (round << 14);

    acc += neg ? -product : product;
    acc >>= 15;

    if (acc == (int16_t)acc) {
        return acc;
    }

    /* Out of range: record the saturation and clamp by sign. */
    *sat = 1;
    return acc < 0 ? INT16_MIN : INT16_MAX;
}
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
315                              void *va, uint32_t desc)
316 {
317     intptr_t i, opr_sz = simd_oprsz(desc);
318     int16_t *d = vd, *n = vn, *m = vm, *a = va;
319     uint32_t discard;
320 
321     for (i = 0; i < opr_sz / 2; ++i) {
322         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
323     }
324 }
325 
326 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
327                              void *va, uint32_t desc)
328 {
329     intptr_t i, opr_sz = simd_oprsz(desc);
330     int16_t *d = vd, *n = vn, *m = vm, *a = va;
331     uint32_t discard;
332 
333     for (i = 0; i < opr_sz / 2; ++i) {
334         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
335     }
336 }
337 
338 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
339 {
340     intptr_t i, opr_sz = simd_oprsz(desc);
341     int16_t *d = vd, *n = vn, *m = vm;
342     uint32_t discard;
343 
344     for (i = 0; i < opr_sz / 2; ++i) {
345         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
346     }
347 }
348 
349 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
350 {
351     intptr_t i, opr_sz = simd_oprsz(desc);
352     int16_t *d = vd, *n = vn, *m = vm;
353     uint32_t discard;
354 
355     for (i = 0; i < opr_sz / 2; ++i) {
356         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
357     }
358 }
359 
360 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
361 {
362     intptr_t i, j, opr_sz = simd_oprsz(desc);
363     int idx = simd_data(desc);
364     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
365     uint32_t discard;
366 
367     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
368         int16_t mm = m[i];
369         for (j = 0; j < 16 / 2; ++j) {
370             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
371         }
372     }
373 }
374 
375 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
376 {
377     intptr_t i, j, opr_sz = simd_oprsz(desc);
378     int idx = simd_data(desc);
379     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
380     uint32_t discard;
381 
382     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
383         int16_t mm = m[i];
384         for (j = 0; j < 16 / 2; ++j) {
385             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
386         }
387     }
388 }
389 
/*
 * Signed saturating rounding doubling multiply-accumulate high half, 32-bit.
 *
 * Same reduction as the 8-bit variant, carried out in 64 bits:
 *     ((src3 << 31) + (src1 * src2) + (round << 30)) >> 31
 * with the product negated first when @neg is set.  If the result does
 * not fit in int32_t it is clamped and *@sat is set to 1; *@sat is left
 * untouched otherwise.
 */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    int64_t product = (int64_t)src1 * src2;
    int64_t acc = ((int64_t)src3 << 31) + (round << 30);

    acc += neg ? -product : product;
    acc >>= 31;

    if (acc == (int32_t)acc) {
        return acc;
    }

    /* Out of range: record the saturation and clamp by sign. */
    *sat = 1;
    return acc < 0 ? INT32_MIN : INT32_MAX;
}
408 
409 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
410                                   int32_t src2, int32_t src3)
411 {
412     uint32_t *sat = &env->vfp.qc[0];
413     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
414 }
415 
416 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
417                               void *vq, uint32_t desc)
418 {
419     uintptr_t opr_sz = simd_oprsz(desc);
420     int32_t *d = vd;
421     int32_t *n = vn;
422     int32_t *m = vm;
423     uintptr_t i;
424 
425     for (i = 0; i < opr_sz / 4; ++i) {
426         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
427     }
428     clear_tail(d, opr_sz, simd_maxsz(desc));
429 }
430 
431 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
432                                   int32_t src2, int32_t src3)
433 {
434     uint32_t *sat = &env->vfp.qc[0];
435     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
436 }
437 
438 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
439                               void *vq, uint32_t desc)
440 {
441     uintptr_t opr_sz = simd_oprsz(desc);
442     int32_t *d = vd;
443     int32_t *n = vn;
444     int32_t *m = vm;
445     uintptr_t i;
446 
447     for (i = 0; i < opr_sz / 4; ++i) {
448         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
449     }
450     clear_tail(d, opr_sz, simd_maxsz(desc));
451 }
452 
453 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
454                             void *vq, uint32_t desc)
455 {
456     intptr_t i, opr_sz = simd_oprsz(desc);
457     int32_t *d = vd, *n = vn, *m = vm;
458 
459     for (i = 0; i < opr_sz / 4; ++i) {
460         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
461     }
462     clear_tail(d, opr_sz, simd_maxsz(desc));
463 }
464 
465 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
466                              void *vq, uint32_t desc)
467 {
468     intptr_t i, opr_sz = simd_oprsz(desc);
469     int32_t *d = vd, *n = vn, *m = vm;
470 
471     for (i = 0; i < opr_sz / 4; ++i) {
472         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
473     }
474     clear_tail(d, opr_sz, simd_maxsz(desc));
475 }
476 
477 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
478                              void *va, uint32_t desc)
479 {
480     intptr_t i, opr_sz = simd_oprsz(desc);
481     int32_t *d = vd, *n = vn, *m = vm, *a = va;
482     uint32_t discard;
483 
484     for (i = 0; i < opr_sz / 4; ++i) {
485         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
486     }
487 }
488 
489 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
490                              void *va, uint32_t desc)
491 {
492     intptr_t i, opr_sz = simd_oprsz(desc);
493     int32_t *d = vd, *n = vn, *m = vm, *a = va;
494     uint32_t discard;
495 
496     for (i = 0; i < opr_sz / 4; ++i) {
497         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
498     }
499 }
500 
501 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
502 {
503     intptr_t i, opr_sz = simd_oprsz(desc);
504     int32_t *d = vd, *n = vn, *m = vm;
505     uint32_t discard;
506 
507     for (i = 0; i < opr_sz / 4; ++i) {
508         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
509     }
510 }
511 
512 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
513 {
514     intptr_t i, opr_sz = simd_oprsz(desc);
515     int32_t *d = vd, *n = vn, *m = vm;
516     uint32_t discard;
517 
518     for (i = 0; i < opr_sz / 4; ++i) {
519         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
520     }
521 }
522 
523 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
524 {
525     intptr_t i, j, opr_sz = simd_oprsz(desc);
526     int idx = simd_data(desc);
527     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
528     uint32_t discard;
529 
530     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
531         int32_t mm = m[i];
532         for (j = 0; j < 16 / 4; ++j) {
533             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
534         }
535     }
536 }
537 
538 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
539 {
540     intptr_t i, j, opr_sz = simd_oprsz(desc);
541     int idx = simd_data(desc);
542     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
543     uint32_t discard;
544 
545     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
546         int32_t mm = m[i];
547         for (j = 0; j < 16 / 4; ++j) {
548             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
549         }
550     }
551 }
552 
553 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
554 static int64_t do_sat128_d(Int128 r)
555 {
556     int64_t ls = int128_getlo(r);
557     int64_t hs = int128_gethi(r);
558 
559     if (unlikely(hs != (ls >> 63))) {
560         return hs < 0 ? INT64_MIN : INT64_MAX;
561     }
562     return ls;
563 }
564 
565 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
566 {
567     uint64_t l, h;
568     Int128 r, t;
569 
570     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
571     muls64(&l, &h, m, n);
572     r = int128_make128(l, h);
573     if (neg) {
574         r = int128_neg(r);
575     }
576     if (a) {
577         t = int128_exts64(a);
578         t = int128_lshift(t, 63);
579         r = int128_add(r, t);
580     }
581     if (round) {
582         t = int128_exts64(1ll << 62);
583         r = int128_add(r, t);
584     }
585     r = int128_rshift(r, 63);
586 
587     return do_sat128_d(r);
588 }
589 
590 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
591                              void *va, uint32_t desc)
592 {
593     intptr_t i, opr_sz = simd_oprsz(desc);
594     int64_t *d = vd, *n = vn, *m = vm, *a = va;
595 
596     for (i = 0; i < opr_sz / 8; ++i) {
597         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
598     }
599 }
600 
601 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
602                              void *va, uint32_t desc)
603 {
604     intptr_t i, opr_sz = simd_oprsz(desc);
605     int64_t *d = vd, *n = vn, *m = vm, *a = va;
606 
607     for (i = 0; i < opr_sz / 8; ++i) {
608         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
609     }
610 }
611 
612 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
613 {
614     intptr_t i, opr_sz = simd_oprsz(desc);
615     int64_t *d = vd, *n = vn, *m = vm;
616 
617     for (i = 0; i < opr_sz / 8; ++i) {
618         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
619     }
620 }
621 
622 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int64_t *d = vd, *n = vn, *m = vm;
626 
627     for (i = 0; i < opr_sz / 8; ++i) {
628         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
629     }
630 }
631 
632 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
633 {
634     intptr_t i, j, opr_sz = simd_oprsz(desc);
635     int idx = simd_data(desc);
636     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
637 
638     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
639         int64_t mm = m[i];
640         for (j = 0; j < 16 / 8; ++j) {
641             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
642         }
643     }
644 }
645 
646 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
647 {
648     intptr_t i, j, opr_sz = simd_oprsz(desc);
649     int idx = simd_data(desc);
650     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
651 
652     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
653         int64_t mm = m[i];
654         for (j = 0; j < 16 / 8; ++j) {
655             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
656         }
657     }
658 }
659 
/*
 * Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

/*
 * DO_DOT(NAME, TYPED, TYPEN, TYPEM) defines HELPER(NAME): every TYPED
 * lane of d becomes the matching lane of a plus the sum of the four
 * products of the corresponding TYPEN/TYPEM sub-elements of n and m.
 * All eight inputs are read before the lane of d is stored.
 */
#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t opr_sz = simd_oprsz(desc);                                   \
    TYPED *d = vd;                                                        \
    TYPED *a = va;                                                        \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    intptr_t i;                                                           \
                                                                          \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        TYPEN *nq = &n[i * 4];                                            \
        TYPEM *mq = &m[i * 4];                                            \
        TYPED sum = a[i];                                                 \
                                                                          \
        sum += (TYPED)nq[0] * mq[0];                                      \
        sum += (TYPED)nq[1] * mq[1];                                      \
        sum += (TYPED)nq[2] * mq[2];                                      \
        sum += (TYPED)nq[3] * mq[3];                                      \
        d[i] = sum;                                                       \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}
683 
/*
 * Non-indexed dot-product helpers.  Arguments are: helper name,
 * accumulator/result lane type, n sub-element type, m sub-element type.
 */
DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
689 
/*
 * DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) defines HELPER(NAME), the
 * indexed dot-product: the vector is processed in 16-byte segments, and
 * within each segment every TYPED lane of d becomes the matching lane
 * of a plus the dot product of four n sub-elements with the one group
 * of four m sub-elements selected by the index in simd_data(desc).
 * HD is the host-order index adjustment applied to that index.
 */
#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    /*                                                                    \
     * The first segment is clamped to opr_sz_n so that an operation     \
     * shorter than 16 bytes stops early.  Later segments are not        \
     * clamped: this relies on any larger opr_sz being a multiple of     \
     * 16 bytes -- NOTE(review): confirm against the callers.            \
     */                                                                   \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        /* Latch the indexed group before d in this segment is stored. */ \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        /*                                                                \
         * Advance by one 16-byte segment worth of TYPED lanes: 4 for    \
         * 32-bit lanes but only 2 for 64-bit lanes.  A fixed step of 4  \
         * would make 64-bit segments twice too long, reusing the wrong  \
         * indexed group and running past the end of the operation.      \
         */                                                               \
        segend = i + (16 / sizeof(TYPED));                                \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}
716 
/*
 * Indexed dot-product helpers.  Arguments are: helper name,
 * accumulator/result lane type, n sub-element type, m sub-element type,
 * and the host-order index macro matching the result lane size.
 */
DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
723 
724 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
725                          void *vfpst, uint32_t desc)
726 {
727     uintptr_t opr_sz = simd_oprsz(desc);
728     float16 *d = vd;
729     float16 *n = vn;
730     float16 *m = vm;
731     float_status *fpst = vfpst;
732     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
733     uint32_t neg_imag = neg_real ^ 1;
734     uintptr_t i;
735 
736     /* Shift boolean to the sign bit so we can xor to negate.  */
737     neg_real <<= 15;
738     neg_imag <<= 15;
739 
740     for (i = 0; i < opr_sz / 2; i += 2) {
741         float16 e0 = n[H2(i)];
742         float16 e1 = m[H2(i + 1)] ^ neg_imag;
743         float16 e2 = n[H2(i + 1)];
744         float16 e3 = m[H2(i)] ^ neg_real;
745 
746         d[H2(i)] = float16_add(e0, e1, fpst);
747         d[H2(i + 1)] = float16_add(e2, e3, fpst);
748     }
749     clear_tail(d, opr_sz, simd_maxsz(desc));
750 }
751 
752 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
753                          void *vfpst, uint32_t desc)
754 {
755     uintptr_t opr_sz = simd_oprsz(desc);
756     float32 *d = vd;
757     float32 *n = vn;
758     float32 *m = vm;
759     float_status *fpst = vfpst;
760     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
761     uint32_t neg_imag = neg_real ^ 1;
762     uintptr_t i;
763 
764     /* Shift boolean to the sign bit so we can xor to negate.  */
765     neg_real <<= 31;
766     neg_imag <<= 31;
767 
768     for (i = 0; i < opr_sz / 4; i += 2) {
769         float32 e0 = n[H4(i)];
770         float32 e1 = m[H4(i + 1)] ^ neg_imag;
771         float32 e2 = n[H4(i + 1)];
772         float32 e3 = m[H4(i)] ^ neg_real;
773 
774         d[H4(i)] = float32_add(e0, e1, fpst);
775         d[H4(i + 1)] = float32_add(e2, e3, fpst);
776     }
777     clear_tail(d, opr_sz, simd_maxsz(desc));
778 }
779 
780 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
781                          void *vfpst, uint32_t desc)
782 {
783     uintptr_t opr_sz = simd_oprsz(desc);
784     float64 *d = vd;
785     float64 *n = vn;
786     float64 *m = vm;
787     float_status *fpst = vfpst;
788     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
789     uint64_t neg_imag = neg_real ^ 1;
790     uintptr_t i;
791 
792     /* Shift boolean to the sign bit so we can xor to negate.  */
793     neg_real <<= 63;
794     neg_imag <<= 63;
795 
796     for (i = 0; i < opr_sz / 8; i += 2) {
797         float64 e0 = n[i];
798         float64 e1 = m[i + 1] ^ neg_imag;
799         float64 e2 = n[i + 1];
800         float64 e3 = m[i] ^ neg_real;
801 
802         d[i] = float64_add(e0, e1, fpst);
803         d[i + 1] = float64_add(e2, e3, fpst);
804     }
805     clear_tail(d, opr_sz, simd_maxsz(desc));
806 }
807 
808 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
809                          void *vfpst, uint32_t desc)
810 {
811     uintptr_t opr_sz = simd_oprsz(desc);
812     float16 *d = vd, *n = vn, *m = vm, *a = va;
813     float_status *fpst = vfpst;
814     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
815     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
816     uint32_t neg_real = flip ^ neg_imag;
817     uintptr_t i;
818 
819     /* Shift boolean to the sign bit so we can xor to negate.  */
820     neg_real <<= 15;
821     neg_imag <<= 15;
822 
823     for (i = 0; i < opr_sz / 2; i += 2) {
824         float16 e2 = n[H2(i + flip)];
825         float16 e1 = m[H2(i + flip)] ^ neg_real;
826         float16 e4 = e2;
827         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
828 
829         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
830         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
831     }
832     clear_tail(d, opr_sz, simd_maxsz(desc));
833 }
834 
835 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
836                              void *vfpst, uint32_t desc)
837 {
838     uintptr_t opr_sz = simd_oprsz(desc);
839     float16 *d = vd, *n = vn, *m = vm, *a = va;
840     float_status *fpst = vfpst;
841     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
842     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
843     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
844     uint32_t neg_real = flip ^ neg_imag;
845     intptr_t elements = opr_sz / sizeof(float16);
846     intptr_t eltspersegment = 16 / sizeof(float16);
847     intptr_t i, j;
848 
849     /* Shift boolean to the sign bit so we can xor to negate.  */
850     neg_real <<= 15;
851     neg_imag <<= 15;
852 
853     for (i = 0; i < elements; i += eltspersegment) {
854         float16 mr = m[H2(i + 2 * index + 0)];
855         float16 mi = m[H2(i + 2 * index + 1)];
856         float16 e1 = neg_real ^ (flip ? mi : mr);
857         float16 e3 = neg_imag ^ (flip ? mr : mi);
858 
859         for (j = i; j < i + eltspersegment; j += 2) {
860             float16 e2 = n[H2(j + flip)];
861             float16 e4 = e2;
862 
863             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
864             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
865         }
866     }
867     clear_tail(d, opr_sz, simd_maxsz(desc));
868 }
869 
870 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
871                          void *vfpst, uint32_t desc)
872 {
873     uintptr_t opr_sz = simd_oprsz(desc);
874     float32 *d = vd, *n = vn, *m = vm, *a = va;
875     float_status *fpst = vfpst;
876     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
877     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
878     uint32_t neg_real = flip ^ neg_imag;
879     uintptr_t i;
880 
881     /* Shift boolean to the sign bit so we can xor to negate.  */
882     neg_real <<= 31;
883     neg_imag <<= 31;
884 
885     for (i = 0; i < opr_sz / 4; i += 2) {
886         float32 e2 = n[H4(i + flip)];
887         float32 e1 = m[H4(i + flip)] ^ neg_real;
888         float32 e4 = e2;
889         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
890 
891         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
892         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
893     }
894     clear_tail(d, opr_sz, simd_maxsz(desc));
895 }
896 
897 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
898                              void *vfpst, uint32_t desc)
899 {
900     uintptr_t opr_sz = simd_oprsz(desc);
901     float32 *d = vd, *n = vn, *m = vm, *a = va;
902     float_status *fpst = vfpst;
903     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
904     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
905     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
906     uint32_t neg_real = flip ^ neg_imag;
907     intptr_t elements = opr_sz / sizeof(float32);
908     intptr_t eltspersegment = 16 / sizeof(float32);
909     intptr_t i, j;
910 
911     /* Shift boolean to the sign bit so we can xor to negate.  */
912     neg_real <<= 31;
913     neg_imag <<= 31;
914 
915     for (i = 0; i < elements; i += eltspersegment) {
916         float32 mr = m[H4(i + 2 * index + 0)];
917         float32 mi = m[H4(i + 2 * index + 1)];
918         float32 e1 = neg_real ^ (flip ? mi : mr);
919         float32 e3 = neg_imag ^ (flip ? mr : mi);
920 
921         for (j = i; j < i + eltspersegment; j += 2) {
922             float32 e2 = n[H4(j + flip)];
923             float32 e4 = e2;
924 
925             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
926             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
927         }
928     }
929     clear_tail(d, opr_sz, simd_maxsz(desc));
930 }
931 
932 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
933                          void *vfpst, uint32_t desc)
934 {
935     uintptr_t opr_sz = simd_oprsz(desc);
936     float64 *d = vd, *n = vn, *m = vm, *a = va;
937     float_status *fpst = vfpst;
938     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
939     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
940     uint64_t neg_real = flip ^ neg_imag;
941     uintptr_t i;
942 
943     /* Shift boolean to the sign bit so we can xor to negate.  */
944     neg_real <<= 63;
945     neg_imag <<= 63;
946 
947     for (i = 0; i < opr_sz / 8; i += 2) {
948         float64 e2 = n[i + flip];
949         float64 e1 = m[i + flip] ^ neg_real;
950         float64 e4 = e2;
951         float64 e3 = m[i + 1 - flip] ^ neg_imag;
952 
953         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
954         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
955     }
956     clear_tail(d, opr_sz, simd_maxsz(desc));
957 }
958 
959 /*
960  * Floating point comparisons producing an integer result (all 1s or all 0s).
961  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
962  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
963  */
964 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
965 {
966     return -float16_eq_quiet(op1, op2, stat);
967 }
968 
969 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
970 {
971     return -float32_eq_quiet(op1, op2, stat);
972 }
973 
974 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
975 {
976     return -float64_eq_quiet(op1, op2, stat);
977 }
978 
979 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
980 {
981     return -float16_le(op2, op1, stat);
982 }
983 
984 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
985 {
986     return -float32_le(op2, op1, stat);
987 }
988 
989 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
990 {
991     return -float64_le(op2, op1, stat);
992 }
993 
994 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
995 {
996     return -float16_lt(op2, op1, stat);
997 }
998 
999 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1000 {
1001     return -float32_lt(op2, op1, stat);
1002 }
1003 
1004 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1005 {
1006     return -float64_lt(op2, op1, stat);
1007 }
1008 
1009 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1010 {
1011     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1012 }
1013 
1014 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1015 {
1016     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1017 }
1018 
1019 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1020 {
1021     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1022 }
1023 
1024 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1025 {
1026     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1027 }
1028 
1029 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1030 {
1031     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1032 }
1033 
1034 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1035 {
1036     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1037 }
1038 
1039 static int16_t vfp_tosszh(float16 x, void *fpstp)
1040 {
1041     float_status *fpst = fpstp;
1042     if (float16_is_any_nan(x)) {
1043         float_raise(float_flag_invalid, fpst);
1044         return 0;
1045     }
1046     return float16_to_int16_round_to_zero(x, fpst);
1047 }
1048 
1049 static uint16_t vfp_touszh(float16 x, void *fpstp)
1050 {
1051     float_status *fpst = fpstp;
1052     if (float16_is_any_nan(x)) {
1053         float_raise(float_flag_invalid, fpst);
1054         return 0;
1055     }
1056     return float16_to_uint16_round_to_zero(x, fpst);
1057 }
1058 
1059 #define DO_2OP(NAME, FUNC, TYPE) \
1060 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1061 {                                                                 \
1062     intptr_t i, oprsz = simd_oprsz(desc);                         \
1063     TYPE *d = vd, *n = vn;                                        \
1064     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1065         d[i] = FUNC(n[i], stat);                                  \
1066     }                                                             \
1067     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1068 }
1069 
1070 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1071 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1072 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1073 
1074 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1075 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1076 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1077 
1078 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1079 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1080 
1081 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1082 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1083 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1084 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1085 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1086 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1087 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1088 DO_2OP(gvec_touszh, vfp_touszh, float16)
1089 
1090 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1091     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1092     {                                                           \
1093         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1094     }
1095 
1096 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1097     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
1098     {                                                           \
1099         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1100     }
1101 
1102 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1103     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1104     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1105     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1106     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1107 
1108 DO_2OP_CMP0(cgt, cgt, FWD)
1109 DO_2OP_CMP0(cge, cge, FWD)
1110 DO_2OP_CMP0(ceq, ceq, FWD)
1111 DO_2OP_CMP0(clt, cgt, REV)
1112 DO_2OP_CMP0(cle, cge, REV)
1113 
1114 #undef DO_2OP
1115 #undef DO_2OP_CMP0
1116 
1117 /* Floating-point trigonometric starting value.
1118  * See the ARM ARM pseudocode function FPTrigSMul.
1119  */
1120 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1121 {
1122     float16 result = float16_mul(op1, op1, stat);
1123     if (!float16_is_any_nan(result)) {
1124         result = float16_set_sign(result, op2 & 1);
1125     }
1126     return result;
1127 }
1128 
1129 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1130 {
1131     float32 result = float32_mul(op1, op1, stat);
1132     if (!float32_is_any_nan(result)) {
1133         result = float32_set_sign(result, op2 & 1);
1134     }
1135     return result;
1136 }
1137 
1138 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1139 {
1140     float64 result = float64_mul(op1, op1, stat);
1141     if (!float64_is_any_nan(result)) {
1142         result = float64_set_sign(result, op2 & 1);
1143     }
1144     return result;
1145 }
1146 
1147 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1148 {
1149     return float16_abs(float16_sub(op1, op2, stat));
1150 }
1151 
1152 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1153 {
1154     return float32_abs(float32_sub(op1, op2, stat));
1155 }
1156 
1157 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1158 {
1159     return float64_abs(float64_sub(op1, op2, stat));
1160 }
1161 
1162 /*
1163  * Reciprocal step. These are the AArch32 version which uses a
1164  * non-fused multiply-and-subtract.
1165  */
1166 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1167 {
1168     op1 = float16_squash_input_denormal(op1, stat);
1169     op2 = float16_squash_input_denormal(op2, stat);
1170 
1171     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1172         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1173         return float16_two;
1174     }
1175     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1176 }
1177 
1178 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1179 {
1180     op1 = float32_squash_input_denormal(op1, stat);
1181     op2 = float32_squash_input_denormal(op2, stat);
1182 
1183     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1184         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1185         return float32_two;
1186     }
1187     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1188 }
1189 
1190 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1191 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1192 {
1193     op1 = float16_squash_input_denormal(op1, stat);
1194     op2 = float16_squash_input_denormal(op2, stat);
1195 
1196     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1197         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1198         return float16_one_point_five;
1199     }
1200     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1201     return float16_div(op1, float16_two, stat);
1202 }
1203 
1204 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1205 {
1206     op1 = float32_squash_input_denormal(op1, stat);
1207     op2 = float32_squash_input_denormal(op2, stat);
1208 
1209     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1210         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1211         return float32_one_point_five;
1212     }
1213     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1214     return float32_div(op1, float32_two, stat);
1215 }
1216 
1217 #define DO_3OP(NAME, FUNC, TYPE) \
1218 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1219 {                                                                          \
1220     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1221     TYPE *d = vd, *n = vn, *m = vm;                                        \
1222     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1223         d[i] = FUNC(n[i], m[i], stat);                                     \
1224     }                                                                      \
1225     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1226 }
1227 
1228 DO_3OP(gvec_fadd_h, float16_add, float16)
1229 DO_3OP(gvec_fadd_s, float32_add, float32)
1230 DO_3OP(gvec_fadd_d, float64_add, float64)
1231 
1232 DO_3OP(gvec_fsub_h, float16_sub, float16)
1233 DO_3OP(gvec_fsub_s, float32_sub, float32)
1234 DO_3OP(gvec_fsub_d, float64_sub, float64)
1235 
1236 DO_3OP(gvec_fmul_h, float16_mul, float16)
1237 DO_3OP(gvec_fmul_s, float32_mul, float32)
1238 DO_3OP(gvec_fmul_d, float64_mul, float64)
1239 
1240 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1241 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1242 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1243 
1244 DO_3OP(gvec_fabd_h, float16_abd, float16)
1245 DO_3OP(gvec_fabd_s, float32_abd, float32)
1246 DO_3OP(gvec_fabd_d, float64_abd, float64)
1247 
1248 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1249 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1250 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1251 
1252 DO_3OP(gvec_fcge_h, float16_cge, float16)
1253 DO_3OP(gvec_fcge_s, float32_cge, float32)
1254 DO_3OP(gvec_fcge_d, float64_cge, float64)
1255 
1256 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1257 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1258 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1259 
1260 DO_3OP(gvec_facge_h, float16_acge, float16)
1261 DO_3OP(gvec_facge_s, float32_acge, float32)
1262 DO_3OP(gvec_facge_d, float64_acge, float64)
1263 
1264 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1265 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1266 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1267 
1268 DO_3OP(gvec_fmax_h, float16_max, float16)
1269 DO_3OP(gvec_fmax_s, float32_max, float32)
1270 DO_3OP(gvec_fmax_d, float64_max, float64)
1271 
1272 DO_3OP(gvec_fmin_h, float16_min, float16)
1273 DO_3OP(gvec_fmin_s, float32_min, float32)
1274 DO_3OP(gvec_fmin_d, float64_min, float64)
1275 
1276 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1277 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1278 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1279 
1280 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1281 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1282 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1283 
1284 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1285 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1286 
1287 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1288 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1289 
1290 #ifdef TARGET_AARCH64
1291 DO_3OP(gvec_fdiv_h, float16_div, float16)
1292 DO_3OP(gvec_fdiv_s, float32_div, float32)
1293 DO_3OP(gvec_fdiv_d, float64_div, float64)
1294 
1295 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1296 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1297 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1298 
1299 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1300 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1301 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1302 
1303 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1304 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1305 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1306 
1307 #endif
1308 #undef DO_3OP
1309 
1310 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1311 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1312                                  float_status *stat)
1313 {
1314     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1315 }
1316 
1317 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1318                                  float_status *stat)
1319 {
1320     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1321 }
1322 
1323 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1324                                  float_status *stat)
1325 {
1326     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1327 }
1328 
1329 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1330                                  float_status *stat)
1331 {
1332     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1333 }
1334 
1335 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1336 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1337                                 float_status *stat)
1338 {
1339     return float16_muladd(op1, op2, dest, 0, stat);
1340 }
1341 
1342 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1343                                  float_status *stat)
1344 {
1345     return float32_muladd(op1, op2, dest, 0, stat);
1346 }
1347 
1348 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1349                                  float_status *stat)
1350 {
1351     return float64_muladd(op1, op2, dest, 0, stat);
1352 }
1353 
1354 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1355                                  float_status *stat)
1356 {
1357     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1358 }
1359 
1360 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1361                                  float_status *stat)
1362 {
1363     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1364 }
1365 
1366 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1367                                  float_status *stat)
1368 {
1369     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1370 }
1371 
1372 #define DO_MULADD(NAME, FUNC, TYPE)                                     \
1373 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1374 {                                                                          \
1375     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1376     TYPE *d = vd, *n = vn, *m = vm;                                        \
1377     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1378         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1379     }                                                                      \
1380     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1381 }
1382 
1383 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1384 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1385 
1386 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1387 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1388 
1389 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1390 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1391 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1392 
1393 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1394 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1395 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1396 
1397 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1398  * For AdvSIMD, there is of course only one such vector segment.
1399  */
1400 
1401 #define DO_MUL_IDX(NAME, TYPE, H) \
1402 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1403 {                                                                          \
1404     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1405     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1406     intptr_t idx = simd_data(desc);                                        \
1407     TYPE *d = vd, *n = vn, *m = vm;                                        \
1408     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1409         TYPE mm = m[H(i + idx)];                                           \
1410         for (j = 0; j < segment; j++) {                                    \
1411             d[i + j] = n[i + j] * mm;                                      \
1412         }                                                                  \
1413     }                                                                      \
1414     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1415 }
1416 
1417 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1418 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1419 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1420 
1421 #undef DO_MUL_IDX
1422 
1423 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1424 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1425 {                                                                          \
1426     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1427     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1428     intptr_t idx = simd_data(desc);                                        \
1429     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1430     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1431         TYPE mm = m[H(i + idx)];                                           \
1432         for (j = 0; j < segment; j++) {                                    \
1433             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1434         }                                                                  \
1435     }                                                                      \
1436     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1437 }
1438 
1439 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1440 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1441 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1442 
1443 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1444 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1445 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1446 
1447 #undef DO_MLA_IDX
1448 
1449 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1450 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1451 {                                                                          \
1452     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1453     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1454     intptr_t idx = simd_data(desc);                                        \
1455     TYPE *d = vd, *n = vn, *m = vm;                                        \
1456     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1457         TYPE mm = m[H(i + idx)];                                           \
1458         for (j = 0; j < segment; j++) {                                    \
1459             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1460         }                                                                  \
1461     }                                                                      \
1462     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1463 }
1464 
1465 #define nop(N, M, S) (M)
1466 
1467 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1468 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1469 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1470 
1471 #ifdef TARGET_AARCH64
1472 
1473 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1474 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1475 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1476 
1477 #endif
1478 
1479 #undef nop
1480 
1481 /*
1482  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1483  * the fused ops below they assume accumulate both from and into Vd.
1484  */
1485 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1486 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1487 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1488 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1489 
1490 #undef DO_FMUL_IDX
1491 
1492 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1493 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1494                   void *stat, uint32_t desc)                               \
1495 {                                                                          \
1496     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1497     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1498     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1499     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1500     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1501     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1502     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1503         TYPE mm = m[H(i + idx)];                                           \
1504         for (j = 0; j < segment; j++) {                                    \
1505             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1506                                      mm, a[i + j], 0, stat);               \
1507         }                                                                  \
1508     }                                                                      \
1509     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1510 }
1511 
1512 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1513 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1514 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1515 
1516 #undef DO_FMLA_IDX
1517 
1518 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1519 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1520 {                                                                          \
1521     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1522     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1523     bool q = false;                                                        \
1524     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1525         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1526         if (dd < MIN) {                                                    \
1527             dd = MIN;                                                      \
1528             q = true;                                                      \
1529         } else if (dd > MAX) {                                             \
1530             dd = MAX;                                                      \
1531             q = true;                                                      \
1532         }                                                                  \
1533         d[i] = dd;                                                         \
1534     }                                                                      \
1535     if (q) {                                                               \
1536         uint32_t *qc = vq;                                                 \
1537         qc[0] = 1;                                                         \
1538     }                                                                      \
1539     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1540 }
1541 
1542 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1543 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1544 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1545 
1546 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1547 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1548 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1549 
1550 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1551 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1552 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1553 
1554 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1555 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1556 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1557 
1558 #undef DO_SAT
1559 
1560 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1561                           void *vm, uint32_t desc)
1562 {
1563     intptr_t i, oprsz = simd_oprsz(desc);
1564     uint64_t *d = vd, *n = vn, *m = vm;
1565     bool q = false;
1566 
1567     for (i = 0; i < oprsz / 8; i++) {
1568         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1569         if (dd < nn) {
1570             dd = UINT64_MAX;
1571             q = true;
1572         }
1573         d[i] = dd;
1574     }
1575     if (q) {
1576         uint32_t *qc = vq;
1577         qc[0] = 1;
1578     }
1579     clear_tail(d, oprsz, simd_maxsz(desc));
1580 }
1581 
1582 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1583                           void *vm, uint32_t desc)
1584 {
1585     intptr_t i, oprsz = simd_oprsz(desc);
1586     uint64_t *d = vd, *n = vn, *m = vm;
1587     bool q = false;
1588 
1589     for (i = 0; i < oprsz / 8; i++) {
1590         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1591         if (nn < mm) {
1592             dd = 0;
1593             q = true;
1594         }
1595         d[i] = dd;
1596     }
1597     if (q) {
1598         uint32_t *qc = vq;
1599         qc[0] = 1;
1600     }
1601     clear_tail(d, oprsz, simd_maxsz(desc));
1602 }
1603 
1604 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1605                           void *vm, uint32_t desc)
1606 {
1607     intptr_t i, oprsz = simd_oprsz(desc);
1608     int64_t *d = vd, *n = vn, *m = vm;
1609     bool q = false;
1610 
1611     for (i = 0; i < oprsz / 8; i++) {
1612         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1613         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1614             dd = (nn >> 63) ^ ~INT64_MIN;
1615             q = true;
1616         }
1617         d[i] = dd;
1618     }
1619     if (q) {
1620         uint32_t *qc = vq;
1621         qc[0] = 1;
1622     }
1623     clear_tail(d, oprsz, simd_maxsz(desc));
1624 }
1625 
1626 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1627                           void *vm, uint32_t desc)
1628 {
1629     intptr_t i, oprsz = simd_oprsz(desc);
1630     int64_t *d = vd, *n = vn, *m = vm;
1631     bool q = false;
1632 
1633     for (i = 0; i < oprsz / 8; i++) {
1634         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1635         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1636             dd = (nn >> 63) ^ ~INT64_MIN;
1637             q = true;
1638         }
1639         d[i] = dd;
1640     }
1641     if (q) {
1642         uint32_t *qc = vq;
1643         qc[0] = 1;
1644     }
1645     clear_tail(d, oprsz, simd_maxsz(desc));
1646 }
1647 
1648 
1649 #define DO_SRA(NAME, TYPE)                              \
1650 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1651 {                                                       \
1652     intptr_t i, oprsz = simd_oprsz(desc);               \
1653     int shift = simd_data(desc);                        \
1654     TYPE *d = vd, *n = vn;                              \
1655     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1656         d[i] += n[i] >> shift;                          \
1657     }                                                   \
1658     clear_tail(d, oprsz, simd_maxsz(desc));             \
1659 }
1660 
1661 DO_SRA(gvec_ssra_b, int8_t)
1662 DO_SRA(gvec_ssra_h, int16_t)
1663 DO_SRA(gvec_ssra_s, int32_t)
1664 DO_SRA(gvec_ssra_d, int64_t)
1665 
1666 DO_SRA(gvec_usra_b, uint8_t)
1667 DO_SRA(gvec_usra_h, uint16_t)
1668 DO_SRA(gvec_usra_s, uint32_t)
1669 DO_SRA(gvec_usra_d, uint64_t)
1670 
1671 #undef DO_SRA
1672 
1673 #define DO_RSHR(NAME, TYPE)                             \
1674 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1675 {                                                       \
1676     intptr_t i, oprsz = simd_oprsz(desc);               \
1677     int shift = simd_data(desc);                        \
1678     TYPE *d = vd, *n = vn;                              \
1679     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1680         TYPE tmp = n[i] >> (shift - 1);                 \
1681         d[i] = (tmp >> 1) + (tmp & 1);                  \
1682     }                                                   \
1683     clear_tail(d, oprsz, simd_maxsz(desc));             \
1684 }
1685 
1686 DO_RSHR(gvec_srshr_b, int8_t)
1687 DO_RSHR(gvec_srshr_h, int16_t)
1688 DO_RSHR(gvec_srshr_s, int32_t)
1689 DO_RSHR(gvec_srshr_d, int64_t)
1690 
1691 DO_RSHR(gvec_urshr_b, uint8_t)
1692 DO_RSHR(gvec_urshr_h, uint16_t)
1693 DO_RSHR(gvec_urshr_s, uint32_t)
1694 DO_RSHR(gvec_urshr_d, uint64_t)
1695 
1696 #undef DO_RSHR
1697 
1698 #define DO_RSRA(NAME, TYPE)                             \
1699 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1700 {                                                       \
1701     intptr_t i, oprsz = simd_oprsz(desc);               \
1702     int shift = simd_data(desc);                        \
1703     TYPE *d = vd, *n = vn;                              \
1704     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1705         TYPE tmp = n[i] >> (shift - 1);                 \
1706         d[i] += (tmp >> 1) + (tmp & 1);                 \
1707     }                                                   \
1708     clear_tail(d, oprsz, simd_maxsz(desc));             \
1709 }
1710 
1711 DO_RSRA(gvec_srsra_b, int8_t)
1712 DO_RSRA(gvec_srsra_h, int16_t)
1713 DO_RSRA(gvec_srsra_s, int32_t)
1714 DO_RSRA(gvec_srsra_d, int64_t)
1715 
1716 DO_RSRA(gvec_ursra_b, uint8_t)
1717 DO_RSRA(gvec_ursra_h, uint16_t)
1718 DO_RSRA(gvec_ursra_s, uint32_t)
1719 DO_RSRA(gvec_ursra_d, uint64_t)
1720 
1721 #undef DO_RSRA
1722 
1723 #define DO_SRI(NAME, TYPE)                              \
1724 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1725 {                                                       \
1726     intptr_t i, oprsz = simd_oprsz(desc);               \
1727     int shift = simd_data(desc);                        \
1728     TYPE *d = vd, *n = vn;                              \
1729     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1730         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1731     }                                                   \
1732     clear_tail(d, oprsz, simd_maxsz(desc));             \
1733 }
1734 
1735 DO_SRI(gvec_sri_b, uint8_t)
1736 DO_SRI(gvec_sri_h, uint16_t)
1737 DO_SRI(gvec_sri_s, uint32_t)
1738 DO_SRI(gvec_sri_d, uint64_t)
1739 
1740 #undef DO_SRI
1741 
1742 #define DO_SLI(NAME, TYPE)                              \
1743 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1744 {                                                       \
1745     intptr_t i, oprsz = simd_oprsz(desc);               \
1746     int shift = simd_data(desc);                        \
1747     TYPE *d = vd, *n = vn;                              \
1748     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1749         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1750     }                                                   \
1751     clear_tail(d, oprsz, simd_maxsz(desc));             \
1752 }
1753 
1754 DO_SLI(gvec_sli_b, uint8_t)
1755 DO_SLI(gvec_sli_h, uint16_t)
1756 DO_SLI(gvec_sli_s, uint32_t)
1757 DO_SLI(gvec_sli_d, uint64_t)
1758 
1759 #undef DO_SLI
1760 
1761 /*
1762  * Convert float16 to float32, raising no exceptions and
1763  * preserving exceptional values, including SNaN.
1764  * This is effectively an unpack+repack operation.
1765  */
1766 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1767 {
1768     const int f16_bias = 15;
1769     const int f32_bias = 127;
1770     uint32_t sign = extract32(f16, 15, 1);
1771     uint32_t exp = extract32(f16, 10, 5);
1772     uint32_t frac = extract32(f16, 0, 10);
1773 
1774     if (exp == 0x1f) {
1775         /* Inf or NaN */
1776         exp = 0xff;
1777     } else if (exp == 0) {
1778         /* Zero or denormal.  */
1779         if (frac != 0) {
1780             if (fz16) {
1781                 frac = 0;
1782             } else {
1783                 /*
1784                  * Denormal; these are all normal float32.
1785                  * Shift the fraction so that the msb is at bit 11,
1786                  * then remove bit 11 as the implicit bit of the
1787                  * normalized float32.  Note that we still go through
1788                  * the shift for normal numbers below, to put the
1789                  * float32 fraction at the right place.
1790                  */
1791                 int shift = clz32(frac) - 21;
1792                 frac = (frac << shift) & 0x3ff;
1793                 exp = f32_bias - f16_bias - shift + 1;
1794             }
1795         }
1796     } else {
1797         /* Normal number; adjust the bias.  */
1798         exp += f32_bias - f16_bias;
1799     }
1800     sign <<= 31;
1801     exp <<= 23;
1802     frac <<= 23 - 10;
1803 
1804     return sign | exp | frac;
1805 }
1806 
/*
 * Branchless fetch of the 64-bit word holding four packed f16 inputs.
 *
 *   is_q && is_2  -> ptr[1]
 *   is_q && !is_2 -> ptr[0]
 *   !is_q && is_2 -> ptr[0] >> 32 (upper half moved to the bottom)
 *   !is_q && !is_2 -> ptr[0]; bits above the low 32 are not meaningful
 *
 * Both flags are expected to be 0 or 1.
 */
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    int word = is_q & is_2;
    int shift = (is_2 & ~is_q) << 5;

    return ptr[word] >> shift;
}
1817 
1818 /*
1819  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1820  * as there is not yet SVE versions that might use blocking.
1821  */
1822 
1823 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1824                      uint32_t desc, bool fz16)
1825 {
1826     intptr_t i, oprsz = simd_oprsz(desc);
1827     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1828     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1829     int is_q = oprsz == 16;
1830     uint64_t n_4, m_4;
1831 
1832     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1833     n_4 = load4_f16(vn, is_q, is_2);
1834     m_4 = load4_f16(vm, is_q, is_2);
1835 
1836     /* Negate all inputs for FMLSL at once.  */
1837     if (is_s) {
1838         n_4 ^= 0x8000800080008000ull;
1839     }
1840 
1841     for (i = 0; i < oprsz / 4; i++) {
1842         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1843         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1844         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1845     }
1846     clear_tail(d, oprsz, simd_maxsz(desc));
1847 }
1848 
1849 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1850                             void *venv, uint32_t desc)
1851 {
1852     CPUARMState *env = venv;
1853     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1854              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1855 }
1856 
1857 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1858                             void *venv, uint32_t desc)
1859 {
1860     CPUARMState *env = venv;
1861     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1862              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1863 }
1864 
1865 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1866                                void *venv, uint32_t desc)
1867 {
1868     intptr_t i, oprsz = simd_oprsz(desc);
1869     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1870     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1871     CPUARMState *env = venv;
1872     float_status *status = &env->vfp.fp_status;
1873     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1874 
1875     for (i = 0; i < oprsz; i += sizeof(float32)) {
1876         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1877         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1878         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1879         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1880         float32 aa = *(float32 *)(va + H1_4(i));
1881 
1882         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1883     }
1884 }
1885 
1886 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1887                          uint32_t desc, bool fz16)
1888 {
1889     intptr_t i, oprsz = simd_oprsz(desc);
1890     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1891     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1892     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1893     int is_q = oprsz == 16;
1894     uint64_t n_4;
1895     float32 m_1;
1896 
1897     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1898     n_4 = load4_f16(vn, is_q, is_2);
1899 
1900     /* Negate all inputs for FMLSL at once.  */
1901     if (is_s) {
1902         n_4 ^= 0x8000800080008000ull;
1903     }
1904 
1905     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1906 
1907     for (i = 0; i < oprsz / 4; i++) {
1908         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1909         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1910     }
1911     clear_tail(d, oprsz, simd_maxsz(desc));
1912 }
1913 
1914 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1915                                 void *venv, uint32_t desc)
1916 {
1917     CPUARMState *env = venv;
1918     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1919                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1920 }
1921 
1922 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1923                                 void *venv, uint32_t desc)
1924 {
1925     CPUARMState *env = venv;
1926     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1927                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1928 }
1929 
1930 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1931                                void *venv, uint32_t desc)
1932 {
1933     intptr_t i, j, oprsz = simd_oprsz(desc);
1934     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1935     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1936     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1937     CPUARMState *env = venv;
1938     float_status *status = &env->vfp.fp_status;
1939     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1940 
1941     for (i = 0; i < oprsz; i += 16) {
1942         float16 mm_16 = *(float16 *)(vm + i + idx);
1943         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1944 
1945         for (j = 0; j < 16; j += sizeof(float32)) {
1946             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1947             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1948             float32 aa = *(float32 *)(va + H1_4(i + j));
1949 
1950             *(float32 *)(vd + H1_4(i + j)) =
1951                 float32_muladd(nn, mm, aa, 0, status);
1952         }
1953     }
1954 }
1955 
1956 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1957 {
1958     intptr_t i, opr_sz = simd_oprsz(desc);
1959     int8_t *d = vd, *n = vn, *m = vm;
1960 
1961     for (i = 0; i < opr_sz; ++i) {
1962         int8_t mm = m[i];
1963         int8_t nn = n[i];
1964         int8_t res = 0;
1965         if (mm >= 0) {
1966             if (mm < 8) {
1967                 res = nn << mm;
1968             }
1969         } else {
1970             res = nn >> (mm > -8 ? -mm : 7);
1971         }
1972         d[i] = res;
1973     }
1974     clear_tail(d, opr_sz, simd_maxsz(desc));
1975 }
1976 
1977 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1978 {
1979     intptr_t i, opr_sz = simd_oprsz(desc);
1980     int16_t *d = vd, *n = vn, *m = vm;
1981 
1982     for (i = 0; i < opr_sz / 2; ++i) {
1983         int8_t mm = m[i];   /* only 8 bits of shift are significant */
1984         int16_t nn = n[i];
1985         int16_t res = 0;
1986         if (mm >= 0) {
1987             if (mm < 16) {
1988                 res = nn << mm;
1989             }
1990         } else {
1991             res = nn >> (mm > -16 ? -mm : 15);
1992         }
1993         d[i] = res;
1994     }
1995     clear_tail(d, opr_sz, simd_maxsz(desc));
1996 }
1997 
1998 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1999 {
2000     intptr_t i, opr_sz = simd_oprsz(desc);
2001     uint8_t *d = vd, *n = vn, *m = vm;
2002 
2003     for (i = 0; i < opr_sz; ++i) {
2004         int8_t mm = m[i];
2005         uint8_t nn = n[i];
2006         uint8_t res = 0;
2007         if (mm >= 0) {
2008             if (mm < 8) {
2009                 res = nn << mm;
2010             }
2011         } else {
2012             if (mm > -8) {
2013                 res = nn >> -mm;
2014             }
2015         }
2016         d[i] = res;
2017     }
2018     clear_tail(d, opr_sz, simd_maxsz(desc));
2019 }
2020 
2021 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2022 {
2023     intptr_t i, opr_sz = simd_oprsz(desc);
2024     uint16_t *d = vd, *n = vn, *m = vm;
2025 
2026     for (i = 0; i < opr_sz / 2; ++i) {
2027         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2028         uint16_t nn = n[i];
2029         uint16_t res = 0;
2030         if (mm >= 0) {
2031             if (mm < 16) {
2032                 res = nn << mm;
2033             }
2034         } else {
2035             if (mm > -16) {
2036                 res = nn >> -mm;
2037             }
2038         }
2039         d[i] = res;
2040     }
2041     clear_tail(d, opr_sz, simd_maxsz(desc));
2042 }
2043 
2044 /*
2045  * 8x8->8 polynomial multiply.
2046  *
2047  * Polynomial multiplication is like integer multiplication except the
2048  * partial products are XORed, not added.
2049  *
2050  * TODO: expose this as a generic vector operation, as it is a common
2051  * crypto building block.
2052  */
2053 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2054 {
2055     intptr_t i, opr_sz = simd_oprsz(desc);
2056     uint64_t *d = vd, *n = vn, *m = vm;
2057 
2058     for (i = 0; i < opr_sz / 8; ++i) {
2059         d[i] = clmul_8x8_low(n[i], m[i]);
2060     }
2061     clear_tail(d, opr_sz, simd_maxsz(desc));
2062 }
2063 
2064 /*
2065  * 64x64->128 polynomial multiply.
2066  * Because of the lanes are not accessed in strict columns,
2067  * this probably cannot be turned into a generic helper.
2068  */
2069 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2070 {
2071     intptr_t i, opr_sz = simd_oprsz(desc);
2072     intptr_t hi = simd_data(desc);
2073     uint64_t *d = vd, *n = vn, *m = vm;
2074 
2075     for (i = 0; i < opr_sz / 8; i += 2) {
2076         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2077         d[i] = int128_getlo(r);
2078         d[i + 1] = int128_gethi(r);
2079     }
2080     clear_tail(d, opr_sz, simd_maxsz(desc));
2081 }
2082 
2083 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2084 {
2085     int hi = simd_data(desc);
2086     uint64_t *d = vd, *n = vn, *m = vm;
2087     uint64_t nn = n[hi], mm = m[hi];
2088 
2089     d[0] = clmul_8x4_packed(nn, mm);
2090     nn >>= 32;
2091     mm >>= 32;
2092     d[1] = clmul_8x4_packed(nn, mm);
2093 
2094     clear_tail(d, 16, simd_maxsz(desc));
2095 }
2096 
2097 #ifdef TARGET_AARCH64
2098 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2099 {
2100     int shift = simd_data(desc) * 8;
2101     intptr_t i, opr_sz = simd_oprsz(desc);
2102     uint64_t *d = vd, *n = vn, *m = vm;
2103 
2104     for (i = 0; i < opr_sz / 8; ++i) {
2105         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2106     }
2107 }
2108 
2109 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2110 {
2111     intptr_t sel = H4(simd_data(desc));
2112     intptr_t i, opr_sz = simd_oprsz(desc);
2113     uint32_t *n = vn, *m = vm;
2114     uint64_t *d = vd;
2115 
2116     for (i = 0; i < opr_sz / 8; ++i) {
2117         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2118     }
2119 }
2120 #endif
2121 
2122 #define DO_CMP0(NAME, TYPE, OP)                         \
2123 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2124 {                                                       \
2125     intptr_t i, opr_sz = simd_oprsz(desc);              \
2126     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2127         TYPE nn = *(TYPE *)(vn + i);                    \
2128         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2129     }                                                   \
2130     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2131 }
2132 
2133 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2134 DO_CMP0(gvec_clt0_b, int8_t, <)
2135 DO_CMP0(gvec_cle0_b, int8_t, <=)
2136 DO_CMP0(gvec_cgt0_b, int8_t, >)
2137 DO_CMP0(gvec_cge0_b, int8_t, >=)
2138 
2139 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2140 DO_CMP0(gvec_clt0_h, int16_t, <)
2141 DO_CMP0(gvec_cle0_h, int16_t, <=)
2142 DO_CMP0(gvec_cgt0_h, int16_t, >)
2143 DO_CMP0(gvec_cge0_h, int16_t, >=)
2144 
2145 #undef DO_CMP0
2146 
2147 #define DO_ABD(NAME, TYPE)                                      \
2148 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2149 {                                                               \
2150     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2151     TYPE *d = vd, *n = vn, *m = vm;                             \
2152                                                                 \
2153     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2154         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2155     }                                                           \
2156     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2157 }
2158 
2159 DO_ABD(gvec_sabd_b, int8_t)
2160 DO_ABD(gvec_sabd_h, int16_t)
2161 DO_ABD(gvec_sabd_s, int32_t)
2162 DO_ABD(gvec_sabd_d, int64_t)
2163 
2164 DO_ABD(gvec_uabd_b, uint8_t)
2165 DO_ABD(gvec_uabd_h, uint16_t)
2166 DO_ABD(gvec_uabd_s, uint32_t)
2167 DO_ABD(gvec_uabd_d, uint64_t)
2168 
2169 #undef DO_ABD
2170 
2171 #define DO_ABA(NAME, TYPE)                                      \
2172 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2173 {                                                               \
2174     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2175     TYPE *d = vd, *n = vn, *m = vm;                             \
2176                                                                 \
2177     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2178         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2179     }                                                           \
2180     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2181 }
2182 
2183 DO_ABA(gvec_saba_b, int8_t)
2184 DO_ABA(gvec_saba_h, int16_t)
2185 DO_ABA(gvec_saba_s, int32_t)
2186 DO_ABA(gvec_saba_d, int64_t)
2187 
2188 DO_ABA(gvec_uaba_b, uint8_t)
2189 DO_ABA(gvec_uaba_h, uint16_t)
2190 DO_ABA(gvec_uaba_s, uint32_t)
2191 DO_ABA(gvec_uaba_d, uint64_t)
2192 
2193 #undef DO_ABA
2194 
2195 #define DO_NEON_PAIRWISE(NAME, OP)                                      \
2196     void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
2197                          void *stat, uint32_t oprsz)                    \
2198     {                                                                   \
2199         float_status *fpst = stat;                                      \
2200         float32 *d = vd;                                                \
2201         float32 *n = vn;                                                \
2202         float32 *m = vm;                                                \
2203         float32 r0, r1;                                                 \
2204                                                                         \
2205         /* Read all inputs before writing outputs in case vm == vd */   \
2206         r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
2207         r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
2208                                                                         \
2209         d[H4(0)] = r0;                                                  \
2210         d[H4(1)] = r1;                                                  \
2211     }                                                                   \
2212                                                                         \
2213     void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
2214                          void *stat, uint32_t oprsz)                    \
2215     {                                                                   \
2216         float_status *fpst = stat;                                      \
2217         float16 *d = vd;                                                \
2218         float16 *n = vn;                                                \
2219         float16 *m = vm;                                                \
2220         float16 r0, r1, r2, r3;                                         \
2221                                                                         \
2222         /* Read all inputs before writing outputs in case vm == vd */   \
2223         r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
2224         r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
2225         r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
2226         r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
2227                                                                         \
2228         d[H2(0)] = r0;                                                  \
2229         d[H2(1)] = r1;                                                  \
2230         d[H2(2)] = r2;                                                  \
2231         d[H2(3)] = r3;                                                  \
2232     }
2233 
2234 DO_NEON_PAIRWISE(neon_padd, add)
2235 DO_NEON_PAIRWISE(neon_pmax, max)
2236 DO_NEON_PAIRWISE(neon_pmin, min)
2237 
2238 #undef DO_NEON_PAIRWISE
2239 
2240 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2241 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
2242 {                                                                          \
2243     ARMVectorReg scratch;                                                  \
2244     intptr_t oprsz = simd_oprsz(desc);                                     \
2245     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2246     TYPE *d = vd, *n = vn, *m = vm;                                        \
2247     if (unlikely(d == m)) {                                                \
2248         m = memcpy(&scratch, m, oprsz);                                    \
2249     }                                                                      \
2250     for (intptr_t i = 0; i < half; ++i) {                                  \
2251         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2252     }                                                                      \
2253     for (intptr_t i = 0; i < half; ++i) {                                  \
2254         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2255     }                                                                      \
2256     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2257 }
2258 
2259 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2260 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2261 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2262 
2263 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2264 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2265 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2266 
2267 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2268 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2269 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2270 
2271 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2272 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2273 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2274 
2275 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2276 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2277 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2278 
2279 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2280     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2281     {                                                                   \
2282         intptr_t i, oprsz = simd_oprsz(desc);                           \
2283         int shift = simd_data(desc);                                    \
2284         TYPE *d = vd, *n = vn;                                          \
2285         float_status *fpst = stat;                                      \
2286         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2287             d[i] = FUNC(n[i], shift, fpst);                             \
2288         }                                                               \
2289         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2290     }
2291 
2292 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2293 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2294 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2295 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2296 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2297 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2298 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2299 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2300 
2301 #undef DO_VCVT_FIXED
2302 
2303 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2304     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2305     {                                                                   \
2306         float_status *fpst = stat;                                      \
2307         intptr_t i, oprsz = simd_oprsz(desc);                           \
2308         uint32_t rmode = simd_data(desc);                               \
2309         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2310         TYPE *d = vd, *n = vn;                                          \
2311         set_float_rounding_mode(rmode, fpst);                           \
2312         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2313             d[i] = FUNC(n[i], 0, fpst);                                 \
2314         }                                                               \
2315         set_float_rounding_mode(prev_rmode, fpst);                      \
2316         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2317     }
2318 
2319 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2320 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2321 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2322 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2323 
2324 #undef DO_VCVT_RMODE
2325 
2326 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2327     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2328     {                                                                   \
2329         float_status *fpst = stat;                                      \
2330         intptr_t i, oprsz = simd_oprsz(desc);                           \
2331         uint32_t rmode = simd_data(desc);                               \
2332         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2333         TYPE *d = vd, *n = vn;                                          \
2334         set_float_rounding_mode(rmode, fpst);                           \
2335         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2336             d[i] = FUNC(n[i], fpst);                                    \
2337         }                                                               \
2338         set_float_rounding_mode(prev_rmode, fpst);                      \
2339         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2340     }
2341 
2342 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2343 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2344 
2345 #undef DO_VRINT_RMODE
2346 
#ifdef TARGET_AARCH64
/*
 * Byte table lookup, shared by the TBL and TBX forms.
 *
 * vd:   destination register; for TBX also the initial result value
 * vm:   register holding one byte index per result byte
 * venv: CPU state, used to reach the table registers
 * desc: simd descriptor.  The data field packs the first table
 *       register number (5 bits), the TBX flag (1 bit) and, in the
 *       bits above those, the table length that indices are
 *       compared against.
 */
void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    const uint8_t *idx_bytes = vm;
    const size_t opr_bytes = simd_oprsz(desc);
    const uint32_t first_reg = extract32(desc, SIMD_DATA_SHIFT, 5);
    const bool keep_dest = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    const uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } tmp;
    size_t pos;

    /*
     * The result is assembled in a temporary because the destination
     * may overlap the table registers.  TBX starts from the current
     * destination contents, TBL from zero.  A full 16 bytes is
     * always staged; for oprsz == 8 the upper half of the register
     * is dealt with by clear_tail() at the end.
     */
    if (!keep_dest) {
        memset(&tmp, 0, 16);
    } else {
        memcpy(&tmp, vd, 16);
    }

    for (pos = 0; pos < opr_bytes; pos++) {
        uint32_t sel = idx_bytes[H1(pos)];
        const uint8_t *reg;

        /* Out-of-range indices leave the staged byte untouched. */
        if (sel >= table_len) {
            continue;
        }

        /*
         * The index is a byte offset into the concatenation of
         * 128-bit table registers: the upper bits choose the
         * register (wrapping from V31 back to V0), the low four
         * bits choose the byte within it.
         */
        reg = (const uint8_t *)aa64_vfp_qreg(env, (first_reg + sel / 16) % 32);
        tmp.b[H1(pos)] = reg[H1(sel % 16)];
    }

    memcpy(vd, &tmp, 16);
    clear_tail(vd, opr_bytes, simd_maxsz(desc));
}
#endif
2394 
2395 /*
2396  * NxN -> N highpart multiply
2397  *
2398  * TODO: expose this as a generic vector operation.
2399  */
2400 
2401 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2402 {
2403     intptr_t i, opr_sz = simd_oprsz(desc);
2404     int8_t *d = vd, *n = vn, *m = vm;
2405 
2406     for (i = 0; i < opr_sz; ++i) {
2407         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2408     }
2409     clear_tail(d, opr_sz, simd_maxsz(desc));
2410 }
2411 
2412 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2413 {
2414     intptr_t i, opr_sz = simd_oprsz(desc);
2415     int16_t *d = vd, *n = vn, *m = vm;
2416 
2417     for (i = 0; i < opr_sz / 2; ++i) {
2418         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2419     }
2420     clear_tail(d, opr_sz, simd_maxsz(desc));
2421 }
2422 
2423 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2424 {
2425     intptr_t i, opr_sz = simd_oprsz(desc);
2426     int32_t *d = vd, *n = vn, *m = vm;
2427 
2428     for (i = 0; i < opr_sz / 4; ++i) {
2429         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2430     }
2431     clear_tail(d, opr_sz, simd_maxsz(desc));
2432 }
2433 
2434 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2435 {
2436     intptr_t i, opr_sz = simd_oprsz(desc);
2437     uint64_t *d = vd, *n = vn, *m = vm;
2438     uint64_t discard;
2439 
2440     for (i = 0; i < opr_sz / 8; ++i) {
2441         muls64(&discard, &d[i], n[i], m[i]);
2442     }
2443     clear_tail(d, opr_sz, simd_maxsz(desc));
2444 }
2445 
2446 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2447 {
2448     intptr_t i, opr_sz = simd_oprsz(desc);
2449     uint8_t *d = vd, *n = vn, *m = vm;
2450 
2451     for (i = 0; i < opr_sz; ++i) {
2452         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2453     }
2454     clear_tail(d, opr_sz, simd_maxsz(desc));
2455 }
2456 
2457 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2458 {
2459     intptr_t i, opr_sz = simd_oprsz(desc);
2460     uint16_t *d = vd, *n = vn, *m = vm;
2461 
2462     for (i = 0; i < opr_sz / 2; ++i) {
2463         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2464     }
2465     clear_tail(d, opr_sz, simd_maxsz(desc));
2466 }
2467 
2468 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2469 {
2470     intptr_t i, opr_sz = simd_oprsz(desc);
2471     uint32_t *d = vd, *n = vn, *m = vm;
2472 
2473     for (i = 0; i < opr_sz / 4; ++i) {
2474         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2475     }
2476     clear_tail(d, opr_sz, simd_maxsz(desc));
2477 }
2478 
2479 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2480 {
2481     intptr_t i, opr_sz = simd_oprsz(desc);
2482     uint64_t *d = vd, *n = vn, *m = vm;
2483     uint64_t discard;
2484 
2485     for (i = 0; i < opr_sz / 8; ++i) {
2486         mulu64(&discard, &d[i], n[i], m[i]);
2487     }
2488     clear_tail(d, opr_sz, simd_maxsz(desc));
2489 }
2490 
2491 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2492 {
2493     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2494     int shr = simd_data(desc);
2495     uint64_t *d = vd, *n = vn, *m = vm;
2496 
2497     for (i = 0; i < opr_sz; ++i) {
2498         d[i] = ror64(n[i] ^ m[i], shr);
2499     }
2500     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2501 }
2502 
2503 /*
2504  * Integer matrix-multiply accumulate
2505  */
2506 
/*
 * Add to SUM the dot product of eight signed bytes at vn with eight
 * signed bytes at vm, and return the new sum.
 */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *lhs = vn;
    int8_t *rhs = vm;
    intptr_t k = 0;

    while (k < 8) {
        sum += lhs[H1(k)] * rhs[H1(k)];
        k++;
    }
    return sum;
}
2516 
/*
 * Add to SUM the dot product of eight unsigned bytes at vn with eight
 * unsigned bytes at vm, and return the new sum.
 */
static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *lhs = vn;
    uint8_t *rhs = vm;
    intptr_t k = 0;

    while (k < 8) {
        sum += lhs[H1(k)] * rhs[H1(k)];
        k++;
    }
    return sum;
}
2526 
/*
 * Add to SUM the dot product of eight unsigned bytes at vn with eight
 * signed bytes at vm, and return the new sum.
 */
static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *lhs = vn;
    int8_t *rhs = vm;
    intptr_t k = 0;

    while (k < 8) {
        sum += lhs[H1(k)] * rhs[H1(k)];
        k++;
    }
    return sum;
}
2537 
/*
 * Common body of the 8-bit integer matrix-multiply-accumulate helpers.
 *
 * Each 16-byte segment of va/vd is treated as a 2x2 matrix of 32-bit
 * accumulators.  Entry (row, col) starts from the matching element of
 * va and has inner_loop() applied to the 8 bytes of vn at offset
 * row * 8 and the 8 bytes of vm at offset col * 8 within the segment.
 */
static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    const intptr_t opr_sz = simd_oprsz(desc);
    intptr_t seg;

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *dst = (uint32_t *)((char *)vd + seg);
        uint32_t *acc_in = (uint32_t *)((char *)va + seg);
        char *n_seg = (char *)vn + seg;
        char *m_seg = (char *)vm + seg;
        uint32_t acc[4];
        int row, col, e;

        /*
         * Consume every input of the segment before storing anything,
         * since the destination may overlap an input.
         */
        for (row = 0; row < 2; row++) {
            for (col = 0; col < 2; col++) {
                e = 2 * row + col;
                acc[e] = inner_loop(acc_in[H4(e)],
                                    n_seg + 8 * row, m_seg + 8 * col);
            }
        }

        for (e = 0; e < 4; e++) {
            dst[H4(e)] = acc[e];
        }
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}
2571 
2572 #define DO_MMLA_B(NAME, INNER) \
2573     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2574     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2575 
2576 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2577 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2578 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2579 
2580 /*
2581  * BFloat16 Dot Product
2582  */
2583 
2584 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2585 {
2586     /* FPCR is ignored for BFDOT and BFMMLA. */
2587     float_status bf_status = {
2588         .tininess_before_rounding = float_tininess_before_rounding,
2589         .float_rounding_mode = float_round_to_odd_inf,
2590         .flush_to_zero = true,
2591         .flush_inputs_to_zero = true,
2592         .default_nan_mode = true,
2593     };
2594     float32 t1, t2;
2595 
2596     /*
2597      * Extract each BFloat16 from the element pair, and shift
2598      * them such that they become float32.
2599      */
2600     t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2601     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2602     t1 = float32_add(t1, t2, &bf_status);
2603     t1 = float32_add(sum, t1, &bf_status);
2604 
2605     return t1;
2606 }
2607 
2608 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2609 {
2610     intptr_t i, opr_sz = simd_oprsz(desc);
2611     float32 *d = vd, *a = va;
2612     uint32_t *n = vn, *m = vm;
2613 
2614     for (i = 0; i < opr_sz / 4; ++i) {
2615         d[i] = bfdotadd(a[i], n[i], m[i]);
2616     }
2617     clear_tail(d, opr_sz, simd_maxsz(desc));
2618 }
2619 
2620 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2621                             void *va, uint32_t desc)
2622 {
2623     intptr_t i, j, opr_sz = simd_oprsz(desc);
2624     intptr_t index = simd_data(desc);
2625     intptr_t elements = opr_sz / 4;
2626     intptr_t eltspersegment = MIN(16 / 4, elements);
2627     float32 *d = vd, *a = va;
2628     uint32_t *n = vn, *m = vm;
2629 
2630     for (i = 0; i < elements; i += eltspersegment) {
2631         uint32_t m_idx = m[i + H4(index)];
2632 
2633         for (j = i; j < i + eltspersegment; j++) {
2634             d[j] = bfdotadd(a[j], n[j], m_idx);
2635         }
2636     }
2637     clear_tail(d, opr_sz, simd_maxsz(desc));
2638 }
2639 
2640 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2641 {
2642     intptr_t s, opr_sz = simd_oprsz(desc);
2643     float32 *d = vd, *a = va;
2644     uint32_t *n = vn, *m = vm;
2645 
2646     for (s = 0; s < opr_sz / 4; s += 4) {
2647         float32 sum00, sum01, sum10, sum11;
2648 
2649         /*
2650          * Process the entire segment at once, writing back the
2651          * results only after we've consumed all of the inputs.
2652          *
2653          * Key to indices by column:
2654          *               i   j           i   k             j   k
2655          */
2656         sum00 = a[s + H4(0 + 0)];
2657         sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2658         sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2659 
2660         sum01 = a[s + H4(0 + 1)];
2661         sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2662         sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2663 
2664         sum10 = a[s + H4(2 + 0)];
2665         sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2666         sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2667 
2668         sum11 = a[s + H4(2 + 1)];
2669         sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2670         sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2671 
2672         d[s + H4(0 + 0)] = sum00;
2673         d[s + H4(0 + 1)] = sum01;
2674         d[s + H4(2 + 0)] = sum10;
2675         d[s + H4(2 + 1)] = sum11;
2676     }
2677     clear_tail(d, opr_sz, simd_maxsz(desc));
2678 }
2679 
2680 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2681                          void *stat, uint32_t desc)
2682 {
2683     intptr_t i, opr_sz = simd_oprsz(desc);
2684     intptr_t sel = simd_data(desc);
2685     float32 *d = vd, *a = va;
2686     bfloat16 *n = vn, *m = vm;
2687 
2688     for (i = 0; i < opr_sz / 4; ++i) {
2689         float32 nn = n[H2(i * 2 + sel)] << 16;
2690         float32 mm = m[H2(i * 2 + sel)] << 16;
2691         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2692     }
2693     clear_tail(d, opr_sz, simd_maxsz(desc));
2694 }
2695 
2696 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2697                              void *va, void *stat, uint32_t desc)
2698 {
2699     intptr_t i, j, opr_sz = simd_oprsz(desc);
2700     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2701     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2702     intptr_t elements = opr_sz / 4;
2703     intptr_t eltspersegment = MIN(16 / 4, elements);
2704     float32 *d = vd, *a = va;
2705     bfloat16 *n = vn, *m = vm;
2706 
2707     for (i = 0; i < elements; i += eltspersegment) {
2708         float32 m_idx = m[H2(2 * i + index)] << 16;
2709 
2710         for (j = i; j < i + eltspersegment; j++) {
2711             float32 n_j = n[H2(2 * j + sel)] << 16;
2712             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2713         }
2714     }
2715     clear_tail(d, opr_sz, simd_maxsz(desc));
2716 }
2717 
2718 #define DO_CLAMP(NAME, TYPE) \
2719 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
2720 {                                                                       \
2721     intptr_t i, opr_sz = simd_oprsz(desc);                              \
2722     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
2723         TYPE aa = *(TYPE *)(a + i);                                     \
2724         TYPE nn = *(TYPE *)(n + i);                                     \
2725         TYPE mm = *(TYPE *)(m + i);                                     \
2726         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
2727         *(TYPE *)(d + i) = dd;                                          \
2728     }                                                                   \
2729     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
2730 }
2731 
2732 DO_CLAMP(gvec_sclamp_b, int8_t)
2733 DO_CLAMP(gvec_sclamp_h, int16_t)
2734 DO_CLAMP(gvec_sclamp_s, int32_t)
2735 DO_CLAMP(gvec_sclamp_d, int64_t)
2736 
2737 DO_CLAMP(gvec_uclamp_b, uint8_t)
2738 DO_CLAMP(gvec_uclamp_h, uint16_t)
2739 DO_CLAMP(gvec_uclamp_s, uint32_t)
2740 DO_CLAMP(gvec_uclamp_d, uint64_t)
2741