xref: /openbmc/qemu/target/arm/tcg/vec_helper.c (revision 4fe068fa)
/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
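
/*
 * For example, predicate byte 0x0f (byte elements 0-3 active) expands to
 *   expand_pred_b_data[0x0f] == 0x00000000ffffffff
 * and predicate bits 0x05 for halfword elements (halfwords 0-1 active) to
 *   expand_pred_h_data[0x05] == 0x00000000ffffffff
 * so the active elements of a 64-bit data word can be selected with a
 * single AND against the expanded mask.
 */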

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
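
/*
 * Worked example of the saturation path: with src1 = src2 = INT8_MIN,
 * src3 = 0 and round = true, ret = 16384 + 64 = 16448, and
 * 16448 >> 7 = 128, which does not fit in int8_t, so the result
 * saturates to INT8_MAX (127).
 */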

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}
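
/*
 * The 64-bit case cannot use a wider native type: the raw 64x64
 * product already needs up to 127 bits (INT64_MIN * INT64_MIN is
 * 2^126), so the sum is formed in Int128 and do_sat128_d() checks
 * that the high half is just the sign extension of the low half
 * before narrowing back to 64 bits.
 */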

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
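
/*
 * E.g. gvec_sdot_b computes, for each 32-bit lane i,
 *   d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *               + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 * i.e. four signed 8-bit products summed into one 32-bit accumulator,
 * which is why element order within the quad-width lane is irrelevant.
 */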

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + 4;                                                   \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
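
/*
 * In the indexed forms the index selects one group of four TYPEM
 * elements within each 128-bit segment of vm.  E.g. gvec_sdot_idx_b
 * with index 2 multiplies every group of four bytes of n against
 * bytes 8-11 of the corresponding segment of vm.
 */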

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
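
/*
 * The XOR above selects between the two FCADD rotations: negating the
 * imaginary operand added into the real lane gives rotate #90, while
 * negating the real operand added into the imaginary lane gives
 * rotate #270.  Flipping the sign bit directly negates the value
 * without raising any floating-point exceptions.
 */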

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
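
/*
 * Across the fcmla helpers, flip selects whether the real or the
 * imaginary part of n feeds both products, and neg_real/neg_imag
 * supply the sign pattern; together these encode the four FCMLA
 * rotations.  E.g. with flip = 0 and both negations clear this is
 * rotate #0:
 *   d[re] = a[re] + Re(n) * Re(m),  d[im] = a[im] + Re(n) * Im(m).
 */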

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0
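
/*
 * The REV direction builds the remaining compare-with-zero helpers by
 * swapping operands: e.g. float32_clt0(op) expands to
 * float32_cgt(float32_zero, op), computing "op < 0" as "0 > op" with
 * the same underlying softfloat comparison.
 */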

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}
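
/*
 * The fused and non-fused forms differ observably in rounding: the
 * _nf helpers round the product before the add, the fused helpers
 * round only once.  E.g. in float32 with op1 = op2 = 1 + 2^-12 and
 * dest = -(1 + 2^-11), the exact product 1 + 2^-11 + 2^-24 rounds to
 * 1 + 2^-11, so muladd_nf yields 0 while the fused muladd yields the
 * exact 2^-24.
 */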

#define DO_MULADD(NAME, FUNC, TYPE)                                     \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
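
/*
 * E.g. for gvec_mul_idx_s on a 32-byte vector with index 1, lanes 0-3
 * are multiplied by m[1] and lanes 4-7 by m[5]: the index picks the
 * same element position within each 128-bit segment.
 */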
1394 
1395 #define DO_MUL_IDX(NAME, TYPE, H) \
1396 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1397 {                                                                          \
1398     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1399     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1400     intptr_t idx = simd_data(desc);                                        \
1401     TYPE *d = vd, *n = vn, *m = vm;                                        \
1402     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1403         TYPE mm = m[H(i + idx)];                                           \
1404         for (j = 0; j < segment; j++) {                                    \
1405             d[i + j] = n[i + j] * mm;                                      \
1406         }                                                                  \
1407     }                                                                      \
1408     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1409 }
1410 
1411 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1412 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1413 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1414 
1415 #undef DO_MUL_IDX
1416 
1417 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1418 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1419 {                                                                          \
1420     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1421     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1422     intptr_t idx = simd_data(desc);                                        \
1423     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1424     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1425         TYPE mm = m[H(i + idx)];                                           \
1426         for (j = 0; j < segment; j++) {                                    \
1427             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1428         }                                                                  \
1429     }                                                                      \
1430     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1431 }
1432 
1433 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1434 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1435 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1436 
1437 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1438 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1439 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1440 
1441 #undef DO_MLA_IDX
1442 
1443 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1444 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1445 {                                                                          \
1446     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1447     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1448     intptr_t idx = simd_data(desc);                                        \
1449     TYPE *d = vd, *n = vn, *m = vm;                                        \
1450     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1451         TYPE mm = m[H(i + idx)];                                           \
1452         for (j = 0; j < segment; j++) {                                    \
1453             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1454         }                                                                  \
1455     }                                                                      \
1456     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1457 }
1458 
1459 #define nop(N, M, S) (M)
1460 
1461 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1462 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1463 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1464 
1465 #ifdef TARGET_AARCH64
1466 
1467 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1468 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1469 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1470 
1471 #endif
1472 
1473 #undef nop
1474 
1475 /*
1476  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1477  * the fused ops below, these accumulate both from and into Vd.
1478  */
1479 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1480 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1481 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1482 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1483 
1484 #undef DO_FMUL_IDX
1485 
1486 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1487 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1488                   void *stat, uint32_t desc)                               \
1489 {                                                                          \
1490     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1491     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1492     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1493     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1494     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1495     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1496     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1497         TYPE mm = m[H(i + idx)];                                           \
1498         for (j = 0; j < segment; j++) {                                    \
1499             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1500                                      mm, a[i + j], 0, stat);               \
1501         }                                                                  \
1502     }                                                                      \
1503     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1504 }
1505 
1506 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1507 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1508 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1509 
1510 #undef DO_FMLA_IDX
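
/*
 * The op1_neg trick above exploits IEEE-754 negation being a pure
 * sign-bit flip: XORing 1 << (bits - 1) into the first multiplicand
 * turns the fused multiply-add into a multiply-subtract without a
 * separate softfloat negate (float16/32/64 are integer typedefs).
 * A minimal float32 sketch with a hypothetical name:
 */
static float32 example_fmls_via_sign_flip(float32 n, float32 m, float32 a,
                                          float_status *stat)
{
    /* Computes (-n * m) + a, the form generated by DO_FMLA_IDX. */
    return float32_muladd(n ^ 0x80000000u, m, a, 0, stat);
}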
1511 
1512 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1513 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1514 {                                                                          \
1515     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1516     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1517     bool q = false;                                                        \
1518     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1519         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1520         if (dd < MIN) {                                                    \
1521             dd = MIN;                                                      \
1522             q = true;                                                      \
1523         } else if (dd > MAX) {                                             \
1524             dd = MAX;                                                      \
1525             q = true;                                                      \
1526         }                                                                  \
1527         d[i] = dd;                                                         \
1528     }                                                                      \
1529     if (q) {                                                               \
1530         uint32_t *qc = vq;                                                 \
1531         qc[0] = 1;                                                         \
1532     }                                                                      \
1533     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1534 }
1535 
1536 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1537 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1538 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1539 
1540 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1541 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1542 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1543 
1544 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1545 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1546 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1547 
1548 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1549 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1550 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1551 
1552 #undef DO_SAT
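
/*
 * The WTYPE parameter above only needs to be wide enough that OP
 * cannot wrap: plain int covers the 8- and 16-bit lanes, while the
 * 32-bit lanes need int64_t.  A minimal scalar model of one
 * gvec_uqadd_b lane, with a hypothetical name:
 */
static uint8_t example_uqadd8(uint8_t n, uint8_t m, bool *qc)
{
    int dd = (int)n + m;        /* cannot overflow in int */

    if (dd > UINT8_MAX) {
        *qc = true;             /* sticky QC saturation flag */
        return UINT8_MAX;
    }
    return dd;
}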
1553 
1554 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1555                           void *vm, uint32_t desc)
1556 {
1557     intptr_t i, oprsz = simd_oprsz(desc);
1558     uint64_t *d = vd, *n = vn, *m = vm;
1559     bool q = false;
1560 
1561     for (i = 0; i < oprsz / 8; i++) {
1562         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1563         if (dd < nn) {
1564             dd = UINT64_MAX;
1565             q = true;
1566         }
1567         d[i] = dd;
1568     }
1569     if (q) {
1570         uint32_t *qc = vq;
1571         qc[0] = 1;
1572     }
1573     clear_tail(d, oprsz, simd_maxsz(desc));
1574 }
1575 
1576 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1577                           void *vm, uint32_t desc)
1578 {
1579     intptr_t i, oprsz = simd_oprsz(desc);
1580     uint64_t *d = vd, *n = vn, *m = vm;
1581     bool q = false;
1582 
1583     for (i = 0; i < oprsz / 8; i++) {
1584         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1585         if (nn < mm) {
1586             dd = 0;
1587             q = true;
1588         }
1589         d[i] = dd;
1590     }
1591     if (q) {
1592         uint32_t *qc = vq;
1593         qc[0] = 1;
1594     }
1595     clear_tail(d, oprsz, simd_maxsz(desc));
1596 }
1597 
1598 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1599                           void *vm, uint32_t desc)
1600 {
1601     intptr_t i, oprsz = simd_oprsz(desc);
1602     int64_t *d = vd, *n = vn, *m = vm;
1603     bool q = false;
1604 
1605     for (i = 0; i < oprsz / 8; i++) {
1606         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1607         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1608             dd = (nn >> 63) ^ ~INT64_MIN;
1609             q = true;
1610         }
1611         d[i] = dd;
1612     }
1613     if (q) {
1614         uint32_t *qc = vq;
1615         qc[0] = 1;
1616     }
1617     clear_tail(d, oprsz, simd_maxsz(desc));
1618 }
1619 
1620 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1621                           void *vm, uint32_t desc)
1622 {
1623     intptr_t i, oprsz = simd_oprsz(desc);
1624     int64_t *d = vd, *n = vn, *m = vm;
1625     bool q = false;
1626 
1627     for (i = 0; i < oprsz / 8; i++) {
1628         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1629         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1630             dd = (nn >> 63) ^ ~INT64_MIN;
1631             q = true;
1632         }
1633         d[i] = dd;
1634     }
1635     if (q) {
1636         uint32_t *qc = vq;
1637         qc[0] = 1;
1638     }
1639     clear_tail(d, oprsz, simd_maxsz(desc));
1640 }
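
/*
 * Notes on the 64-bit signed saturation idiom above, where no wider
 * type is available: addition overflows only when both operands have
 * the same sign and the result's sign differs, hence testing the top
 * bit of (dd ^ nn) & ~(nn ^ mm); subtraction overflows only when the
 * operands' signs differ, hence the (nn ^ mm) variant.  On overflow,
 * (nn >> 63) ^ ~INT64_MIN maps a non-negative nn to INT64_MAX and a
 * negative nn to INT64_MIN.  A minimal sketch with a hypothetical
 * name, using unsigned addition so the wraparound is well defined:
 */
static int64_t example_sqadd64(int64_t nn, int64_t mm, bool *qc)
{
    int64_t dd = (int64_t)((uint64_t)nn + (uint64_t)mm);

    if (((dd ^ nn) & ~(nn ^ mm)) < 0) {        /* sign bit set? */
        *qc = true;
        return (nn >> 63) ^ ~INT64_MIN;        /* INT64_MAX or INT64_MIN */
    }
    return dd;
}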
1641 
1642 
1643 #define DO_SRA(NAME, TYPE)                              \
1644 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1645 {                                                       \
1646     intptr_t i, oprsz = simd_oprsz(desc);               \
1647     int shift = simd_data(desc);                        \
1648     TYPE *d = vd, *n = vn;                              \
1649     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1650         d[i] += n[i] >> shift;                          \
1651     }                                                   \
1652     clear_tail(d, oprsz, simd_maxsz(desc));             \
1653 }
1654 
1655 DO_SRA(gvec_ssra_b, int8_t)
1656 DO_SRA(gvec_ssra_h, int16_t)
1657 DO_SRA(gvec_ssra_s, int32_t)
1658 DO_SRA(gvec_ssra_d, int64_t)
1659 
1660 DO_SRA(gvec_usra_b, uint8_t)
1661 DO_SRA(gvec_usra_h, uint16_t)
1662 DO_SRA(gvec_usra_s, uint32_t)
1663 DO_SRA(gvec_usra_d, uint64_t)
1664 
1665 #undef DO_SRA
1666 
1667 #define DO_RSHR(NAME, TYPE)                             \
1668 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1669 {                                                       \
1670     intptr_t i, oprsz = simd_oprsz(desc);               \
1671     int shift = simd_data(desc);                        \
1672     TYPE *d = vd, *n = vn;                              \
1673     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1674         TYPE tmp = n[i] >> (shift - 1);                 \
1675         d[i] = (tmp >> 1) + (tmp & 1);                  \
1676     }                                                   \
1677     clear_tail(d, oprsz, simd_maxsz(desc));             \
1678 }
1679 
1680 DO_RSHR(gvec_srshr_b, int8_t)
1681 DO_RSHR(gvec_srshr_h, int16_t)
1682 DO_RSHR(gvec_srshr_s, int32_t)
1683 DO_RSHR(gvec_srshr_d, int64_t)
1684 
1685 DO_RSHR(gvec_urshr_b, uint8_t)
1686 DO_RSHR(gvec_urshr_h, uint16_t)
1687 DO_RSHR(gvec_urshr_s, uint32_t)
1688 DO_RSHR(gvec_urshr_d, uint64_t)
1689 
1690 #undef DO_RSHR
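
/*
 * The two-step shift above computes a rounding (round-half-up) right
 * shift without the overflow that adding 1 << (shift - 1) before
 * shifting could cause in the element type: bit 0 of tmp is exactly
 * the last bit shifted out, i.e. the rounding bit.  For example,
 * n = 7, shift = 2 gives tmp = 3 and a result of (3 >> 1) + 1 = 2,
 * matching round(7 / 4).  A minimal scalar form with a hypothetical
 * name:
 */
static int32_t example_srshr32(int32_t n, int shift)   /* 1 <= shift <= 32 */
{
    int32_t tmp = n >> (shift - 1);
    return (tmp >> 1) + (tmp & 1);
}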
1691 
1692 #define DO_RSRA(NAME, TYPE)                             \
1693 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1694 {                                                       \
1695     intptr_t i, oprsz = simd_oprsz(desc);               \
1696     int shift = simd_data(desc);                        \
1697     TYPE *d = vd, *n = vn;                              \
1698     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1699         TYPE tmp = n[i] >> (shift - 1);                 \
1700         d[i] += (tmp >> 1) + (tmp & 1);                 \
1701     }                                                   \
1702     clear_tail(d, oprsz, simd_maxsz(desc));             \
1703 }
1704 
1705 DO_RSRA(gvec_srsra_b, int8_t)
1706 DO_RSRA(gvec_srsra_h, int16_t)
1707 DO_RSRA(gvec_srsra_s, int32_t)
1708 DO_RSRA(gvec_srsra_d, int64_t)
1709 
1710 DO_RSRA(gvec_ursra_b, uint8_t)
1711 DO_RSRA(gvec_ursra_h, uint16_t)
1712 DO_RSRA(gvec_ursra_s, uint32_t)
1713 DO_RSRA(gvec_ursra_d, uint64_t)
1714 
1715 #undef DO_RSRA
1716 
1717 #define DO_SRI(NAME, TYPE)                              \
1718 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1719 {                                                       \
1720     intptr_t i, oprsz = simd_oprsz(desc);               \
1721     int shift = simd_data(desc);                        \
1722     TYPE *d = vd, *n = vn;                              \
1723     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1724         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1725     }                                                   \
1726     clear_tail(d, oprsz, simd_maxsz(desc));             \
1727 }
1728 
1729 DO_SRI(gvec_sri_b, uint8_t)
1730 DO_SRI(gvec_sri_h, uint16_t)
1731 DO_SRI(gvec_sri_s, uint32_t)
1732 DO_SRI(gvec_sri_d, uint64_t)
1733 
1734 #undef DO_SRI
1735 
1736 #define DO_SLI(NAME, TYPE)                              \
1737 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1738 {                                                       \
1739     intptr_t i, oprsz = simd_oprsz(desc);               \
1740     int shift = simd_data(desc);                        \
1741     TYPE *d = vd, *n = vn;                              \
1742     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1743         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1744     }                                                   \
1745     clear_tail(d, oprsz, simd_maxsz(desc));             \
1746 }
1747 
1748 DO_SLI(gvec_sli_b, uint8_t)
1749 DO_SLI(gvec_sli_h, uint16_t)
1750 DO_SLI(gvec_sli_s, uint32_t)
1751 DO_SLI(gvec_sli_d, uint64_t)
1752 
1753 #undef DO_SLI
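
/*
 * SRI and SLI above are insertions rather than plain shifts: the
 * destination bits outside the deposited field are preserved, which
 * is why the helpers read d[i] as well as n[i].  A minimal 8-bit SRI
 * step with a hypothetical name, written without deposit64 to make
 * the masking explicit (valid for 1 <= shift <= 7):
 */
static uint8_t example_sri8(uint8_t d, uint8_t n, int shift)
{
    uint8_t mask = 0xff >> shift;          /* bits receiving n >> shift */
    return (d & ~mask) | ((n >> shift) & mask);
}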
1754 
1755 /*
1756  * Convert float16 to float32, raising no exceptions and
1757  * preserving exceptional values, including SNaN.
1758  * This is effectively an unpack+repack operation.
1759  */
1760 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1761 {
1762     const int f16_bias = 15;
1763     const int f32_bias = 127;
1764     uint32_t sign = extract32(f16, 15, 1);
1765     uint32_t exp = extract32(f16, 10, 5);
1766     uint32_t frac = extract32(f16, 0, 10);
1767 
1768     if (exp == 0x1f) {
1769         /* Inf or NaN */
1770         exp = 0xff;
1771     } else if (exp == 0) {
1772         /* Zero or denormal.  */
1773         if (frac != 0) {
1774             if (fz16) {
1775                 frac = 0;
1776             } else {
1777                 /*
1778                  * Denormal; these are all normal float32.
1779                  * Shift the fraction so that the msb is at bit 11,
1780                  * Shift the fraction so that the msb is at bit 10,
1781                  * then remove bit 10 as the implicit bit of the
1782                  * the shift for normal numbers below, to put the
1783                  * float32 fraction at the right place.
1784                  */
1785                 int shift = clz32(frac) - 21;
1786                 frac = (frac << shift) & 0x3ff;
1787                 exp = f32_bias - f16_bias - shift + 1;
1788             }
1789         }
1790     } else {
1791         /* Normal number; adjust the bias.  */
1792         exp += f32_bias - f16_bias;
1793     }
1794     sign <<= 31;
1795     exp <<= 23;
1796     frac <<= 23 - 10;
1797 
1798     return sign | exp | frac;
1799 }
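
/*
 * Worked denormal example for the conversion above: f16 0x0200 has
 * frac = 0x200 and value 2**-15; clz32(frac) = 22, so shift = 1,
 * frac << 1 = 0x400 is masked to 0, and exp = 127 - 15 - 1 + 1 = 112,
 * giving float32 0x38000000, which is exactly 2**-15.
 */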
1800 
1801 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
1802 {
1803     /*
1804      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
1805      * Load the 2nd qword iff is_q & is_2.
1806      * Shift to the 2nd dword iff !is_q & is_2.
1807      * For !is_q & !is_2, the upper bits of the result are garbage.
1808      */
1809     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1810 }
1811 
1812 /*
1813  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1814  * as there are not yet SVE versions that might use blocking.
1815  */
1816 
1817 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1818                      uint32_t desc, bool fz16)
1819 {
1820     intptr_t i, oprsz = simd_oprsz(desc);
1821     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1822     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1823     int is_q = oprsz == 16;
1824     uint64_t n_4, m_4;
1825 
1826     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1827     n_4 = load4_f16(vn, is_q, is_2);
1828     m_4 = load4_f16(vm, is_q, is_2);
1829 
1830     /* Negate all inputs for FMLSL at once.  */
1831     if (is_s) {
1832         n_4 ^= 0x8000800080008000ull;
1833     }
1834 
1835     for (i = 0; i < oprsz / 4; i++) {
1836         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1837         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1838         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1839     }
1840     clear_tail(d, oprsz, simd_maxsz(desc));
1841 }
1842 
1843 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1844                             void *venv, uint32_t desc)
1845 {
1846     CPUARMState *env = venv;
1847     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1848              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1849 }
1850 
1851 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1852                             void *venv, uint32_t desc)
1853 {
1854     CPUARMState *env = venv;
1855     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1856              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1857 }
1858 
1859 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1860                                void *venv, uint32_t desc)
1861 {
1862     intptr_t i, oprsz = simd_oprsz(desc);
1863     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1864     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1865     CPUARMState *env = venv;
1866     float_status *status = &env->vfp.fp_status;
1867     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1868 
1869     for (i = 0; i < oprsz; i += sizeof(float32)) {
1870         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1871         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1872         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1873         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1874         float32 aa = *(float32 *)(va + H1_4(i));
1875 
1876         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1877     }
1878 }
1879 
1880 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1881                          uint32_t desc, bool fz16)
1882 {
1883     intptr_t i, oprsz = simd_oprsz(desc);
1884     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1885     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1886     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1887     int is_q = oprsz == 16;
1888     uint64_t n_4;
1889     float32 m_1;
1890 
1891     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1892     n_4 = load4_f16(vn, is_q, is_2);
1893 
1894     /* Negate all inputs for FMLSL at once.  */
1895     if (is_s) {
1896         n_4 ^= 0x8000800080008000ull;
1897     }
1898 
1899     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1900 
1901     for (i = 0; i < oprsz / 4; i++) {
1902         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1903         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1904     }
1905     clear_tail(d, oprsz, simd_maxsz(desc));
1906 }
1907 
1908 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1909                                 void *venv, uint32_t desc)
1910 {
1911     CPUARMState *env = venv;
1912     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1913                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1914 }
1915 
1916 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1917                                 void *venv, uint32_t desc)
1918 {
1919     CPUARMState *env = venv;
1920     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1921                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1922 }
1923 
1924 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1925                                void *venv, uint32_t desc)
1926 {
1927     intptr_t i, j, oprsz = simd_oprsz(desc);
1928     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1929     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1930     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1931     CPUARMState *env = venv;
1932     float_status *status = &env->vfp.fp_status;
1933     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1934 
1935     for (i = 0; i < oprsz; i += 16) {
1936         float16 mm_16 = *(float16 *)(vm + i + idx);
1937         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1938 
1939         for (j = 0; j < 16; j += sizeof(float32)) {
1940             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1941             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1942             float32 aa = *(float32 *)(va + H1_4(i + j));
1943 
1944             *(float32 *)(vd + H1_4(i + j)) =
1945                 float32_muladd(nn, mm, aa, 0, status);
1946         }
1947     }
1948 }
1949 
1950 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1951 {
1952     intptr_t i, opr_sz = simd_oprsz(desc);
1953     int8_t *d = vd, *n = vn, *m = vm;
1954 
1955     for (i = 0; i < opr_sz; ++i) {
1956         int8_t mm = m[i];
1957         int8_t nn = n[i];
1958         int8_t res = 0;
1959         if (mm >= 0) {
1960             if (mm < 8) {
1961                 res = nn << mm;
1962             }
1963         } else {
1964             res = nn >> (mm > -8 ? -mm : 7);
1965         }
1966         d[i] = res;
1967     }
1968     clear_tail(d, opr_sz, simd_maxsz(desc));
1969 }
1970 
1971 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1972 {
1973     intptr_t i, opr_sz = simd_oprsz(desc);
1974     int16_t *d = vd, *n = vn, *m = vm;
1975 
1976     for (i = 0; i < opr_sz / 2; ++i) {
1977         int8_t mm = m[i];   /* only 8 bits of shift are significant */
1978         int16_t nn = n[i];
1979         int16_t res = 0;
1980         if (mm >= 0) {
1981             if (mm < 16) {
1982                 res = nn << mm;
1983             }
1984         } else {
1985             res = nn >> (mm > -16 ? -mm : 15);
1986         }
1987         d[i] = res;
1988     }
1989     clear_tail(d, opr_sz, simd_maxsz(desc));
1990 }
1991 
1992 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1993 {
1994     intptr_t i, opr_sz = simd_oprsz(desc);
1995     uint8_t *d = vd, *n = vn, *m = vm;
1996 
1997     for (i = 0; i < opr_sz; ++i) {
1998         int8_t mm = m[i];
1999         uint8_t nn = n[i];
2000         uint8_t res = 0;
2001         if (mm >= 0) {
2002             if (mm < 8) {
2003                 res = nn << mm;
2004             }
2005         } else {
2006             if (mm > -8) {
2007                 res = nn >> -mm;
2008             }
2009         }
2010         d[i] = res;
2011     }
2012     clear_tail(d, opr_sz, simd_maxsz(desc));
2013 }
2014 
2015 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2016 {
2017     intptr_t i, opr_sz = simd_oprsz(desc);
2018     uint16_t *d = vd, *n = vn, *m = vm;
2019 
2020     for (i = 0; i < opr_sz / 2; ++i) {
2021         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2022         uint16_t nn = n[i];
2023         uint16_t res = 0;
2024         if (mm >= 0) {
2025             if (mm < 16) {
2026                 res = nn << mm;
2027             }
2028         } else {
2029             if (mm > -16) {
2030                 res = nn >> -mm;
2031             }
2032         }
2033         d[i] = res;
2034     }
2035     clear_tail(d, opr_sz, simd_maxsz(desc));
2036 }
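
/*
 * In the four helpers above the shift count is the low signed byte of
 * each m element: non-negative counts shift left, negative counts
 * shift right, and out-of-range counts drain to zero (or to the sign
 * bit for the arithmetic forms).  A minimal scalar model of 16-bit
 * USHL with a hypothetical name:
 */
static uint16_t example_ushl16(uint16_t nn, int8_t mm)
{
    if (mm >= 0) {
        return mm < 16 ? nn << mm : 0;
    }
    return mm > -16 ? nn >> -mm : 0;
}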
2037 
2038 /*
2039  * 8x8->8 polynomial multiply.
2040  *
2041  * Polynomial multiplication is like integer multiplication except the
2042  * partial products are XORed, not added.
2043  *
2044  * TODO: expose this as a generic vector operation, as it is a common
2045  * crypto building block.
2046  */
2047 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2048 {
2049     intptr_t i, opr_sz = simd_oprsz(desc);
2050     uint64_t *d = vd, *n = vn, *m = vm;
2051 
2052     for (i = 0; i < opr_sz / 8; ++i) {
2053         d[i] = clmul_8x8_low(n[i], m[i]);
2054     }
2055     clear_tail(d, opr_sz, simd_maxsz(desc));
2056 }
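
/*
 * For reference, the carry-less product computed per lane by
 * clmul_8x8_low above, keeping only the low 8 bits of each result.
 * A minimal bitwise sketch with a hypothetical name:
 */
static uint8_t example_pmul8(uint8_t n, uint8_t m)
{
    uint8_t r = 0;

    for (int i = 0; i < 8; i++) {
        if (m & (1 << i)) {
            r ^= n << i;        /* XOR, not add, the partial product */
        }
    }
    return r;
}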
2057 
2058 /*
2059  * 64x64->128 polynomial multiply.
2060  * Because the lanes are not accessed in strict columns,
2061  * this probably cannot be turned into a generic helper.
2062  */
2063 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2064 {
2065     intptr_t i, opr_sz = simd_oprsz(desc);
2066     intptr_t hi = simd_data(desc);
2067     uint64_t *d = vd, *n = vn, *m = vm;
2068 
2069     for (i = 0; i < opr_sz / 8; i += 2) {
2070         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2071         d[i] = int128_getlo(r);
2072         d[i + 1] = int128_gethi(r);
2073     }
2074     clear_tail(d, opr_sz, simd_maxsz(desc));
2075 }
2076 
2077 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2078 {
2079     int hi = simd_data(desc);
2080     uint64_t *d = vd, *n = vn, *m = vm;
2081     uint64_t nn = n[hi], mm = m[hi];
2082 
2083     d[0] = clmul_8x4_packed(nn, mm);
2084     nn >>= 32;
2085     mm >>= 32;
2086     d[1] = clmul_8x4_packed(nn, mm);
2087 
2088     clear_tail(d, 16, simd_maxsz(desc));
2089 }
2090 
2091 #ifdef TARGET_AARCH64
2092 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2093 {
2094     int shift = simd_data(desc) * 8;
2095     intptr_t i, opr_sz = simd_oprsz(desc);
2096     uint64_t *d = vd, *n = vn, *m = vm;
2097 
2098     for (i = 0; i < opr_sz / 8; ++i) {
2099         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2100     }
2101 }
2102 
2103 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2104 {
2105     intptr_t sel = H4(simd_data(desc));
2106     intptr_t i, opr_sz = simd_oprsz(desc);
2107     uint32_t *n = vn, *m = vm;
2108     uint64_t *d = vd;
2109 
2110     for (i = 0; i < opr_sz / 8; ++i) {
2111         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2112     }
2113 }
2114 #endif
2115 
2116 #define DO_CMP0(NAME, TYPE, OP)                         \
2117 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2118 {                                                       \
2119     intptr_t i, opr_sz = simd_oprsz(desc);              \
2120     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2121         TYPE nn = *(TYPE *)(vn + i);                    \
2122         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2123     }                                                   \
2124     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2125 }
2126 
2127 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2128 DO_CMP0(gvec_clt0_b, int8_t, <)
2129 DO_CMP0(gvec_cle0_b, int8_t, <=)
2130 DO_CMP0(gvec_cgt0_b, int8_t, >)
2131 DO_CMP0(gvec_cge0_b, int8_t, >=)
2132 
2133 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2134 DO_CMP0(gvec_clt0_h, int16_t, <)
2135 DO_CMP0(gvec_cle0_h, int16_t, <=)
2136 DO_CMP0(gvec_cgt0_h, int16_t, >)
2137 DO_CMP0(gvec_cge0_h, int16_t, >=)
2138 
2139 #undef DO_CMP0
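
/*
 * DO_CMP0 relies on the comparison evaluating to 0 or 1 and on two's
 * complement negation: -(nn OP 0) is therefore 0 or all-ones, giving
 * the full per-lane true/false mask the architecture requires (e.g.
 * CMGT of a positive byte against zero writes 0xff).
 */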
2140 
2141 #define DO_ABD(NAME, TYPE)                                      \
2142 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2143 {                                                               \
2144     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2145     TYPE *d = vd, *n = vn, *m = vm;                             \
2146                                                                 \
2147     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2148         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2149     }                                                           \
2150     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2151 }
2152 
2153 DO_ABD(gvec_sabd_b, int8_t)
2154 DO_ABD(gvec_sabd_h, int16_t)
2155 DO_ABD(gvec_sabd_s, int32_t)
2156 DO_ABD(gvec_sabd_d, int64_t)
2157 
2158 DO_ABD(gvec_uabd_b, uint8_t)
2159 DO_ABD(gvec_uabd_h, uint16_t)
2160 DO_ABD(gvec_uabd_s, uint32_t)
2161 DO_ABD(gvec_uabd_d, uint64_t)
2162 
2163 #undef DO_ABD
2164 
2165 #define DO_ABA(NAME, TYPE)                                      \
2166 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2167 {                                                               \
2168     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2169     TYPE *d = vd, *n = vn, *m = vm;                             \
2170                                                                 \
2171     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2172         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2173     }                                                           \
2174     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2175 }
2176 
2177 DO_ABA(gvec_saba_b, int8_t)
2178 DO_ABA(gvec_saba_h, int16_t)
2179 DO_ABA(gvec_saba_s, int32_t)
2180 DO_ABA(gvec_saba_d, int64_t)
2181 
2182 DO_ABA(gvec_uaba_b, uint8_t)
2183 DO_ABA(gvec_uaba_h, uint16_t)
2184 DO_ABA(gvec_uaba_s, uint32_t)
2185 DO_ABA(gvec_uaba_d, uint64_t)
2186 
2187 #undef DO_ABA
2188 
2189 #define DO_NEON_PAIRWISE(NAME, OP)                                      \
2190     void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
2191                          void *stat, uint32_t oprsz)                    \
2192     {                                                                   \
2193         float_status *fpst = stat;                                      \
2194         float32 *d = vd;                                                \
2195         float32 *n = vn;                                                \
2196         float32 *m = vm;                                                \
2197         float32 r0, r1;                                                 \
2198                                                                         \
2199         /* Read all inputs before writing outputs in case vm == vd */   \
2200         r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
2201         r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
2202                                                                         \
2203         d[H4(0)] = r0;                                                  \
2204         d[H4(1)] = r1;                                                  \
2205     }                                                                   \
2206                                                                         \
2207     void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
2208                          void *stat, uint32_t oprsz)                    \
2209     {                                                                   \
2210         float_status *fpst = stat;                                      \
2211         float16 *d = vd;                                                \
2212         float16 *n = vn;                                                \
2213         float16 *m = vm;                                                \
2214         float16 r0, r1, r2, r3;                                         \
2215                                                                         \
2216         /* Read all inputs before writing outputs in case vm == vd */   \
2217         r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
2218         r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
2219         r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
2220         r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
2221                                                                         \
2222         d[H2(0)] = r0;                                                  \
2223         d[H2(1)] = r1;                                                  \
2224         d[H2(2)] = r2;                                                  \
2225         d[H2(3)] = r3;                                                  \
2226     }
2227 
2228 DO_NEON_PAIRWISE(neon_padd, add)
2229 DO_NEON_PAIRWISE(neon_pmax, max)
2230 DO_NEON_PAIRWISE(neon_pmin, min)
2231 
2232 #undef DO_NEON_PAIRWISE
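
/*
 * Pairwise result layout for the helpers above: the low half of the
 * destination is formed from adjacent pairs of Vn and the high half
 * from adjacent pairs of Vm, e.g. for the single-precision case
 * d = { op(n0, n1), op(m0, m1) }.
 */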
2233 
2234 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2235     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2236     {                                                                   \
2237         intptr_t i, oprsz = simd_oprsz(desc);                           \
2238         int shift = simd_data(desc);                                    \
2239         TYPE *d = vd, *n = vn;                                          \
2240         float_status *fpst = stat;                                      \
2241         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2242             d[i] = FUNC(n[i], shift, fpst);                             \
2243         }                                                               \
2244         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2245     }
2246 
2247 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2248 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2249 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2250 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2251 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2252 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2253 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2254 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2255 
2256 #undef DO_VCVT_FIXED
2257 
2258 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2259     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2260     {                                                                   \
2261         float_status *fpst = stat;                                      \
2262         intptr_t i, oprsz = simd_oprsz(desc);                           \
2263         uint32_t rmode = simd_data(desc);                               \
2264         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2265         TYPE *d = vd, *n = vn;                                          \
2266         set_float_rounding_mode(rmode, fpst);                           \
2267         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2268             d[i] = FUNC(n[i], 0, fpst);                                 \
2269         }                                                               \
2270         set_float_rounding_mode(prev_rmode, fpst);                      \
2271         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2272     }
2273 
2274 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2275 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2276 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2277 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2278 
2279 #undef DO_VCVT_RMODE
2280 
2281 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2282     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2283     {                                                                   \
2284         float_status *fpst = stat;                                      \
2285         intptr_t i, oprsz = simd_oprsz(desc);                           \
2286         uint32_t rmode = simd_data(desc);                               \
2287         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2288         TYPE *d = vd, *n = vn;                                          \
2289         set_float_rounding_mode(rmode, fpst);                           \
2290         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2291             d[i] = FUNC(n[i], fpst);                                    \
2292         }                                                               \
2293         set_float_rounding_mode(prev_rmode, fpst);                      \
2294         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2295     }
2296 
2297 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2298 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2299 
2300 #undef DO_VRINT_RMODE
2301 
2302 #ifdef TARGET_AARCH64
2303 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2304 {
2305     const uint8_t *indices = vm;
2306     CPUARMState *env = venv;
2307     size_t oprsz = simd_oprsz(desc);
2308     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2309     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2310     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2311     union {
2312         uint8_t b[16];
2313         uint64_t d[2];
2314     } result;
2315 
2316     /*
2317      * We must construct the final result in a temp, lest the output
2318      * overlap the input table.  For TBL, begin with zero; for TBX,
2319      * begin with the original register contents.  Note that we always
2320      * copy 16 bytes here to avoid an extra branch; clearing the high
2321      * bits of the register for oprsz == 8 is handled below.
2322      */
2323     if (is_tbx) {
2324         memcpy(&result, vd, 16);
2325     } else {
2326         memset(&result, 0, 16);
2327     }
2328 
2329     for (size_t i = 0; i < oprsz; ++i) {
2330         uint32_t index = indices[H1(i)];
2331 
2332         if (index < table_len) {
2333             /*
2334              * Convert index (a byte offset into the virtual table
2335              * which is a series of 128-bit vectors concatenated)
2336              * into the correct register element, bearing in mind
2337              * that the table can wrap around from V31 to V0.
2338              */
2339             const uint8_t *table = (const uint8_t *)
2340                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2341             result.b[H1(i)] = table[H1(index % 16)];
2342         }
2343     }
2344 
2345     memcpy(vd, &result, 16);
2346     clear_tail(vd, oprsz, simd_maxsz(desc));
2347 }
2348 #endif
2349 
2350 /*
2351  * NxN -> N highpart multiply
2352  *
2353  * TODO: expose this as a generic vector operation.
2354  */
2355 
2356 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2357 {
2358     intptr_t i, opr_sz = simd_oprsz(desc);
2359     int8_t *d = vd, *n = vn, *m = vm;
2360 
2361     for (i = 0; i < opr_sz; ++i) {
2362         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2363     }
2364     clear_tail(d, opr_sz, simd_maxsz(desc));
2365 }
2366 
2367 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2368 {
2369     intptr_t i, opr_sz = simd_oprsz(desc);
2370     int16_t *d = vd, *n = vn, *m = vm;
2371 
2372     for (i = 0; i < opr_sz / 2; ++i) {
2373         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2374     }
2375     clear_tail(d, opr_sz, simd_maxsz(desc));
2376 }
2377 
2378 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2379 {
2380     intptr_t i, opr_sz = simd_oprsz(desc);
2381     int32_t *d = vd, *n = vn, *m = vm;
2382 
2383     for (i = 0; i < opr_sz / 4; ++i) {
2384         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2385     }
2386     clear_tail(d, opr_sz, simd_maxsz(desc));
2387 }
2388 
2389 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2390 {
2391     intptr_t i, opr_sz = simd_oprsz(desc);
2392     uint64_t *d = vd, *n = vn, *m = vm;
2393     uint64_t discard;
2394 
2395     for (i = 0; i < opr_sz / 8; ++i) {
2396         muls64(&discard, &d[i], n[i], m[i]);
2397     }
2398     clear_tail(d, opr_sz, simd_maxsz(desc));
2399 }
2400 
2401 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2402 {
2403     intptr_t i, opr_sz = simd_oprsz(desc);
2404     uint8_t *d = vd, *n = vn, *m = vm;
2405 
2406     for (i = 0; i < opr_sz; ++i) {
2407         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2408     }
2409     clear_tail(d, opr_sz, simd_maxsz(desc));
2410 }
2411 
2412 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2413 {
2414     intptr_t i, opr_sz = simd_oprsz(desc);
2415     uint16_t *d = vd, *n = vn, *m = vm;
2416 
2417     for (i = 0; i < opr_sz / 2; ++i) {
2418         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2419     }
2420     clear_tail(d, opr_sz, simd_maxsz(desc));
2421 }
2422 
2423 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2424 {
2425     intptr_t i, opr_sz = simd_oprsz(desc);
2426     uint32_t *d = vd, *n = vn, *m = vm;
2427 
2428     for (i = 0; i < opr_sz / 4; ++i) {
2429         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2430     }
2431     clear_tail(d, opr_sz, simd_maxsz(desc));
2432 }
2433 
2434 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2435 {
2436     intptr_t i, opr_sz = simd_oprsz(desc);
2437     uint64_t *d = vd, *n = vn, *m = vm;
2438     uint64_t discard;
2439 
2440     for (i = 0; i < opr_sz / 8; ++i) {
2441         mulu64(&discard, &d[i], n[i], m[i]);
2442     }
2443     clear_tail(d, opr_sz, simd_maxsz(desc));
2444 }
2445 
2446 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2447 {
2448     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2449     int shr = simd_data(desc);
2450     uint64_t *d = vd, *n = vn, *m = vm;
2451 
2452     for (i = 0; i < opr_sz; ++i) {
2453         d[i] = ror64(n[i] ^ m[i], shr);
2454     }
2455     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2456 }
2457 
2458 /*
2459  * Integer matrix-multiply accumulate
2460  */
2461 
2462 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2463 {
2464     int8_t *n = vn, *m = vm;
2465 
2466     for (intptr_t k = 0; k < 8; ++k) {
2467         sum += n[H1(k)] * m[H1(k)];
2468     }
2469     return sum;
2470 }
2471 
2472 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2473 {
2474     uint8_t *n = vn, *m = vm;
2475 
2476     for (intptr_t k = 0; k < 8; ++k) {
2477         sum += n[H1(k)] * m[H1(k)];
2478     }
2479     return sum;
2480 }
2481 
2482 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2483 {
2484     uint8_t *n = vn;
2485     int8_t *m = vm;
2486 
2487     for (intptr_t k = 0; k < 8; ++k) {
2488         sum += n[H1(k)] * m[H1(k)];
2489     }
2490     return sum;
2491 }
2492 
2493 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2494                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2495 {
2496     intptr_t seg, opr_sz = simd_oprsz(desc);
2497 
2498     for (seg = 0; seg < opr_sz; seg += 16) {
2499         uint32_t *d = vd + seg;
2500         uint32_t *a = va + seg;
2501         uint32_t sum0, sum1, sum2, sum3;
2502 
2503         /*
2504          * Process the entire segment at once, writing back the
2505          * results only after we've consumed all of the inputs.
2506          *
2507          * Key to indices by column:
2508          *          i   j                  i             j
2509          */
2510         sum0 = a[H4(0 + 0)];
2511         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2512         sum1 = a[H4(0 + 1)];
2513         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2514         sum2 = a[H4(2 + 0)];
2515         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2516         sum3 = a[H4(2 + 1)];
2517         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2518 
2519         d[H4(0)] = sum0;
2520         d[H4(1)] = sum1;
2521         d[H4(2)] = sum2;
2522         d[H4(3)] = sum3;
2523     }
2524     clear_tail(vd, opr_sz, simd_maxsz(desc));
2525 }
2526 
2527 #define DO_MMLA_B(NAME, INNER) \
2528     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2529     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2530 
2531 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2532 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2533 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
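
/*
 * Layout for the three MMLA helpers above: within each 16-byte
 * segment, n and m each hold a 2x8 matrix of bytes, and d/a a 2x2
 * matrix of 32-bit sums, where sum[i][j] accumulates the dot product
 * of row i of n with row j of m (i.e. m is used transposed), matching
 * the four inner_loop calls in do_mmla_b.
 */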
2534 
2535 /*
2536  * BFloat16 Dot Product
2537  */
2538 
2539 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2540 {
2541     /* FPCR is ignored for BFDOT and BFMMLA. */
2542     float_status bf_status = {
2543         .tininess_before_rounding = float_tininess_before_rounding,
2544         .float_rounding_mode = float_round_to_odd_inf,
2545         .flush_to_zero = true,
2546         .flush_inputs_to_zero = true,
2547         .default_nan_mode = true,
2548     };
2549     float32 t1, t2;
2550 
2551     /*
2552      * Extract each BFloat16 from the element pair, and shift
2553      * each so that it becomes a float32.
2554      */
2555     t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2556     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2557     t1 = float32_add(t1, t2, &bf_status);
2558     t1 = float32_add(sum, t1, &bf_status);
2559 
2560     return t1;
2561 }
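
/*
 * The shifts above exploit bfloat16 being the high 16 bits of a
 * float32 with the same sign and exponent and a truncated mantissa:
 * e1 << 16 turns the even (low) half of the pair into a float32, and
 * e1 & 0xffff0000u does the same for the odd (high) half with no
 * shift required.  A single conversion, as a minimal sketch with a
 * hypothetical name:
 */
static float32 example_bf16_to_f32(uint16_t bf)
{
    return (float32)bf << 16;   /* same bits, low mantissa zero-filled */
}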
2562 
2563 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2564 {
2565     intptr_t i, opr_sz = simd_oprsz(desc);
2566     float32 *d = vd, *a = va;
2567     uint32_t *n = vn, *m = vm;
2568 
2569     for (i = 0; i < opr_sz / 4; ++i) {
2570         d[i] = bfdotadd(a[i], n[i], m[i]);
2571     }
2572     clear_tail(d, opr_sz, simd_maxsz(desc));
2573 }
2574 
2575 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2576                             void *va, uint32_t desc)
2577 {
2578     intptr_t i, j, opr_sz = simd_oprsz(desc);
2579     intptr_t index = simd_data(desc);
2580     intptr_t elements = opr_sz / 4;
2581     intptr_t eltspersegment = MIN(16 / 4, elements);
2582     float32 *d = vd, *a = va;
2583     uint32_t *n = vn, *m = vm;
2584 
2585     for (i = 0; i < elements; i += eltspersegment) {
2586         uint32_t m_idx = m[i + H4(index)];
2587 
2588         for (j = i; j < i + eltspersegment; j++) {
2589             d[j] = bfdotadd(a[j], n[j], m_idx);
2590         }
2591     }
2592     clear_tail(d, opr_sz, simd_maxsz(desc));
2593 }
2594 
2595 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2596 {
2597     intptr_t s, opr_sz = simd_oprsz(desc);
2598     float32 *d = vd, *a = va;
2599     uint32_t *n = vn, *m = vm;
2600 
2601     for (s = 0; s < opr_sz / 4; s += 4) {
2602         float32 sum00, sum01, sum10, sum11;
2603 
2604         /*
2605          * Process the entire segment at once, writing back the
2606          * results only after we've consumed all of the inputs.
2607          *
2608          * Key to indices by column:
2609          *               i   j           i   k             j   k
2610          */
2611         sum00 = a[s + H4(0 + 0)];
2612         sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2613         sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2614 
2615         sum01 = a[s + H4(0 + 1)];
2616         sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2617         sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2618 
2619         sum10 = a[s + H4(2 + 0)];
2620         sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2621         sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2622 
2623         sum11 = a[s + H4(2 + 1)];
2624         sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2625         sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2626 
2627         d[s + H4(0 + 0)] = sum00;
2628         d[s + H4(0 + 1)] = sum01;
2629         d[s + H4(2 + 0)] = sum10;
2630         d[s + H4(2 + 1)] = sum11;
2631     }
2632     clear_tail(d, opr_sz, simd_maxsz(desc));
2633 }
2634 
2635 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2636                          void *stat, uint32_t desc)
2637 {
2638     intptr_t i, opr_sz = simd_oprsz(desc);
2639     intptr_t sel = simd_data(desc);
2640     float32 *d = vd, *a = va;
2641     bfloat16 *n = vn, *m = vm;
2642 
2643     for (i = 0; i < opr_sz / 4; ++i) {
2644         float32 nn = n[H2(i * 2 + sel)] << 16;
2645         float32 mm = m[H2(i * 2 + sel)] << 16;
2646         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2647     }
2648     clear_tail(d, opr_sz, simd_maxsz(desc));
2649 }
2650 
2651 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2652                              void *va, void *stat, uint32_t desc)
2653 {
2654     intptr_t i, j, opr_sz = simd_oprsz(desc);
2655     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2656     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2657     intptr_t elements = opr_sz / 4;
2658     intptr_t eltspersegment = MIN(16 / 4, elements);
2659     float32 *d = vd, *a = va;
2660     bfloat16 *n = vn, *m = vm;
2661 
2662     for (i = 0; i < elements; i += eltspersegment) {
2663         float32 m_idx = m[H2(2 * i + index)] << 16;
2664 
2665         for (j = i; j < i + eltspersegment; j++) {
2666             float32 n_j = n[H2(2 * j + sel)] << 16;
2667             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2668         }
2669     }
2670     clear_tail(d, opr_sz, simd_maxsz(desc));
2671 }
2672 
2673 #define DO_CLAMP(NAME, TYPE) \
2674 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
2675 {                                                                       \
2676     intptr_t i, opr_sz = simd_oprsz(desc);                              \
2677     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
2678         TYPE aa = *(TYPE *)(a + i);                                     \
2679         TYPE nn = *(TYPE *)(n + i);                                     \
2680         TYPE mm = *(TYPE *)(m + i);                                     \
2681         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
2682         *(TYPE *)(d + i) = dd;                                          \
2683     }                                                                   \
2684     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
2685 }
2686 
2687 DO_CLAMP(gvec_sclamp_b, int8_t)
2688 DO_CLAMP(gvec_sclamp_h, int16_t)
2689 DO_CLAMP(gvec_sclamp_s, int32_t)
2690 DO_CLAMP(gvec_sclamp_d, int64_t)
2691 
2692 DO_CLAMP(gvec_uclamp_b, uint8_t)
2693 DO_CLAMP(gvec_uclamp_h, uint16_t)
2694 DO_CLAMP(gvec_uclamp_s, uint32_t)
2695 DO_CLAMP(gvec_uclamp_d, uint64_t)
2696