xref: /openbmc/qemu/target/arm/tcg/vec_helper.c (revision 05caa062)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify: every term is even, so halve each and shift one less:
163      *   ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
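    /*
     * Worked example, with round = true: src1 = src2 = 0x40 (64), src3 = 0:
     * 64 * 64 = 4096; 4096 + (0 << 7) + (1 << 6) = 4160; 4160 >> 7 = 32,
     * which fits in int8_t, so no saturation is required.
     */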
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
178 
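/*
 * SVE2 vector forms of SQRDMLAH/SQRDMLSH/SQDMULH/SQRDMULH for byte
 * elements: apply the scalar routine above to each of the opr_sz bytes.
 */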
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
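/*
 * Scalar (CPUARMState) forms: each 32-bit argument holds two packed
 * 16-bit elements; saturation is accumulated in the sticky QC flag
 * via env->vfp.qc[0].
 */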
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
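/*
 * Generic-vector forms: vq points at the QC flag word used to record
 * saturation; clear_tail() zeroes the bytes between opr_sz and maxsz.
 */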
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
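/*
 * Multiply-by-element (indexed) forms: the index selects one element
 * within each 128-bit segment of vm.  eltspersegment caps the inner
 * loop so that a 64-bit AdvSIMD operation (opr_sz == 8) does not run
 * past the end of the vector.
 */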
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
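/*
 * SVE2 indexed forms: the vector length is always a multiple of 16
 * bytes, so every 128-bit segment is processed in full and no
 * eltspersegment clamp is required.
 */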
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
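/*
 * Saturate a 128-bit value to the int64_t range: it fits exactly when
 * the high 64 bits equal the sign-extension of the low 64 bits.
 */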
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8 and 16-bit dot-product.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
833 
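/*
 * Indexed dot product: HD(index) * 4 selects the group of four TYPEM
 * multiplier elements named by the index; the outer loop re-reads that
 * group at the start of each segment and the inner loop applies it to
 * the TYPED result elements of the segment.  The first segment may be
 * shorter than 16 bytes for an 8-byte AdvSIMD operation.
 */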
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
840     intptr_t index = simd_data(desc);                                     \
841     TYPED *d = vd, *a = va;                                               \
842     TYPEN *n = vn;                                                        \
843     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
844     do {                                                                  \
845         TYPED m0 = m_indexed[i * 4 + 0];                                  \
846         TYPED m1 = m_indexed[i * 4 + 1];                                  \
847         TYPED m2 = m_indexed[i * 4 + 2];                                  \
848         TYPED m3 = m_indexed[i * 4 + 3];                                  \
849         do {                                                              \
850             d[i] = (a[i] +                                                \
851                     n[i * 4 + 0] * m0 +                                   \
852                     n[i * 4 + 1] * m1 +                                   \
853                     n[i * 4 + 2] * m2 +                                   \
854                     n[i * 4 + 3] * m3);                                   \
855         } while (++i < segend);                                           \
856         segend = i + 4;                                                   \
857     } while (i < opr_sz_n);                                               \
858     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
859 }
860 
861 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
862 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
863 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
864 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
865 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
866 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
867 
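/*
 * FCADD: floating-point complex add with rotate.  Elements are
 * (real, imag) pairs; the single desc data bit selects the rotation,
 * i.e. whether the real or the imaginary element of the second
 * operand is negated before the add.
 */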
868 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
869                          void *vfpst, uint32_t desc)
870 {
871     uintptr_t opr_sz = simd_oprsz(desc);
872     float16 *d = vd;
873     float16 *n = vn;
874     float16 *m = vm;
875     float_status *fpst = vfpst;
876     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
877     uint32_t neg_imag = neg_real ^ 1;
878     uintptr_t i;
879 
880     /* Shift boolean to the sign bit so we can xor to negate.  */
881     neg_real <<= 15;
882     neg_imag <<= 15;
883 
884     for (i = 0; i < opr_sz / 2; i += 2) {
885         float16 e0 = n[H2(i)];
886         float16 e1 = m[H2(i + 1)] ^ neg_imag;
887         float16 e2 = n[H2(i + 1)];
888         float16 e3 = m[H2(i)] ^ neg_real;
889 
890         d[H2(i)] = float16_add(e0, e1, fpst);
891         d[H2(i + 1)] = float16_add(e2, e3, fpst);
892     }
893     clear_tail(d, opr_sz, simd_maxsz(desc));
894 }
895 
896 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
897                          void *vfpst, uint32_t desc)
898 {
899     uintptr_t opr_sz = simd_oprsz(desc);
900     float32 *d = vd;
901     float32 *n = vn;
902     float32 *m = vm;
903     float_status *fpst = vfpst;
904     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
905     uint32_t neg_imag = neg_real ^ 1;
906     uintptr_t i;
907 
908     /* Shift boolean to the sign bit so we can xor to negate.  */
909     neg_real <<= 31;
910     neg_imag <<= 31;
911 
912     for (i = 0; i < opr_sz / 4; i += 2) {
913         float32 e0 = n[H4(i)];
914         float32 e1 = m[H4(i + 1)] ^ neg_imag;
915         float32 e2 = n[H4(i + 1)];
916         float32 e3 = m[H4(i)] ^ neg_real;
917 
918         d[H4(i)] = float32_add(e0, e1, fpst);
919         d[H4(i + 1)] = float32_add(e2, e3, fpst);
920     }
921     clear_tail(d, opr_sz, simd_maxsz(desc));
922 }
923 
924 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
925                          void *vfpst, uint32_t desc)
926 {
927     uintptr_t opr_sz = simd_oprsz(desc);
928     float64 *d = vd;
929     float64 *n = vn;
930     float64 *m = vm;
931     float_status *fpst = vfpst;
932     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
933     uint64_t neg_imag = neg_real ^ 1;
934     uintptr_t i;
935 
936     /* Shift boolean to the sign bit so we can xor to negate.  */
937     neg_real <<= 63;
938     neg_imag <<= 63;
939 
940     for (i = 0; i < opr_sz / 8; i += 2) {
941         float64 e0 = n[i];
942         float64 e1 = m[i + 1] ^ neg_imag;
943         float64 e2 = n[i + 1];
944         float64 e3 = m[i] ^ neg_real;
945 
946         d[i] = float64_add(e0, e1, fpst);
947         d[i + 1] = float64_add(e2, e3, fpst);
948     }
949     clear_tail(d, opr_sz, simd_maxsz(desc));
950 }
951 
952 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
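/*
 * FCMLA: floating-point complex multiply-accumulate with rotate.
 * The two desc data bits encode the rotation as a "flip" of the
 * real/imaginary source selection plus negation of either product;
 * each product is folded into the accumulator with a fused
 * multiply-add.
 */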
953                          void *vfpst, uint32_t desc)
954 {
955     uintptr_t opr_sz = simd_oprsz(desc);
956     float16 *d = vd, *n = vn, *m = vm, *a = va;
957     float_status *fpst = vfpst;
958     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
959     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
960     uint32_t neg_real = flip ^ neg_imag;
961     uintptr_t i;
962 
963     /* Shift boolean to the sign bit so we can xor to negate.  */
964     neg_real <<= 15;
965     neg_imag <<= 15;
966 
967     for (i = 0; i < opr_sz / 2; i += 2) {
968         float16 e2 = n[H2(i + flip)];
969         float16 e1 = m[H2(i + flip)] ^ neg_real;
970         float16 e4 = e2;
971         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
972 
973         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
974         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
975     }
976     clear_tail(d, opr_sz, simd_maxsz(desc));
977 }
978 
979 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
980                              void *vfpst, uint32_t desc)
981 {
982     uintptr_t opr_sz = simd_oprsz(desc);
983     float16 *d = vd, *n = vn, *m = vm, *a = va;
984     float_status *fpst = vfpst;
985     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
986     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
987     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
988     uint32_t neg_real = flip ^ neg_imag;
989     intptr_t elements = opr_sz / sizeof(float16);
990     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
991     intptr_t i, j;
992 
993     /* Shift boolean to the sign bit so we can xor to negate.  */
994     neg_real <<= 15;
995     neg_imag <<= 15;
996 
997     for (i = 0; i < elements; i += eltspersegment) {
998         float16 mr = m[H2(i + 2 * index + 0)];
999         float16 mi = m[H2(i + 2 * index + 1)];
1000         float16 e1 = neg_real ^ (flip ? mi : mr);
1001         float16 e3 = neg_imag ^ (flip ? mr : mi);
1002 
1003         for (j = i; j < i + eltspersegment; j += 2) {
1004             float16 e2 = n[H2(j + flip)];
1005             float16 e4 = e2;
1006 
1007             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
1008             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
1009         }
1010     }
1011     clear_tail(d, opr_sz, simd_maxsz(desc));
1012 }
1013 
1014 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1015                          void *vfpst, uint32_t desc)
1016 {
1017     uintptr_t opr_sz = simd_oprsz(desc);
1018     float32 *d = vd, *n = vn, *m = vm, *a = va;
1019     float_status *fpst = vfpst;
1020     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1021     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1022     uint32_t neg_real = flip ^ neg_imag;
1023     uintptr_t i;
1024 
1025     /* Shift boolean to the sign bit so we can xor to negate.  */
1026     neg_real <<= 31;
1027     neg_imag <<= 31;
1028 
1029     for (i = 0; i < opr_sz / 4; i += 2) {
1030         float32 e2 = n[H4(i + flip)];
1031         float32 e1 = m[H4(i + flip)] ^ neg_real;
1032         float32 e4 = e2;
1033         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
1034 
1035         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
1036         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
1037     }
1038     clear_tail(d, opr_sz, simd_maxsz(desc));
1039 }
1040 
1041 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1042                              void *vfpst, uint32_t desc)
1043 {
1044     uintptr_t opr_sz = simd_oprsz(desc);
1045     float32 *d = vd, *n = vn, *m = vm, *a = va;
1046     float_status *fpst = vfpst;
1047     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1048     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1049     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1050     uint32_t neg_real = flip ^ neg_imag;
1051     intptr_t elements = opr_sz / sizeof(float32);
1052     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1053     intptr_t i, j;
1054 
1055     /* Shift boolean to the sign bit so we can xor to negate.  */
1056     neg_real <<= 31;
1057     neg_imag <<= 31;
1058 
1059     for (i = 0; i < elements; i += eltspersegment) {
1060         float32 mr = m[H4(i + 2 * index + 0)];
1061         float32 mi = m[H4(i + 2 * index + 1)];
1062         float32 e1 = neg_real ^ (flip ? mi : mr);
1063         float32 e3 = neg_imag ^ (flip ? mr : mi);
1064 
1065         for (j = i; j < i + eltspersegment; j += 2) {
1066             float32 e2 = n[H4(j + flip)];
1067             float32 e4 = e2;
1068 
1069             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
1070             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
1071         }
1072     }
1073     clear_tail(d, opr_sz, simd_maxsz(desc));
1074 }
1075 
1076 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1077                          void *vfpst, uint32_t desc)
1078 {
1079     uintptr_t opr_sz = simd_oprsz(desc);
1080     float64 *d = vd, *n = vn, *m = vm, *a = va;
1081     float_status *fpst = vfpst;
1082     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1083     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1084     uint64_t neg_real = flip ^ neg_imag;
1085     uintptr_t i;
1086 
1087     /* Shift boolean to the sign bit so we can xor to negate.  */
1088     neg_real <<= 63;
1089     neg_imag <<= 63;
1090 
1091     for (i = 0; i < opr_sz / 8; i += 2) {
1092         float64 e2 = n[i + flip];
1093         float64 e1 = m[i + flip] ^ neg_real;
1094         float64 e4 = e2;
1095         float64 e3 = m[i + 1 - flip] ^ neg_imag;
1096 
1097         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
1098         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
1099     }
1100     clear_tail(d, opr_sz, simd_maxsz(desc));
1101 }
1102 
1103 /*
1104  * Floating point comparisons producing an integer result (all 1s or all 0s).
1105  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1106  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1107  */
1108 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1109 {
1110     return -float16_eq_quiet(op1, op2, stat);
1111 }
1112 
1113 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1114 {
1115     return -float32_eq_quiet(op1, op2, stat);
1116 }
1117 
1118 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1119 {
1120     return -float64_eq_quiet(op1, op2, stat);
1121 }
1122 
1123 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1124 {
1125     return -float16_le(op2, op1, stat);
1126 }
1127 
1128 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1129 {
1130     return -float32_le(op2, op1, stat);
1131 }
1132 
1133 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1134 {
1135     return -float64_le(op2, op1, stat);
1136 }
1137 
1138 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1139 {
1140     return -float16_lt(op2, op1, stat);
1141 }
1142 
1143 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1144 {
1145     return -float32_lt(op2, op1, stat);
1146 }
1147 
1148 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1149 {
1150     return -float64_lt(op2, op1, stat);
1151 }
1152 
1153 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1154 {
1155     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1156 }
1157 
1158 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1159 {
1160     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1161 }
1162 
1163 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1164 {
1165     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1166 }
1167 
1168 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1169 {
1170     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1171 }
1172 
1173 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1174 {
1175     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1176 }
1177 
1178 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1179 {
1180     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1181 }
1182 
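/*
 * Convert-to-integer wrappers for the 2-op helpers below: the Arm
 * pseudocode requires a NaN input to convert to 0 (raising Invalid),
 * so check for NaN before using the softfloat routine.
 */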
1183 static int16_t vfp_tosszh(float16 x, void *fpstp)
1184 {
1185     float_status *fpst = fpstp;
1186     if (float16_is_any_nan(x)) {
1187         float_raise(float_flag_invalid, fpst);
1188         return 0;
1189     }
1190     return float16_to_int16_round_to_zero(x, fpst);
1191 }
1192 
1193 static uint16_t vfp_touszh(float16 x, void *fpstp)
1194 {
1195     float_status *fpst = fpstp;
1196     if (float16_is_any_nan(x)) {
1197         float_raise(float_flag_invalid, fpst);
1198         return 0;
1199     }
1200     return float16_to_uint16_round_to_zero(x, fpst);
1201 }
1202 
1203 #define DO_2OP(NAME, FUNC, TYPE) \
1204 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1205 {                                                                 \
1206     intptr_t i, oprsz = simd_oprsz(desc);                         \
1207     TYPE *d = vd, *n = vn;                                        \
1208     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1209         d[i] = FUNC(n[i], stat);                                  \
1210     }                                                             \
1211     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1212 }
1213 
1214 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1215 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1216 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1217 
1218 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1219 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1220 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1221 
1222 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1223 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1224 
1225 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1226 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1227 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1228 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1229 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1230 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1231 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1232 DO_2OP(gvec_touszh, vfp_touszh, float16)
1233 
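/*
 * Compare-against-zero wrappers: FWD compares the operand against
 * zero, REV compares zero against the operand, so that e.g. "clt" can
 * reuse the "cgt" primitive with the arguments swapped.
 */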
1234 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1235     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1236     {                                                           \
1237         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1238     }
1239 
1240 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1241     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1242     {                                                           \
1243         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1244     }
1245 
1246 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1247     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1248     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1249     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1250     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1251 
1252 DO_2OP_CMP0(cgt, cgt, FWD)
1253 DO_2OP_CMP0(cge, cge, FWD)
1254 DO_2OP_CMP0(ceq, ceq, FWD)
1255 DO_2OP_CMP0(clt, cgt, REV)
1256 DO_2OP_CMP0(cle, cge, REV)
1257 
1258 #undef DO_2OP
1259 #undef DO_2OP_CMP0
1260 
1261 /* Floating-point trigonometric starting value.
1262  * See the ARM ARM pseudocode function FPTrigSMul.
1263  */
1264 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1265 {
1266     float16 result = float16_mul(op1, op1, stat);
1267     if (!float16_is_any_nan(result)) {
1268         result = float16_set_sign(result, op2 & 1);
1269     }
1270     return result;
1271 }
1272 
1273 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1274 {
1275     float32 result = float32_mul(op1, op1, stat);
1276     if (!float32_is_any_nan(result)) {
1277         result = float32_set_sign(result, op2 & 1);
1278     }
1279     return result;
1280 }
1281 
1282 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1283 {
1284     float64 result = float64_mul(op1, op1, stat);
1285     if (!float64_is_any_nan(result)) {
1286         result = float64_set_sign(result, op2 & 1);
1287     }
1288     return result;
1289 }
1290 
1291 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1292 {
1293     return float16_abs(float16_sub(op1, op2, stat));
1294 }
1295 
1296 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1297 {
1298     return float32_abs(float32_sub(op1, op2, stat));
1299 }
1300 
1301 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1302 {
1303     return float64_abs(float64_sub(op1, op2, stat));
1304 }
1305 
1306 /*
1307  * Reciprocal step. These are the AArch32 versions, which use a
1308  * non-fused multiply-and-subtract.
1309  */
1310 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1311 {
1312     op1 = float16_squash_input_denormal(op1, stat);
1313     op2 = float16_squash_input_denormal(op2, stat);
1314 
1315     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1316         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1317         return float16_two;
1318     }
1319     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1320 }
1321 
1322 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1323 {
1324     op1 = float32_squash_input_denormal(op1, stat);
1325     op2 = float32_squash_input_denormal(op2, stat);
1326 
1327     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1328         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1329         return float32_two;
1330     }
1331     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1332 }
1333 
1334 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1335 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1336 {
1337     op1 = float16_squash_input_denormal(op1, stat);
1338     op2 = float16_squash_input_denormal(op2, stat);
1339 
1340     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1341         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1342         return float16_one_point_five;
1343     }
1344     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1345     return float16_div(op1, float16_two, stat);
1346 }
1347 
1348 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1349 {
1350     op1 = float32_squash_input_denormal(op1, stat);
1351     op2 = float32_squash_input_denormal(op2, stat);
1352 
1353     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1354         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1355         return float32_one_point_five;
1356     }
1357     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1358     return float32_div(op1, float32_two, stat);
1359 }
1360 
1361 #define DO_3OP(NAME, FUNC, TYPE) \
1362 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1363 {                                                                          \
1364     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1365     TYPE *d = vd, *n = vn, *m = vm;                                        \
1366     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1367         d[i] = FUNC(n[i], m[i], stat);                                     \
1368     }                                                                      \
1369     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1370 }
1371 
1372 DO_3OP(gvec_fadd_h, float16_add, float16)
1373 DO_3OP(gvec_fadd_s, float32_add, float32)
1374 DO_3OP(gvec_fadd_d, float64_add, float64)
1375 
1376 DO_3OP(gvec_fsub_h, float16_sub, float16)
1377 DO_3OP(gvec_fsub_s, float32_sub, float32)
1378 DO_3OP(gvec_fsub_d, float64_sub, float64)
1379 
1380 DO_3OP(gvec_fmul_h, float16_mul, float16)
1381 DO_3OP(gvec_fmul_s, float32_mul, float32)
1382 DO_3OP(gvec_fmul_d, float64_mul, float64)
1383 
1384 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1385 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1386 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1387 
1388 DO_3OP(gvec_fabd_h, float16_abd, float16)
1389 DO_3OP(gvec_fabd_s, float32_abd, float32)
1390 DO_3OP(gvec_fabd_d, float64_abd, float64)
1391 
1392 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1393 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1394 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1395 
1396 DO_3OP(gvec_fcge_h, float16_cge, float16)
1397 DO_3OP(gvec_fcge_s, float32_cge, float32)
1398 DO_3OP(gvec_fcge_d, float64_cge, float64)
1399 
1400 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1401 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1402 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1403 
1404 DO_3OP(gvec_facge_h, float16_acge, float16)
1405 DO_3OP(gvec_facge_s, float32_acge, float32)
1406 DO_3OP(gvec_facge_d, float64_acge, float64)
1407 
1408 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1409 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1410 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1411 
1412 DO_3OP(gvec_fmax_h, float16_max, float16)
1413 DO_3OP(gvec_fmax_s, float32_max, float32)
1414 DO_3OP(gvec_fmax_d, float64_max, float64)
1415 
1416 DO_3OP(gvec_fmin_h, float16_min, float16)
1417 DO_3OP(gvec_fmin_s, float32_min, float32)
1418 DO_3OP(gvec_fmin_d, float64_min, float64)
1419 
1420 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1421 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1422 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1423 
1424 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1425 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1426 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1427 
1428 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1429 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1430 
1431 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1432 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1433 
1434 #ifdef TARGET_AARCH64
1435 DO_3OP(gvec_fdiv_h, float16_div, float16)
1436 DO_3OP(gvec_fdiv_s, float32_div, float32)
1437 DO_3OP(gvec_fdiv_d, float64_div, float64)
1438 
1439 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1440 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1441 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1442 
1443 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1444 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1445 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1446 
1447 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1448 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1449 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1450 
1451 #endif
1452 #undef DO_3OP
1453 
1454 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1455 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1456                                  float_status *stat)
1457 {
1458     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1459 }
1460 
1461 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1462                                  float_status *stat)
1463 {
1464     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1465 }
1466 
1467 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1468                                  float_status *stat)
1469 {
1470     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1471 }
1472 
1473 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1474                                  float_status *stat)
1475 {
1476     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1477 }
1478 
1479 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1480 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1481                                 float_status *stat)
1482 {
1483     return float16_muladd(op1, op2, dest, 0, stat);
1484 }
1485 
1486 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1487                                  float_status *stat)
1488 {
1489     return float32_muladd(op1, op2, dest, 0, stat);
1490 }
1491 
1492 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1493                                  float_status *stat)
1494 {
1495     return float64_muladd(op1, op2, dest, 0, stat);
1496 }
1497 
1498 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1499                                  float_status *stat)
1500 {
1501     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1502 }
1503 
1504 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1505                                  float_status *stat)
1506 {
1507     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1508 }
1509 
1510 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1511                                  float_status *stat)
1512 {
1513     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1514 }
1515 
1516 #define DO_MULADD(NAME, FUNC, TYPE)                                     \
1517 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1518 {                                                                          \
1519     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1520     TYPE *d = vd, *n = vn, *m = vm;                                        \
1521     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1522         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1523     }                                                                      \
1524     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1525 }
1526 
1527 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1528 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1529 
1530 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1531 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1532 
1533 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1534 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1535 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1536 
1537 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1538 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1539 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1540 
1541 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1542  * For AdvSIMD, there is of course only one such vector segment.
1543  */
1544 
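/*
 * For example, with 32-bit elements and oprsz == 32, the segment length
 * below is 4, so the loop handles two 128-bit segments, each taking its
 * multiplicand from m[idx] within that same segment.
 */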
1545 #define DO_MUL_IDX(NAME, TYPE, H) \
1546 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1547 {                                                                          \
1548     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1549     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1550     intptr_t idx = simd_data(desc);                                        \
1551     TYPE *d = vd, *n = vn, *m = vm;                                        \
1552     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1553         TYPE mm = m[H(i + idx)];                                           \
1554         for (j = 0; j < segment; j++) {                                    \
1555             d[i + j] = n[i + j] * mm;                                      \
1556         }                                                                  \
1557     }                                                                      \
1558     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1559 }
1560 
1561 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1562 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1563 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1564 
1565 #undef DO_MUL_IDX
1566 
1567 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1568 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1569 {                                                                          \
1570     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1571     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1572     intptr_t idx = simd_data(desc);                                        \
1573     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1574     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1575         TYPE mm = m[H(i + idx)];                                           \
1576         for (j = 0; j < segment; j++) {                                    \
1577             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1578         }                                                                  \
1579     }                                                                      \
1580     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1581 }
1582 
1583 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1584 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1585 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1586 
1587 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1588 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1589 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1590 
1591 #undef DO_MLA_IDX
1592 
1593 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1594 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1595 {                                                                          \
1596     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1597     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1598     intptr_t idx = simd_data(desc);                                        \
1599     TYPE *d = vd, *n = vn, *m = vm;                                        \
1600     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1601         TYPE mm = m[H(i + idx)];                                           \
1602         for (j = 0; j < segment; j++) {                                    \
1603             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1604         }                                                                  \
1605     }                                                                      \
1606     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1607 }
1608 
1609 #define nop(N, M, S) (M)
1610 
1611 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1612 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1613 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1614 
1615 #ifdef TARGET_AARCH64
1616 
1617 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1618 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1619 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1620 
1621 #endif
1622 
1623 #undef nop
1624 
1625 /*
1626  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1627  * the fused ops below, these accumulate both from and into Vd.
1628  */
1629 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1630 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1631 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1632 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1633 
1634 #undef DO_FMUL_IDX
1635 
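/*
 * Bit 0 of the descriptor data selects negation of op1 (FMLS); the
 * remaining bits give the element index.  Negation is applied by
 * XORing the sign bit into each op1 element before the fused multiply.
 */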
1636 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1637 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1638                   void *stat, uint32_t desc)                               \
1639 {                                                                          \
1640     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1641     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1642     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1643     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1644     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1645     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1646     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1647         TYPE mm = m[H(i + idx)];                                           \
1648         for (j = 0; j < segment; j++) {                                    \
1649             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1650                                      mm, a[i + j], 0, stat);               \
1651         }                                                                  \
1652     }                                                                      \
1653     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1654 }
1655 
1656 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1657 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1658 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1659 
1660 #undef DO_FMLA_IDX
1661 
1662 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1663 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1664 {                                                                          \
1665     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1666     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1667     bool q = false;                                                        \
1668     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1669         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1670         if (dd < MIN) {                                                    \
1671             dd = MIN;                                                      \
1672             q = true;                                                      \
1673         } else if (dd > MAX) {                                             \
1674             dd = MAX;                                                      \
1675             q = true;                                                      \
1676         }                                                                  \
1677         d[i] = dd;                                                         \
1678     }                                                                      \
1679     if (q) {                                                               \
1680         uint32_t *qc = vq;                                                 \
1681         qc[0] = 1;                                                         \
1682     }                                                                      \
1683     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1684 }
1685 
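/*
 * WTYPE must be wide enough to hold any unsaturated result: plain int
 * suffices for the 8- and 16-bit cases below, int64_t for the 32-bit ones.
 */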
1686 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1687 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1688 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1689 
1690 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1691 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1692 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1693 
1694 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1695 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1696 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1697 
1698 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1699 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1700 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1701 
1702 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1703 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1704 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1705 
1706 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1707 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1708 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1709 
1710 #undef DO_SAT
1711 
1712 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1713                           void *vm, uint32_t desc)
1714 {
1715     intptr_t i, oprsz = simd_oprsz(desc);
1716     uint64_t *d = vd, *n = vn, *m = vm;
1717     bool q = false;
1718 
1719     for (i = 0; i < oprsz / 8; i++) {
1720         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1721         if (dd < nn) {
1722             dd = UINT64_MAX;
1723             q = true;
1724         }
1725         d[i] = dd;
1726     }
1727     if (q) {
1728         uint32_t *qc = vq;
1729         qc[0] = 1;
1730     }
1731     clear_tail(d, oprsz, simd_maxsz(desc));
1732 }
1733 
1734 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1735                           void *vm, uint32_t desc)
1736 {
1737     intptr_t i, oprsz = simd_oprsz(desc);
1738     uint64_t *d = vd, *n = vn, *m = vm;
1739     bool q = false;
1740 
1741     for (i = 0; i < oprsz / 8; i++) {
1742         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1743         if (nn < mm) {
1744             dd = 0;
1745             q = true;
1746         }
1747         d[i] = dd;
1748     }
1749     if (q) {
1750         uint32_t *qc = vq;
1751         qc[0] = 1;
1752     }
1753     clear_tail(d, oprsz, simd_maxsz(desc));
1754 }
1755 
1756 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1757                           void *vm, uint32_t desc)
1758 {
1759     intptr_t i, oprsz = simd_oprsz(desc);
1760     int64_t *d = vd, *n = vn, *m = vm;
1761     bool q = false;
1762 
1763     for (i = 0; i < oprsz / 8; i++) {
1764         int64_t nn = n[i], mm = m[i], dd = nn + mm;
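        /*
         * Signed overflow occurred iff the addends have the same sign and
         * the sum's sign differs from it; saturate toward the sign of nn:
         * INT64_MAX if nn is non-negative, INT64_MIN otherwise.
         */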
1765         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1766             dd = (nn >> 63) ^ ~INT64_MIN;
1767             q = true;
1768         }
1769         d[i] = dd;
1770     }
1771     if (q) {
1772         uint32_t *qc = vq;
1773         qc[0] = 1;
1774     }
1775     clear_tail(d, oprsz, simd_maxsz(desc));
1776 }
1777 
1778 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1779                           void *vm, uint32_t desc)
1780 {
1781     intptr_t i, oprsz = simd_oprsz(desc);
1782     int64_t *d = vd, *n = vn, *m = vm;
1783     bool q = false;
1784 
1785     for (i = 0; i < oprsz / 8; i++) {
1786         int64_t nn = n[i], mm = m[i], dd = nn - mm;
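        /*
         * Signed overflow occurred iff the operands have differing signs
         * and the result's sign differs from nn; saturate toward the sign
         * of nn, as for addition above.
         */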
1787         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1788             dd = (nn >> 63) ^ ~INT64_MIN;
1789             q = true;
1790         }
1791         d[i] = dd;
1792     }
1793     if (q) {
1794         uint32_t *qc = vq;
1795         qc[0] = 1;
1796     }
1797     clear_tail(d, oprsz, simd_maxsz(desc));
1798 }
1799 
1800 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1801                            void *vm, uint32_t desc)
1802 {
1803     intptr_t i, oprsz = simd_oprsz(desc);
1804     uint64_t *d = vd, *n = vn, *m = vm;
1805     bool q = false;
1806 
1807     for (i = 0; i < oprsz / 8; i++) {
1808         uint64_t nn = n[i];
1809         int64_t mm = m[i];
1810         uint64_t dd = nn + mm;
1811 
1812         if (mm < 0) {
1813             if (nn < (uint64_t)-mm) {
1814                 dd = 0;
1815                 q = true;
1816             }
1817         } else {
1818             if (dd < nn) {
1819                 dd = UINT64_MAX;
1820                 q = true;
1821             }
1822         }
1823         d[i] = dd;
1824     }
1825     if (q) {
1826         uint32_t *qc = vq;
1827         qc[0] = 1;
1828     }
1829     clear_tail(d, oprsz, simd_maxsz(desc));
1830 }
1831 
1832 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1833                            void *vm, uint32_t desc)
1834 {
1835     intptr_t i, oprsz = simd_oprsz(desc);
1836     uint64_t *d = vd, *n = vn, *m = vm;
1837     bool q = false;
1838 
1839     for (i = 0; i < oprsz / 8; i++) {
1840         int64_t nn = n[i];
1841         uint64_t mm = m[i];
1842         int64_t dd = nn + mm;
1843 
1844         if (mm > (uint64_t)(INT64_MAX - nn)) {
1845             dd = INT64_MAX;
1846             q = true;
1847         }
1848         d[i] = dd;
1849     }
1850     if (q) {
1851         uint32_t *qc = vq;
1852         qc[0] = 1;
1853     }
1854     clear_tail(d, oprsz, simd_maxsz(desc));
1855 }
1856 
1857 #define DO_SRA(NAME, TYPE)                              \
1858 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1859 {                                                       \
1860     intptr_t i, oprsz = simd_oprsz(desc);               \
1861     int shift = simd_data(desc);                        \
1862     TYPE *d = vd, *n = vn;                              \
1863     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1864         d[i] += n[i] >> shift;                          \
1865     }                                                   \
1866     clear_tail(d, oprsz, simd_maxsz(desc));             \
1867 }
1868 
1869 DO_SRA(gvec_ssra_b, int8_t)
1870 DO_SRA(gvec_ssra_h, int16_t)
1871 DO_SRA(gvec_ssra_s, int32_t)
1872 DO_SRA(gvec_ssra_d, int64_t)
1873 
1874 DO_SRA(gvec_usra_b, uint8_t)
1875 DO_SRA(gvec_usra_h, uint16_t)
1876 DO_SRA(gvec_usra_s, uint32_t)
1877 DO_SRA(gvec_usra_d, uint64_t)
1878 
1879 #undef DO_SRA
1880 
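/*
 * Rounding shift right.  The two-step shift avoids needing a wider
 * intermediate for the rounding addend: e.g. with shift == 2 and n == 7,
 * tmp == 3 and the result is (3 >> 1) + (3 & 1) == 2, which matches
 * (7 + 2) >> 2.  DO_RSRA below uses the same trick, accumulating into d.
 */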
1881 #define DO_RSHR(NAME, TYPE)                             \
1882 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1883 {                                                       \
1884     intptr_t i, oprsz = simd_oprsz(desc);               \
1885     int shift = simd_data(desc);                        \
1886     TYPE *d = vd, *n = vn;                              \
1887     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1888         TYPE tmp = n[i] >> (shift - 1);                 \
1889         d[i] = (tmp >> 1) + (tmp & 1);                  \
1890     }                                                   \
1891     clear_tail(d, oprsz, simd_maxsz(desc));             \
1892 }
1893 
1894 DO_RSHR(gvec_srshr_b, int8_t)
1895 DO_RSHR(gvec_srshr_h, int16_t)
1896 DO_RSHR(gvec_srshr_s, int32_t)
1897 DO_RSHR(gvec_srshr_d, int64_t)
1898 
1899 DO_RSHR(gvec_urshr_b, uint8_t)
1900 DO_RSHR(gvec_urshr_h, uint16_t)
1901 DO_RSHR(gvec_urshr_s, uint32_t)
1902 DO_RSHR(gvec_urshr_d, uint64_t)
1903 
1904 #undef DO_RSHR
1905 
1906 #define DO_RSRA(NAME, TYPE)                             \
1907 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1908 {                                                       \
1909     intptr_t i, oprsz = simd_oprsz(desc);               \
1910     int shift = simd_data(desc);                        \
1911     TYPE *d = vd, *n = vn;                              \
1912     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1913         TYPE tmp = n[i] >> (shift - 1);                 \
1914         d[i] += (tmp >> 1) + (tmp & 1);                 \
1915     }                                                   \
1916     clear_tail(d, oprsz, simd_maxsz(desc));             \
1917 }
1918 
1919 DO_RSRA(gvec_srsra_b, int8_t)
1920 DO_RSRA(gvec_srsra_h, int16_t)
1921 DO_RSRA(gvec_srsra_s, int32_t)
1922 DO_RSRA(gvec_srsra_d, int64_t)
1923 
1924 DO_RSRA(gvec_ursra_b, uint8_t)
1925 DO_RSRA(gvec_ursra_h, uint16_t)
1926 DO_RSRA(gvec_ursra_s, uint32_t)
1927 DO_RSRA(gvec_ursra_d, uint64_t)
1928 
1929 #undef DO_RSRA
1930 
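/*
 * Shift right and insert: the shifted source is deposited into the low
 * (esize - shift) bits of the destination, preserving its top 'shift' bits.
 */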
1931 #define DO_SRI(NAME, TYPE)                              \
1932 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1933 {                                                       \
1934     intptr_t i, oprsz = simd_oprsz(desc);               \
1935     int shift = simd_data(desc);                        \
1936     TYPE *d = vd, *n = vn;                              \
1937     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1938         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1939     }                                                   \
1940     clear_tail(d, oprsz, simd_maxsz(desc));             \
1941 }
1942 
1943 DO_SRI(gvec_sri_b, uint8_t)
1944 DO_SRI(gvec_sri_h, uint16_t)
1945 DO_SRI(gvec_sri_s, uint32_t)
1946 DO_SRI(gvec_sri_d, uint64_t)
1947 
1948 #undef DO_SRI
1949 
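/*
 * Shift left and insert: the source is deposited into the high
 * (esize - shift) bits of the destination, preserving its low 'shift' bits.
 */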
1950 #define DO_SLI(NAME, TYPE)                              \
1951 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1952 {                                                       \
1953     intptr_t i, oprsz = simd_oprsz(desc);               \
1954     int shift = simd_data(desc);                        \
1955     TYPE *d = vd, *n = vn;                              \
1956     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1957         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1958     }                                                   \
1959     clear_tail(d, oprsz, simd_maxsz(desc));             \
1960 }
1961 
1962 DO_SLI(gvec_sli_b, uint8_t)
1963 DO_SLI(gvec_sli_h, uint16_t)
1964 DO_SLI(gvec_sli_s, uint32_t)
1965 DO_SLI(gvec_sli_d, uint64_t)
1966 
1967 #undef DO_SLI
1968 
1969 /*
1970  * Convert float16 to float32, raising no exceptions and
1971  * preserving exceptional values, including SNaN.
1972  * This is effectively an unpack+repack operation.
1973  */
1974 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1975 {
1976     const int f16_bias = 15;
1977     const int f32_bias = 127;
1978     uint32_t sign = extract32(f16, 15, 1);
1979     uint32_t exp = extract32(f16, 10, 5);
1980     uint32_t frac = extract32(f16, 0, 10);
1981 
1982     if (exp == 0x1f) {
1983         /* Inf or NaN */
1984         exp = 0xff;
1985     } else if (exp == 0) {
1986         /* Zero or denormal.  */
1987         if (frac != 0) {
1988             if (fz16) {
1989                 frac = 0;
1990             } else {
1991                 /*
1992                  * Denormal; these are all normal float32.
1993                  * Shift the fraction so that the msb is at bit 11,
1994                  * then remove bit 11 as the implicit bit of the
1995                  * normalized float32.  Note that we still go through
1996                  * the shift for normal numbers below, to put the
1997                  * float32 fraction at the right place.
1998                  */
1999                 int shift = clz32(frac) - 21;
2000                 frac = (frac << shift) & 0x3ff;
2001                 exp = f32_bias - f16_bias - shift + 1;
2002             }
2003         }
2004     } else {
2005         /* Normal number; adjust the bias.  */
2006         exp += f32_bias - f16_bias;
2007     }
2008     sign <<= 31;
2009     exp <<= 23;
2010     frac <<= 23 - 10;
2011 
2012     return sign | exp | frac;
2013 }
2014 
2015 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2016 {
2017     /*
2018      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2019      * Load the 2nd qword iff is_q & is_2.
2020      * Shift to the 2nd dword iff !is_q & is_2.
2021      * For !is_q & !is_2, the upper bits of the result are garbage.
2022      */
2023     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2024 }
2025 
2026 /*
2027  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2028  * as there are not yet SVE versions that might use blocking.
2029  */
2030 
2031 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2032                      uint32_t desc, bool fz16)
2033 {
2034     intptr_t i, oprsz = simd_oprsz(desc);
2035     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2036     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2037     int is_q = oprsz == 16;
2038     uint64_t n_4, m_4;
2039 
2040     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2041     n_4 = load4_f16(vn, is_q, is_2);
2042     m_4 = load4_f16(vm, is_q, is_2);
2043 
2044     /* Negate all inputs for FMLSL at once.  */
2045     if (is_s) {
2046         n_4 ^= 0x8000800080008000ull;
2047     }
2048 
2049     for (i = 0; i < oprsz / 4; i++) {
2050         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2051         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2052         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2053     }
2054     clear_tail(d, oprsz, simd_maxsz(desc));
2055 }
2056 
2057 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2058                             void *venv, uint32_t desc)
2059 {
2060     CPUARMState *env = venv;
2061     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2062              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2063 }
2064 
2065 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2066                             void *venv, uint32_t desc)
2067 {
2068     CPUARMState *env = venv;
2069     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
2070              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2071 }
2072 
2073 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2074                                void *venv, uint32_t desc)
2075 {
2076     intptr_t i, oprsz = simd_oprsz(desc);
2077     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2078     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2079     CPUARMState *env = venv;
2080     float_status *status = &env->vfp.fp_status;
2081     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2082 
2083     for (i = 0; i < oprsz; i += sizeof(float32)) {
2084         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2085         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2086         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2087         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2088         float32 aa = *(float32 *)(va + H1_4(i));
2089 
2090         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2091     }
2092 }
2093 
2094 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2095                          uint32_t desc, bool fz16)
2096 {
2097     intptr_t i, oprsz = simd_oprsz(desc);
2098     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2099     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2100     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2101     int is_q = oprsz == 16;
2102     uint64_t n_4;
2103     float32 m_1;
2104 
2105     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2106     n_4 = load4_f16(vn, is_q, is_2);
2107 
2108     /* Negate all inputs for FMLSL at once.  */
2109     if (is_s) {
2110         n_4 ^= 0x8000800080008000ull;
2111     }
2112 
2113     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2114 
2115     for (i = 0; i < oprsz / 4; i++) {
2116         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2117         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2118     }
2119     clear_tail(d, oprsz, simd_maxsz(desc));
2120 }
2121 
2122 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2123                                 void *venv, uint32_t desc)
2124 {
2125     CPUARMState *env = venv;
2126     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2127                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2128 }
2129 
2130 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2131                                 void *venv, uint32_t desc)
2132 {
2133     CPUARMState *env = venv;
2134     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
2135                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2136 }
2137 
2138 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2139                                void *venv, uint32_t desc)
2140 {
2141     intptr_t i, j, oprsz = simd_oprsz(desc);
2142     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2143     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2144     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2145     CPUARMState *env = venv;
2146     float_status *status = &env->vfp.fp_status;
2147     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2148 
2149     for (i = 0; i < oprsz; i += 16) {
2150         float16 mm_16 = *(float16 *)(vm + i + idx);
2151         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2152 
2153         for (j = 0; j < 16; j += sizeof(float32)) {
2154             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2155             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2156             float32 aa = *(float32 *)(va + H1_4(i + j));
2157 
2158             *(float32 *)(vd + H1_4(i + j)) =
2159                 float32_muladd(nn, mm, aa, 0, status);
2160         }
2161     }
2162 }
2163 
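/*
 * Variable shifts: the per-element shift count is signed, with negative
 * counts shifting right.  Counts of magnitude >= esize yield zero for
 * left shifts, and sign-fill (signed) or zero (unsigned) for right shifts.
 */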
2164 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2165 {
2166     intptr_t i, opr_sz = simd_oprsz(desc);
2167     int8_t *d = vd, *n = vn, *m = vm;
2168 
2169     for (i = 0; i < opr_sz; ++i) {
2170         int8_t mm = m[i];
2171         int8_t nn = n[i];
2172         int8_t res = 0;
2173         if (mm >= 0) {
2174             if (mm < 8) {
2175                 res = nn << mm;
2176             }
2177         } else {
2178             res = nn >> (mm > -8 ? -mm : 7);
2179         }
2180         d[i] = res;
2181     }
2182     clear_tail(d, opr_sz, simd_maxsz(desc));
2183 }
2184 
2185 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2186 {
2187     intptr_t i, opr_sz = simd_oprsz(desc);
2188     int16_t *d = vd, *n = vn, *m = vm;
2189 
2190     for (i = 0; i < opr_sz / 2; ++i) {
2191         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2192         int16_t nn = n[i];
2193         int16_t res = 0;
2194         if (mm >= 0) {
2195             if (mm < 16) {
2196                 res = nn << mm;
2197             }
2198         } else {
2199             res = nn >> (mm > -16 ? -mm : 15);
2200         }
2201         d[i] = res;
2202     }
2203     clear_tail(d, opr_sz, simd_maxsz(desc));
2204 }
2205 
2206 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2207 {
2208     intptr_t i, opr_sz = simd_oprsz(desc);
2209     uint8_t *d = vd, *n = vn, *m = vm;
2210 
2211     for (i = 0; i < opr_sz; ++i) {
2212         int8_t mm = m[i];
2213         uint8_t nn = n[i];
2214         uint8_t res = 0;
2215         if (mm >= 0) {
2216             if (mm < 8) {
2217                 res = nn << mm;
2218             }
2219         } else {
2220             if (mm > -8) {
2221                 res = nn >> -mm;
2222             }
2223         }
2224         d[i] = res;
2225     }
2226     clear_tail(d, opr_sz, simd_maxsz(desc));
2227 }
2228 
2229 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2230 {
2231     intptr_t i, opr_sz = simd_oprsz(desc);
2232     uint16_t *d = vd, *n = vn, *m = vm;
2233 
2234     for (i = 0; i < opr_sz / 2; ++i) {
2235         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2236         uint16_t nn = n[i];
2237         uint16_t res = 0;
2238         if (mm >= 0) {
2239             if (mm < 16) {
2240                 res = nn << mm;
2241             }
2242         } else {
2243             if (mm > -16) {
2244                 res = nn >> -mm;
2245             }
2246         }
2247         d[i] = res;
2248     }
2249     clear_tail(d, opr_sz, simd_maxsz(desc));
2250 }
2251 
2252 /*
2253  * 8x8->8 polynomial multiply.
2254  *
2255  * Polynomial multiplication is like integer multiplication except the
2256  * partial products are XORed, not added.
2257  *
2258  * TODO: expose this as a generic vector operation, as it is a common
2259  * crypto building block.
2260  */
2261 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2262 {
2263     intptr_t i, opr_sz = simd_oprsz(desc);
2264     uint64_t *d = vd, *n = vn, *m = vm;
2265 
2266     for (i = 0; i < opr_sz / 8; ++i) {
2267         d[i] = clmul_8x8_low(n[i], m[i]);
2268     }
2269     clear_tail(d, opr_sz, simd_maxsz(desc));
2270 }
2271 
2272 /*
2273  * 64x64->128 polynomial multiply.
2274  * Because the lanes are not accessed in strict columns,
2275  * this probably cannot be turned into a generic helper.
2276  */
2277 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2278 {
2279     intptr_t i, opr_sz = simd_oprsz(desc);
2280     intptr_t hi = simd_data(desc);
2281     uint64_t *d = vd, *n = vn, *m = vm;
2282 
2283     for (i = 0; i < opr_sz / 8; i += 2) {
2284         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2285         d[i] = int128_getlo(r);
2286         d[i + 1] = int128_gethi(r);
2287     }
2288     clear_tail(d, opr_sz, simd_maxsz(desc));
2289 }
2290 
2291 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2292 {
2293     int hi = simd_data(desc);
2294     uint64_t *d = vd, *n = vn, *m = vm;
2295     uint64_t nn = n[hi], mm = m[hi];
2296 
2297     d[0] = clmul_8x4_packed(nn, mm);
2298     nn >>= 32;
2299     mm >>= 32;
2300     d[1] = clmul_8x4_packed(nn, mm);
2301 
2302     clear_tail(d, 16, simd_maxsz(desc));
2303 }
2304 
2305 #ifdef TARGET_AARCH64
2306 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2307 {
2308     int shift = simd_data(desc) * 8;
2309     intptr_t i, opr_sz = simd_oprsz(desc);
2310     uint64_t *d = vd, *n = vn, *m = vm;
2311 
2312     for (i = 0; i < opr_sz / 8; ++i) {
2313         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2314     }
2315 }
2316 
2317 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2318 {
2319     intptr_t sel = H4(simd_data(desc));
2320     intptr_t i, opr_sz = simd_oprsz(desc);
2321     uint32_t *n = vn, *m = vm;
2322     uint64_t *d = vd;
2323 
2324     for (i = 0; i < opr_sz / 8; ++i) {
2325         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2326     }
2327 }
2328 #endif
2329 
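/*
 * Compare each element against zero, producing a mask: negating the 0/1
 * comparison result gives all-zeros or all-ones of the element width.
 */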
2330 #define DO_CMP0(NAME, TYPE, OP)                         \
2331 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2332 {                                                       \
2333     intptr_t i, opr_sz = simd_oprsz(desc);              \
2334     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2335         TYPE nn = *(TYPE *)(vn + i);                    \
2336         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2337     }                                                   \
2338     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2339 }
2340 
2341 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2342 DO_CMP0(gvec_clt0_b, int8_t, <)
2343 DO_CMP0(gvec_cle0_b, int8_t, <=)
2344 DO_CMP0(gvec_cgt0_b, int8_t, >)
2345 DO_CMP0(gvec_cge0_b, int8_t, >=)
2346 
2347 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2348 DO_CMP0(gvec_clt0_h, int16_t, <)
2349 DO_CMP0(gvec_cle0_h, int16_t, <=)
2350 DO_CMP0(gvec_cgt0_h, int16_t, >)
2351 DO_CMP0(gvec_cge0_h, int16_t, >=)
2352 
2353 #undef DO_CMP0
2354 
2355 #define DO_ABD(NAME, TYPE)                                      \
2356 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2357 {                                                               \
2358     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2359     TYPE *d = vd, *n = vn, *m = vm;                             \
2360                                                                 \
2361     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2362         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2363     }                                                           \
2364     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2365 }
2366 
2367 DO_ABD(gvec_sabd_b, int8_t)
2368 DO_ABD(gvec_sabd_h, int16_t)
2369 DO_ABD(gvec_sabd_s, int32_t)
2370 DO_ABD(gvec_sabd_d, int64_t)
2371 
2372 DO_ABD(gvec_uabd_b, uint8_t)
2373 DO_ABD(gvec_uabd_h, uint16_t)
2374 DO_ABD(gvec_uabd_s, uint32_t)
2375 DO_ABD(gvec_uabd_d, uint64_t)
2376 
2377 #undef DO_ABD
2378 
2379 #define DO_ABA(NAME, TYPE)                                      \
2380 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2381 {                                                               \
2382     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2383     TYPE *d = vd, *n = vn, *m = vm;                             \
2384                                                                 \
2385     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2386         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2387     }                                                           \
2388     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2389 }
2390 
2391 DO_ABA(gvec_saba_b, int8_t)
2392 DO_ABA(gvec_saba_h, int16_t)
2393 DO_ABA(gvec_saba_s, int32_t)
2394 DO_ABA(gvec_saba_d, int64_t)
2395 
2396 DO_ABA(gvec_uaba_b, uint8_t)
2397 DO_ABA(gvec_uaba_h, uint16_t)
2398 DO_ABA(gvec_uaba_s, uint32_t)
2399 DO_ABA(gvec_uaba_d, uint64_t)
2400 
2401 #undef DO_ABA
2402 
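/*
 * Pairwise operations.  Only the 'm' operand needs a scratch copy: the
 * first loop consumes each 'n' element before it can be overwritten, so
 * d == n is safe, whereas the second loop re-reads 'm' from the start
 * after the first half of 'd' has already been written.
 */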
2403 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2404 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
2405 {                                                                          \
2406     ARMVectorReg scratch;                                                  \
2407     intptr_t oprsz = simd_oprsz(desc);                                     \
2408     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2409     TYPE *d = vd, *n = vn, *m = vm;                                        \
2410     if (unlikely(d == m)) {                                                \
2411         m = memcpy(&scratch, m, oprsz);                                    \
2412     }                                                                      \
2413     for (intptr_t i = 0; i < half; ++i) {                                  \
2414         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2415     }                                                                      \
2416     for (intptr_t i = 0; i < half; ++i) {                                  \
2417         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2418     }                                                                      \
2419     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2420 }
2421 
2422 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2423 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2424 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2425 
2426 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2427 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2428 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2429 
2430 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2431 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2432 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2433 
2434 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2435 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2436 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2437 
2438 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2439 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2440 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2441 
2442 #undef DO_3OP_PAIR
2443 
2444 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2445 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2446 {                                                               \
2447     ARMVectorReg scratch;                                       \
2448     intptr_t oprsz = simd_oprsz(desc);                          \
2449     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2450     TYPE *d = vd, *n = vn, *m = vm;                             \
2451     if (unlikely(d == m)) {                                     \
2452         m = memcpy(&scratch, m, oprsz);                         \
2453     }                                                           \
2454     for (intptr_t i = 0; i < half; ++i) {                       \
2455         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2456     }                                                           \
2457     for (intptr_t i = 0; i < half; ++i) {                       \
2458         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2459     }                                                           \
2460     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2461 }
2462 
2463 #define ADD(A, B) (A + B)
2464 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2465 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2466 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2467 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2468 #undef  ADD
2469 
2470 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2471 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2472 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2473 
2474 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2475 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2476 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2477 
2478 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2479 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2480 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2481 
2482 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2483 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2484 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2485 
2486 #undef DO_3OP_PAIR
2487 
2488 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2489     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2490     {                                                                   \
2491         intptr_t i, oprsz = simd_oprsz(desc);                           \
2492         int shift = simd_data(desc);                                    \
2493         TYPE *d = vd, *n = vn;                                          \
2494         float_status *fpst = stat;                                      \
2495         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2496             d[i] = FUNC(n[i], shift, fpst);                             \
2497         }                                                               \
2498         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2499     }
2500 
2501 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2502 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2503 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2504 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2505 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2506 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2507 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2508 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2509 
2510 #undef DO_VCVT_FIXED
2511 
2512 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2513     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2514     {                                                                   \
2515         float_status *fpst = stat;                                      \
2516         intptr_t i, oprsz = simd_oprsz(desc);                           \
2517         uint32_t rmode = simd_data(desc);                               \
2518         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2519         TYPE *d = vd, *n = vn;                                          \
2520         set_float_rounding_mode(rmode, fpst);                           \
2521         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2522             d[i] = FUNC(n[i], 0, fpst);                                 \
2523         }                                                               \
2524         set_float_rounding_mode(prev_rmode, fpst);                      \
2525         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2526     }
2527 
2528 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2529 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2530 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2531 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2532 
2533 #undef DO_VCVT_RMODE
2534 
2535 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2536     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2537     {                                                                   \
2538         float_status *fpst = stat;                                      \
2539         intptr_t i, oprsz = simd_oprsz(desc);                           \
2540         uint32_t rmode = simd_data(desc);                               \
2541         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2542         TYPE *d = vd, *n = vn;                                          \
2543         set_float_rounding_mode(rmode, fpst);                           \
2544         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2545             d[i] = FUNC(n[i], fpst);                                    \
2546         }                                                               \
2547         set_float_rounding_mode(prev_rmode, fpst);                      \
2548         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2549     }
2550 
2551 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2552 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2553 
2554 #undef DO_VRINT_RMODE
2555 
2556 #ifdef TARGET_AARCH64
2557 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2558 {
2559     const uint8_t *indices = vm;
2560     CPUARMState *env = venv;
2561     size_t oprsz = simd_oprsz(desc);
2562     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2563     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2564     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2565     union {
2566         uint8_t b[16];
2567         uint64_t d[2];
2568     } result;
2569 
2570     /*
2571      * We must construct the final result in a temp, lest the output
2572      * overlap the input table.  For TBL, begin with zero; for TBX,
2573      * begin with the original register contents.  Note that we always
2574      * copy 16 bytes here to avoid an extra branch; clearing the high
2575      * bits of the register for oprsz == 8 is handled below.
2576      */
2577     if (is_tbx) {
2578         memcpy(&result, vd, 16);
2579     } else {
2580         memset(&result, 0, 16);
2581     }
2582 
2583     for (size_t i = 0; i < oprsz; ++i) {
2584         uint32_t index = indices[H1(i)];
2585 
2586         if (index < table_len) {
2587             /*
2588              * Convert index (a byte offset into the virtual table
2589              * which is a series of 128-bit vectors concatenated)
2590              * into the correct register element, bearing in mind
2591              * that the table can wrap around from V31 to V0.
2592              */
2593             const uint8_t *table = (const uint8_t *)
2594                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2595             result.b[H1(i)] = table[H1(index % 16)];
2596         }
2597     }
2598 
2599     memcpy(vd, &result, 16);
2600     clear_tail(vd, oprsz, simd_maxsz(desc));
2601 }
2602 #endif
2603 
2604 /*
2605  * NxN -> N highpart multiply
2606  *
2607  * TODO: expose this as a generic vector operation.
2608  */
2609 
2610 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2611 {
2612     intptr_t i, opr_sz = simd_oprsz(desc);
2613     int8_t *d = vd, *n = vn, *m = vm;
2614 
2615     for (i = 0; i < opr_sz; ++i) {
2616         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2617     }
2618     clear_tail(d, opr_sz, simd_maxsz(desc));
2619 }
2620 
2621 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2622 {
2623     intptr_t i, opr_sz = simd_oprsz(desc);
2624     int16_t *d = vd, *n = vn, *m = vm;
2625 
2626     for (i = 0; i < opr_sz / 2; ++i) {
2627         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2628     }
2629     clear_tail(d, opr_sz, simd_maxsz(desc));
2630 }
2631 
2632 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2633 {
2634     intptr_t i, opr_sz = simd_oprsz(desc);
2635     int32_t *d = vd, *n = vn, *m = vm;
2636 
2637     for (i = 0; i < opr_sz / 4; ++i) {
2638         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2639     }
2640     clear_tail(d, opr_sz, simd_maxsz(desc));
2641 }
2642 
2643 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2644 {
2645     intptr_t i, opr_sz = simd_oprsz(desc);
2646     uint64_t *d = vd, *n = vn, *m = vm;
2647     uint64_t discard;
2648 
2649     for (i = 0; i < opr_sz / 8; ++i) {
2650         muls64(&discard, &d[i], n[i], m[i]);
2651     }
2652     clear_tail(d, opr_sz, simd_maxsz(desc));
2653 }
2654 
2655 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2656 {
2657     intptr_t i, opr_sz = simd_oprsz(desc);
2658     uint8_t *d = vd, *n = vn, *m = vm;
2659 
2660     for (i = 0; i < opr_sz; ++i) {
2661         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2662     }
2663     clear_tail(d, opr_sz, simd_maxsz(desc));
2664 }
2665 
2666 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2667 {
2668     intptr_t i, opr_sz = simd_oprsz(desc);
2669     uint16_t *d = vd, *n = vn, *m = vm;
2670 
2671     for (i = 0; i < opr_sz / 2; ++i) {
2672         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2673     }
2674     clear_tail(d, opr_sz, simd_maxsz(desc));
2675 }
2676 
2677 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2678 {
2679     intptr_t i, opr_sz = simd_oprsz(desc);
2680     uint32_t *d = vd, *n = vn, *m = vm;
2681 
2682     for (i = 0; i < opr_sz / 4; ++i) {
2683         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2684     }
2685     clear_tail(d, opr_sz, simd_maxsz(desc));
2686 }
2687 
2688 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2689 {
2690     intptr_t i, opr_sz = simd_oprsz(desc);
2691     uint64_t *d = vd, *n = vn, *m = vm;
2692     uint64_t discard;
2693 
2694     for (i = 0; i < opr_sz / 8; ++i) {
2695         mulu64(&discard, &d[i], n[i], m[i]);
2696     }
2697     clear_tail(d, opr_sz, simd_maxsz(desc));
2698 }
2699 
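/* XAR: XOR the two inputs, then rotate the result right by the immediate. */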
2700 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2701 {
2702     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2703     int shr = simd_data(desc);
2704     uint64_t *d = vd, *n = vn, *m = vm;
2705 
2706     for (i = 0; i < opr_sz; ++i) {
2707         d[i] = ror64(n[i] ^ m[i], shr);
2708     }
2709     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2710 }
2711 
2712 /*
2713  * Integer matrix-multiply accumulate
2714  */
2715 
2716 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2717 {
2718     int8_t *n = vn, *m = vm;
2719 
2720     for (intptr_t k = 0; k < 8; ++k) {
2721         sum += n[H1(k)] * m[H1(k)];
2722     }
2723     return sum;
2724 }
2725 
2726 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2727 {
2728     uint8_t *n = vn, *m = vm;
2729 
2730     for (intptr_t k = 0; k < 8; ++k) {
2731         sum += n[H1(k)] * m[H1(k)];
2732     }
2733     return sum;
2734 }
2735 
2736 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2737 {
2738     uint8_t *n = vn;
2739     int8_t *m = vm;
2740 
2741     for (intptr_t k = 0; k < 8; ++k) {
2742         sum += n[H1(k)] * m[H1(k)];
2743     }
2744     return sum;
2745 }
2746 
2747 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2748                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2749 {
2750     intptr_t seg, opr_sz = simd_oprsz(desc);
2751 
2752     for (seg = 0; seg < opr_sz; seg += 16) {
2753         uint32_t *d = vd + seg;
2754         uint32_t *a = va + seg;
2755         uint32_t sum0, sum1, sum2, sum3;
2756 
2757         /*
2758          * Process the entire segment at once, writing back the
2759          * results only after we've consumed all of the inputs.
2760          *
2761          * Key to indices by column:
2762          *          i   j                  i             j
2763          */
2764         sum0 = a[H4(0 + 0)];
2765         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2766         sum1 = a[H4(0 + 1)];
2767         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2768         sum2 = a[H4(2 + 0)];
2769         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2770         sum3 = a[H4(2 + 1)];
2771         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2772 
2773         d[H4(0)] = sum0;
2774         d[H4(1)] = sum1;
2775         d[H4(2)] = sum2;
2776         d[H4(3)] = sum3;
2777     }
2778     clear_tail(vd, opr_sz, simd_maxsz(desc));
2779 }
2780 
2781 #define DO_MMLA_B(NAME, INNER) \
2782     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2783     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2784 
2785 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2786 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2787 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2788 
2789 /*
2790  * BFloat16 Dot Product
2791  */
2792 
2793 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2794 {
2795     /* FPCR is ignored for BFDOT and BFMMLA. */
2796     float_status bf_status = {
2797         .tininess_before_rounding = float_tininess_before_rounding,
2798         .float_rounding_mode = float_round_to_odd_inf,
2799         .flush_to_zero = true,
2800         .flush_inputs_to_zero = true,
2801         .default_nan_mode = true,
2802     };
2803     float32 t1, t2;
2804 
2805     /*
2806      * Extract each BFloat16 from the element pair, and shift
2807      * them such that they become float32.
2808      */
2809     t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2810     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2811     t1 = float32_add(t1, t2, &bf_status);
2812     t1 = float32_add(sum, t1, &bf_status);
2813 
2814     return t1;
2815 }
2816 
2817 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2818 {
2819     intptr_t i, opr_sz = simd_oprsz(desc);
2820     float32 *d = vd, *a = va;
2821     uint32_t *n = vn, *m = vm;
2822 
2823     for (i = 0; i < opr_sz / 4; ++i) {
2824         d[i] = bfdotadd(a[i], n[i], m[i]);
2825     }
2826     clear_tail(d, opr_sz, simd_maxsz(desc));
2827 }
2828 
2829 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2830                             void *va, uint32_t desc)
2831 {
2832     intptr_t i, j, opr_sz = simd_oprsz(desc);
2833     intptr_t index = simd_data(desc);
2834     intptr_t elements = opr_sz / 4;
2835     intptr_t eltspersegment = MIN(16 / 4, elements);
2836     float32 *d = vd, *a = va;
2837     uint32_t *n = vn, *m = vm;
2838 
2839     for (i = 0; i < elements; i += eltspersegment) {
2840         uint32_t m_idx = m[i + H4(index)];
2841 
2842         for (j = i; j < i + eltspersegment; j++) {
2843             d[j] = bfdotadd(a[j], n[j], m_idx);
2844         }
2845     }
2846     clear_tail(d, opr_sz, simd_maxsz(desc));
2847 }
2848 
2849 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2850 {
2851     intptr_t s, opr_sz = simd_oprsz(desc);
2852     float32 *d = vd, *a = va;
2853     uint32_t *n = vn, *m = vm;
2854 
2855     for (s = 0; s < opr_sz / 4; s += 4) {
2856         float32 sum00, sum01, sum10, sum11;
2857 
2858         /*
2859          * Process the entire segment at once, writing back the
2860          * results only after we've consumed all of the inputs.
2861          *
2862          * Key to indices by column:
2863          *               i   j           i   k             j   k
2864          */
2865         sum00 = a[s + H4(0 + 0)];
2866         sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2867         sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2868 
2869         sum01 = a[s + H4(0 + 1)];
2870         sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2871         sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2872 
2873         sum10 = a[s + H4(2 + 0)];
2874         sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2875         sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2876 
2877         sum11 = a[s + H4(2 + 1)];
2878         sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2879         sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2880 
2881         d[s + H4(0 + 0)] = sum00;
2882         d[s + H4(0 + 1)] = sum01;
2883         d[s + H4(2 + 0)] = sum10;
2884         d[s + H4(2 + 1)] = sum11;
2885     }
2886     clear_tail(d, opr_sz, simd_maxsz(desc));
2887 }
2888 
2889 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2890                          void *stat, uint32_t desc)
2891 {
2892     intptr_t i, opr_sz = simd_oprsz(desc);
2893     intptr_t sel = simd_data(desc);
2894     float32 *d = vd, *a = va;
2895     bfloat16 *n = vn, *m = vm;
2896 
2897     for (i = 0; i < opr_sz / 4; ++i) {
2898         float32 nn = n[H2(i * 2 + sel)] << 16;
2899         float32 mm = m[H2(i * 2 + sel)] << 16;
2900         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2901     }
2902     clear_tail(d, opr_sz, simd_maxsz(desc));
2903 }
2904 
2905 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2906                              void *va, void *stat, uint32_t desc)
2907 {
2908     intptr_t i, j, opr_sz = simd_oprsz(desc);
2909     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2910     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2911     intptr_t elements = opr_sz / 4;
2912     intptr_t eltspersegment = MIN(16 / 4, elements);
2913     float32 *d = vd, *a = va;
2914     bfloat16 *n = vn, *m = vm;
2915 
2916     for (i = 0; i < elements; i += eltspersegment) {
2917         float32 m_idx = m[H2(2 * i + index)] << 16;
2918 
2919         for (j = i; j < i + eltspersegment; j++) {
2920             float32 n_j = n[H2(2 * j + sel)] << 16;
2921             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2922         }
2923     }
2924     clear_tail(d, opr_sz, simd_maxsz(desc));
2925 }
2926 
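/* Element-wise clamp: bound each element of 'a' below by 'n' and above by 'm'. */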
2927 #define DO_CLAMP(NAME, TYPE) \
2928 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
2929 {                                                                       \
2930     intptr_t i, opr_sz = simd_oprsz(desc);                              \
2931     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
2932         TYPE aa = *(TYPE *)(a + i);                                     \
2933         TYPE nn = *(TYPE *)(n + i);                                     \
2934         TYPE mm = *(TYPE *)(m + i);                                     \
2935         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
2936         *(TYPE *)(d + i) = dd;                                          \
2937     }                                                                   \
2938     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
2939 }
2940 
2941 DO_CLAMP(gvec_sclamp_b, int8_t)
2942 DO_CLAMP(gvec_sclamp_h, int16_t)
2943 DO_CLAMP(gvec_sclamp_s, int32_t)
2944 DO_CLAMP(gvec_sclamp_d, int64_t)
2945 
2946 DO_CLAMP(gvec_uclamp_b, uint8_t)
2947 DO_CLAMP(gvec_uclamp_h, uint16_t)
2948 DO_CLAMP(gvec_uclamp_s, uint32_t)
2949 DO_CLAMP(gvec_uclamp_d, uint64_t)
2950