1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
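 *
 * For example, index 0x05 has bits 0 and 2 set, so it expands to
 * 0x0000000000ff00ff: an all-ones byte in element positions 0 and 2.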
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
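 *
 * Only even-numbered predicate bits are significant for halfword
 * elements, so indices with any odd bit set (i & 0xaa) are skipped
 * and those table entries remain zero.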
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify:
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
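     *
     * The first form is the architectural result: the accumulator shifted
     * into the high half, plus the doubled product, plus the optional
     * rounding constant.  Halving every term preserves the quotient while
     * keeping the arithmetic within int32_t.  For example, with round
     * clear, src1 = src2 = 0x40 gives 4096 >> 7 = 32, and src1 = src2 =
     * -128 gives 16384 >> 7 = 128, which saturates to INT8_MAX.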
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
178 
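/*
 * The SVE2 helpers do not track saturation: the byte form above has no
 * sat output, and the wider forms below pass a discarded local.
 */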
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320 
321     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
322         int16_t mm = m[i];
323         for (j = 0; j < 16 / 2; ++j) {
324             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
325         }
326     }
327     clear_tail(d, opr_sz, simd_maxsz(desc));
328 }
329 
330 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
331                                  void *vq, uint32_t desc)
332 {
333     intptr_t i, j, opr_sz = simd_oprsz(desc);
334     int idx = simd_data(desc);
335     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
336 
337     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
338         int16_t mm = m[i];
339         for (j = 0; j < 16 / 2; ++j) {
340             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
341         }
342     }
343     clear_tail(d, opr_sz, simd_maxsz(desc));
344 }
345 
346 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
347                              void *va, uint32_t desc)
348 {
349     intptr_t i, opr_sz = simd_oprsz(desc);
350     int16_t *d = vd, *n = vn, *m = vm, *a = va;
351     uint32_t discard;
352 
353     for (i = 0; i < opr_sz / 2; ++i) {
354         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
355     }
356 }
357 
358 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
359                              void *va, uint32_t desc)
360 {
361     intptr_t i, opr_sz = simd_oprsz(desc);
362     int16_t *d = vd, *n = vn, *m = vm, *a = va;
363     uint32_t discard;
364 
365     for (i = 0; i < opr_sz / 2; ++i) {
366         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
367     }
368 }
369 
370 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
371 {
372     intptr_t i, opr_sz = simd_oprsz(desc);
373     int16_t *d = vd, *n = vn, *m = vm;
374     uint32_t discard;
375 
376     for (i = 0; i < opr_sz / 2; ++i) {
377         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
378     }
379 }
380 
381 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
382 {
383     intptr_t i, opr_sz = simd_oprsz(desc);
384     int16_t *d = vd, *n = vn, *m = vm;
385     uint32_t discard;
386 
387     for (i = 0; i < opr_sz / 2; ++i) {
388         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
389     }
390 }
391 
392 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
393 {
394     intptr_t i, j, opr_sz = simd_oprsz(desc);
395     int idx = simd_data(desc);
396     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
397     uint32_t discard;
398 
399     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
400         int16_t mm = m[i];
401         for (j = 0; j < 16 / 2; ++j) {
402             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
403         }
404     }
405 }
406 
407 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
408 {
409     intptr_t i, j, opr_sz = simd_oprsz(desc);
410     int idx = simd_data(desc);
411     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
412     uint32_t discard;
413 
414     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
415         int16_t mm = m[i];
416         for (j = 0; j < 16 / 2; ++j) {
417             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
418         }
419     }
420 }
421 
422 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
423 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
424                       bool neg, bool round, uint32_t *sat)
425 {
426     /* Simplify similarly to do_sqrdmlah_b above.  */
427     int64_t ret = (int64_t)src1 * src2;
428     if (neg) {
429         ret = -ret;
430     }
431     ret += ((int64_t)src3 << 31) + (round << 30);
432     ret >>= 31;
433 
434     if (ret != (int32_t)ret) {
435         *sat = 1;
436         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
437     }
438     return ret;
439 }
440 
441 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
442                                   int32_t src2, int32_t src3)
443 {
444     uint32_t *sat = &env->vfp.qc[0];
445     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
446 }
447 
448 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
449                               void *vq, uint32_t desc)
450 {
451     uintptr_t opr_sz = simd_oprsz(desc);
452     int32_t *d = vd;
453     int32_t *n = vn;
454     int32_t *m = vm;
455     uintptr_t i;
456 
457     for (i = 0; i < opr_sz / 4; ++i) {
458         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
459     }
460     clear_tail(d, opr_sz, simd_maxsz(desc));
461 }
462 
463 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
464                                   int32_t src2, int32_t src3)
465 {
466     uint32_t *sat = &env->vfp.qc[0];
467     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
468 }
469 
470 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
471                               void *vq, uint32_t desc)
472 {
473     uintptr_t opr_sz = simd_oprsz(desc);
474     int32_t *d = vd;
475     int32_t *n = vn;
476     int32_t *m = vm;
477     uintptr_t i;
478 
479     for (i = 0; i < opr_sz / 4; ++i) {
480         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
481     }
482     clear_tail(d, opr_sz, simd_maxsz(desc));
483 }
484 
485 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
486                             void *vq, uint32_t desc)
487 {
488     intptr_t i, opr_sz = simd_oprsz(desc);
489     int32_t *d = vd, *n = vn, *m = vm;
490 
491     for (i = 0; i < opr_sz / 4; ++i) {
492         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
493     }
494     clear_tail(d, opr_sz, simd_maxsz(desc));
495 }
496 
497 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
498                              void *vq, uint32_t desc)
499 {
500     intptr_t i, opr_sz = simd_oprsz(desc);
501     int32_t *d = vd, *n = vn, *m = vm;
502 
503     for (i = 0; i < opr_sz / 4; ++i) {
504         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
505     }
506     clear_tail(d, opr_sz, simd_maxsz(desc));
507 }
508 
509 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
510                                 void *vq, uint32_t desc)
511 {
512     intptr_t i, j, opr_sz = simd_oprsz(desc);
513     int idx = simd_data(desc);
514     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
515 
516     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
517         int32_t mm = m[i];
518         for (j = 0; j < 16 / 4; ++j) {
519             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
520         }
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
526                                  void *vq, uint32_t desc)
527 {
528     intptr_t i, j, opr_sz = simd_oprsz(desc);
529     int idx = simd_data(desc);
530     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
531 
532     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
533         int32_t mm = m[i];
534         for (j = 0; j < 16 / 4; ++j) {
535             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
536         }
537     }
538     clear_tail(d, opr_sz, simd_maxsz(desc));
539 }
540 
541 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
542                              void *va, uint32_t desc)
543 {
544     intptr_t i, opr_sz = simd_oprsz(desc);
545     int32_t *d = vd, *n = vn, *m = vm, *a = va;
546     uint32_t discard;
547 
548     for (i = 0; i < opr_sz / 4; ++i) {
549         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
550     }
551 }
552 
553 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
554                              void *va, uint32_t desc)
555 {
556     intptr_t i, opr_sz = simd_oprsz(desc);
557     int32_t *d = vd, *n = vn, *m = vm, *a = va;
558     uint32_t discard;
559 
560     for (i = 0; i < opr_sz / 4; ++i) {
561         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
562     }
563 }
564 
565 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
566 {
567     intptr_t i, opr_sz = simd_oprsz(desc);
568     int32_t *d = vd, *n = vn, *m = vm;
569     uint32_t discard;
570 
571     for (i = 0; i < opr_sz / 4; ++i) {
572         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
573     }
574 }
575 
576 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
577 {
578     intptr_t i, opr_sz = simd_oprsz(desc);
579     int32_t *d = vd, *n = vn, *m = vm;
580     uint32_t discard;
581 
582     for (i = 0; i < opr_sz / 4; ++i) {
583         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
584     }
585 }
586 
587 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
588 {
589     intptr_t i, j, opr_sz = simd_oprsz(desc);
590     int idx = simd_data(desc);
591     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
592     uint32_t discard;
593 
594     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < 16 / 4; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
598         }
599     }
600 }
601 
602 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
603 {
604     intptr_t i, j, opr_sz = simd_oprsz(desc);
605     int idx = simd_data(desc);
606     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
607     uint32_t discard;
608 
609     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
610         int32_t mm = m[i];
611         for (j = 0; j < 16 / 4; ++j) {
612             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
613         }
614     }
615 }
616 
617 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
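/* Saturate a 128-bit intermediate result to the int64_t range. */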
618 static int64_t do_sat128_d(Int128 r)
619 {
620     int64_t ls = int128_getlo(r);
621     int64_t hs = int128_gethi(r);
622 
623     if (unlikely(hs != (ls >> 63))) {
624         return hs < 0 ? INT64_MIN : INT64_MAX;
625     }
626     return ls;
627 }
628 
629 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
630 {
631     uint64_t l, h;
632     Int128 r, t;
633 
634     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
635     muls64(&l, &h, m, n);
636     r = int128_make128(l, h);
637     if (neg) {
638         r = int128_neg(r);
639     }
640     if (a) {
641         t = int128_exts64(a);
642         t = int128_lshift(t, 63);
643         r = int128_add(r, t);
644     }
645     if (round) {
646         t = int128_exts64(1ll << 62);
647         r = int128_add(r, t);
648     }
649     r = int128_rshift(r, 63);
650 
651     return do_sat128_d(r);
652 }
653 
654 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
655                              void *va, uint32_t desc)
656 {
657     intptr_t i, opr_sz = simd_oprsz(desc);
658     int64_t *d = vd, *n = vn, *m = vm, *a = va;
659 
660     for (i = 0; i < opr_sz / 8; ++i) {
661         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
662     }
663 }
664 
665 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
666                              void *va, uint32_t desc)
667 {
668     intptr_t i, opr_sz = simd_oprsz(desc);
669     int64_t *d = vd, *n = vn, *m = vm, *a = va;
670 
671     for (i = 0; i < opr_sz / 8; ++i) {
672         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
673     }
674 }
675 
676 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
677 {
678     intptr_t i, opr_sz = simd_oprsz(desc);
679     int64_t *d = vd, *n = vn, *m = vm;
680 
681     for (i = 0; i < opr_sz / 8; ++i) {
682         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
683     }
684 }
685 
686 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
687 {
688     intptr_t i, opr_sz = simd_oprsz(desc);
689     int64_t *d = vd, *n = vn, *m = vm;
690 
691     for (i = 0; i < opr_sz / 8; ++i) {
692         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
693     }
694 }
695 
696 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
697 {
698     intptr_t i, j, opr_sz = simd_oprsz(desc);
699     int idx = simd_data(desc);
700     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
701 
702     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
703         int64_t mm = m[i];
704         for (j = 0; j < 16 / 8; ++j) {
705             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
706         }
707     }
708 }
709 
710 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
711 {
712     intptr_t i, j, opr_sz = simd_oprsz(desc);
713     int idx = simd_data(desc);
714     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
715 
716     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
717         int64_t mm = m[i];
718         for (j = 0; j < 16 / 8; ++j) {
719             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
720         }
721     }
722 }
723 
724 /* Integer 8 and 16-bit dot-product.
725  *
726  * Note that for the loops herein, host endianness does not matter
727  * with respect to the ordering of data within the quad-width lanes.
728  * All elements are treated equally, no matter where they are.
729  */
730 
731 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
732 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
733 {                                                                         \
734     intptr_t i, opr_sz = simd_oprsz(desc);                                \
735     TYPED *d = vd, *a = va;                                               \
736     TYPEN *n = vn;                                                        \
737     TYPEM *m = vm;                                                        \
738     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
739         d[i] = (a[i] +                                                    \
740                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
741                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
742                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
743                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
744     }                                                                     \
745     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
746 }
747 
748 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
749 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
750 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
751 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
752 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
753 
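/*
 * Indexed dot-product: the four TYPEM values come from the group selected
 * by the index within each 128-bit segment of m, and are reused for every
 * result element of that segment.  The first segment may be shorter than
 * 16 bytes when the AdvSIMD operation size is only 8 bytes.
 */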
754 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
755 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
756 {                                                                         \
757     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
758     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
759     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
760     intptr_t index = simd_data(desc);                                     \
761     TYPED *d = vd, *a = va;                                               \
762     TYPEN *n = vn;                                                        \
763     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
764     do {                                                                  \
765         TYPED m0 = m_indexed[i * 4 + 0];                                  \
766         TYPED m1 = m_indexed[i * 4 + 1];                                  \
767         TYPED m2 = m_indexed[i * 4 + 2];                                  \
768         TYPED m3 = m_indexed[i * 4 + 3];                                  \
769         do {                                                              \
770             d[i] = (a[i] +                                                \
771                     n[i * 4 + 0] * m0 +                                   \
772                     n[i * 4 + 1] * m1 +                                   \
773                     n[i * 4 + 2] * m2 +                                   \
774                     n[i * 4 + 3] * m3);                                   \
775         } while (++i < segend);                                           \
776         segend = i + 4;                                                   \
777     } while (i < opr_sz_n);                                               \
778     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
779 }
780 
781 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
782 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
783 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
784 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
785 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
786 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
787 
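/*
 * Floating-point complex add (FCADD).  Elements are (real, imag) pairs;
 * the single descriptor bit selects between the 90- and 270-degree
 * rotations by choosing whether the real or the imaginary part of the
 * second operand is negated before the add.
 */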
788 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
789                          void *vfpst, uint32_t desc)
790 {
791     uintptr_t opr_sz = simd_oprsz(desc);
792     float16 *d = vd;
793     float16 *n = vn;
794     float16 *m = vm;
795     float_status *fpst = vfpst;
796     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
797     uint32_t neg_imag = neg_real ^ 1;
798     uintptr_t i;
799 
800     /* Shift boolean to the sign bit so we can xor to negate.  */
801     neg_real <<= 15;
802     neg_imag <<= 15;
803 
804     for (i = 0; i < opr_sz / 2; i += 2) {
805         float16 e0 = n[H2(i)];
806         float16 e1 = m[H2(i + 1)] ^ neg_imag;
807         float16 e2 = n[H2(i + 1)];
808         float16 e3 = m[H2(i)] ^ neg_real;
809 
810         d[H2(i)] = float16_add(e0, e1, fpst);
811         d[H2(i + 1)] = float16_add(e2, e3, fpst);
812     }
813     clear_tail(d, opr_sz, simd_maxsz(desc));
814 }
815 
816 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
817                          void *vfpst, uint32_t desc)
818 {
819     uintptr_t opr_sz = simd_oprsz(desc);
820     float32 *d = vd;
821     float32 *n = vn;
822     float32 *m = vm;
823     float_status *fpst = vfpst;
824     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
825     uint32_t neg_imag = neg_real ^ 1;
826     uintptr_t i;
827 
828     /* Shift boolean to the sign bit so we can xor to negate.  */
829     neg_real <<= 31;
830     neg_imag <<= 31;
831 
832     for (i = 0; i < opr_sz / 4; i += 2) {
833         float32 e0 = n[H4(i)];
834         float32 e1 = m[H4(i + 1)] ^ neg_imag;
835         float32 e2 = n[H4(i + 1)];
836         float32 e3 = m[H4(i)] ^ neg_real;
837 
838         d[H4(i)] = float32_add(e0, e1, fpst);
839         d[H4(i + 1)] = float32_add(e2, e3, fpst);
840     }
841     clear_tail(d, opr_sz, simd_maxsz(desc));
842 }
843 
844 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
845                          void *vfpst, uint32_t desc)
846 {
847     uintptr_t opr_sz = simd_oprsz(desc);
848     float64 *d = vd;
849     float64 *n = vn;
850     float64 *m = vm;
851     float_status *fpst = vfpst;
852     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
853     uint64_t neg_imag = neg_real ^ 1;
854     uintptr_t i;
855 
856     /* Shift boolean to the sign bit so we can xor to negate.  */
857     neg_real <<= 63;
858     neg_imag <<= 63;
859 
860     for (i = 0; i < opr_sz / 8; i += 2) {
861         float64 e0 = n[i];
862         float64 e1 = m[i + 1] ^ neg_imag;
863         float64 e2 = n[i + 1];
864         float64 e3 = m[i] ^ neg_real;
865 
866         d[i] = float64_add(e0, e1, fpst);
867         d[i + 1] = float64_add(e2, e3, fpst);
868     }
869     clear_tail(d, opr_sz, simd_maxsz(desc));
870 }
871 
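/*
 * Floating-point complex multiply-accumulate (FCMLA).  Two descriptor
 * bits encode the rotation: 'flip' selects whether the real or the
 * imaginary element of each pair feeds the multiplies, and the derived
 * neg_real/neg_imag bits give the signs applied to the m operands.
 */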
872 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
873                          void *vfpst, uint32_t desc)
874 {
875     uintptr_t opr_sz = simd_oprsz(desc);
876     float16 *d = vd, *n = vn, *m = vm, *a = va;
877     float_status *fpst = vfpst;
878     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
879     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
880     uint32_t neg_real = flip ^ neg_imag;
881     uintptr_t i;
882 
883     /* Shift boolean to the sign bit so we can xor to negate.  */
884     neg_real <<= 15;
885     neg_imag <<= 15;
886 
887     for (i = 0; i < opr_sz / 2; i += 2) {
888         float16 e2 = n[H2(i + flip)];
889         float16 e1 = m[H2(i + flip)] ^ neg_real;
890         float16 e4 = e2;
891         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
892 
893         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
894         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
895     }
896     clear_tail(d, opr_sz, simd_maxsz(desc));
897 }
898 
899 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
900                              void *vfpst, uint32_t desc)
901 {
902     uintptr_t opr_sz = simd_oprsz(desc);
903     float16 *d = vd, *n = vn, *m = vm, *a = va;
904     float_status *fpst = vfpst;
905     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
906     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
907     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
908     uint32_t neg_real = flip ^ neg_imag;
909     intptr_t elements = opr_sz / sizeof(float16);
910     intptr_t eltspersegment = 16 / sizeof(float16);
911     intptr_t i, j;
912 
913     /* Shift boolean to the sign bit so we can xor to negate.  */
914     neg_real <<= 15;
915     neg_imag <<= 15;
916 
917     for (i = 0; i < elements; i += eltspersegment) {
918         float16 mr = m[H2(i + 2 * index + 0)];
919         float16 mi = m[H2(i + 2 * index + 1)];
920         float16 e1 = neg_real ^ (flip ? mi : mr);
921         float16 e3 = neg_imag ^ (flip ? mr : mi);
922 
923         for (j = i; j < i + eltspersegment; j += 2) {
924             float16 e2 = n[H2(j + flip)];
925             float16 e4 = e2;
926 
927             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
928             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
929         }
930     }
931     clear_tail(d, opr_sz, simd_maxsz(desc));
932 }
933 
934 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
935                          void *vfpst, uint32_t desc)
936 {
937     uintptr_t opr_sz = simd_oprsz(desc);
938     float32 *d = vd, *n = vn, *m = vm, *a = va;
939     float_status *fpst = vfpst;
940     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
941     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
942     uint32_t neg_real = flip ^ neg_imag;
943     uintptr_t i;
944 
945     /* Shift boolean to the sign bit so we can xor to negate.  */
946     neg_real <<= 31;
947     neg_imag <<= 31;
948 
949     for (i = 0; i < opr_sz / 4; i += 2) {
950         float32 e2 = n[H4(i + flip)];
951         float32 e1 = m[H4(i + flip)] ^ neg_real;
952         float32 e4 = e2;
953         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
954 
955         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
956         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
957     }
958     clear_tail(d, opr_sz, simd_maxsz(desc));
959 }
960 
961 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
962                              void *vfpst, uint32_t desc)
963 {
964     uintptr_t opr_sz = simd_oprsz(desc);
965     float32 *d = vd, *n = vn, *m = vm, *a = va;
966     float_status *fpst = vfpst;
967     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
968     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
969     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
970     uint32_t neg_real = flip ^ neg_imag;
971     intptr_t elements = opr_sz / sizeof(float32);
972     intptr_t eltspersegment = 16 / sizeof(float32);
973     intptr_t i, j;
974 
975     /* Shift boolean to the sign bit so we can xor to negate.  */
976     neg_real <<= 31;
977     neg_imag <<= 31;
978 
979     for (i = 0; i < elements; i += eltspersegment) {
980         float32 mr = m[H4(i + 2 * index + 0)];
981         float32 mi = m[H4(i + 2 * index + 1)];
982         float32 e1 = neg_real ^ (flip ? mi : mr);
983         float32 e3 = neg_imag ^ (flip ? mr : mi);
984 
985         for (j = i; j < i + eltspersegment; j += 2) {
986             float32 e2 = n[H4(j + flip)];
987             float32 e4 = e2;
988 
989             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
990             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
991         }
992     }
993     clear_tail(d, opr_sz, simd_maxsz(desc));
994 }
995 
996 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
997                          void *vfpst, uint32_t desc)
998 {
999     uintptr_t opr_sz = simd_oprsz(desc);
1000     float64 *d = vd, *n = vn, *m = vm, *a = va;
1001     float_status *fpst = vfpst;
1002     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1003     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1004     uint64_t neg_real = flip ^ neg_imag;
1005     uintptr_t i;
1006 
1007     /* Shift boolean to the sign bit so we can xor to negate.  */
1008     neg_real <<= 63;
1009     neg_imag <<= 63;
1010 
1011     for (i = 0; i < opr_sz / 8; i += 2) {
1012         float64 e2 = n[i + flip];
1013         float64 e1 = m[i + flip] ^ neg_real;
1014         float64 e4 = e2;
1015         float64 e3 = m[i + 1 - flip] ^ neg_imag;
1016 
1017         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
1018         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
1019     }
1020     clear_tail(d, opr_sz, simd_maxsz(desc));
1021 }
1022 
1023 /*
1024  * Floating point comparisons producing an integer result (all 1s or all 0s).
1025  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1026  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1027  */
1028 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1029 {
1030     return -float16_eq_quiet(op1, op2, stat);
1031 }
1032 
1033 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1034 {
1035     return -float32_eq_quiet(op1, op2, stat);
1036 }
1037 
1038 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1039 {
1040     return -float64_eq_quiet(op1, op2, stat);
1041 }
1042 
1043 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1044 {
1045     return -float16_le(op2, op1, stat);
1046 }
1047 
1048 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1049 {
1050     return -float32_le(op2, op1, stat);
1051 }
1052 
1053 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1054 {
1055     return -float64_le(op2, op1, stat);
1056 }
1057 
1058 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1059 {
1060     return -float16_lt(op2, op1, stat);
1061 }
1062 
1063 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1064 {
1065     return -float32_lt(op2, op1, stat);
1066 }
1067 
1068 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1069 {
1070     return -float64_lt(op2, op1, stat);
1071 }
1072 
1073 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1074 {
1075     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1076 }
1077 
1078 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1079 {
1080     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1081 }
1082 
1083 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1084 {
1085     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1086 }
1087 
1088 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1089 {
1090     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1091 }
1092 
1093 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1094 {
1095     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1096 }
1097 
1098 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1099 {
1100     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1101 }
1102 
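/*
 * Float16 to 16-bit integer conversions, rounding towards zero; NaN
 * inputs raise Invalid and convert to zero.
 */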
1103 static int16_t vfp_tosszh(float16 x, void *fpstp)
1104 {
1105     float_status *fpst = fpstp;
1106     if (float16_is_any_nan(x)) {
1107         float_raise(float_flag_invalid, fpst);
1108         return 0;
1109     }
1110     return float16_to_int16_round_to_zero(x, fpst);
1111 }
1112 
1113 static uint16_t vfp_touszh(float16 x, void *fpstp)
1114 {
1115     float_status *fpst = fpstp;
1116     if (float16_is_any_nan(x)) {
1117         float_raise(float_flag_invalid, fpst);
1118         return 0;
1119     }
1120     return float16_to_uint16_round_to_zero(x, fpst);
1121 }
1122 
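/* Expand a unary element-wise operation over a whole vector. */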
1123 #define DO_2OP(NAME, FUNC, TYPE) \
1124 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1125 {                                                                 \
1126     intptr_t i, oprsz = simd_oprsz(desc);                         \
1127     TYPE *d = vd, *n = vn;                                        \
1128     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1129         d[i] = FUNC(n[i], stat);                                  \
1130     }                                                             \
1131     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1132 }
1133 
1134 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1135 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1136 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1137 
1138 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1139 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1140 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1141 
1142 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1143 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1144 
1145 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1146 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1147 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1148 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1149 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1150 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1151 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1152 DO_2OP(gvec_touszh, vfp_touszh, float16)
1153 
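/*
 * Comparisons against zero, built from the two-operand helpers above:
 * FWD compares the operand against zero directly, while REV swaps the
 * arguments so that clt/cle can reuse the cgt/cge primitives.
 */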
1154 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1155     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1156     {                                                           \
1157         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1158     }
1159 
1160 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1161     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
1162     {                                                           \
1163         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1164     }
1165 
1166 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1167     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1168     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1169     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1170     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1171 
1172 DO_2OP_CMP0(cgt, cgt, FWD)
1173 DO_2OP_CMP0(cge, cge, FWD)
1174 DO_2OP_CMP0(ceq, ceq, FWD)
1175 DO_2OP_CMP0(clt, cgt, REV)
1176 DO_2OP_CMP0(cle, cge, REV)
1177 
1178 #undef DO_2OP
1179 #undef DO_2OP_CMP0
1180 
1181 /* Floating-point trigonometric starting value.
1182  * See the ARM ARM pseudocode function FPTrigSMul.
1183  */
1184 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1185 {
1186     float16 result = float16_mul(op1, op1, stat);
1187     if (!float16_is_any_nan(result)) {
1188         result = float16_set_sign(result, op2 & 1);
1189     }
1190     return result;
1191 }
1192 
1193 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1194 {
1195     float32 result = float32_mul(op1, op1, stat);
1196     if (!float32_is_any_nan(result)) {
1197         result = float32_set_sign(result, op2 & 1);
1198     }
1199     return result;
1200 }
1201 
1202 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1203 {
1204     float64 result = float64_mul(op1, op1, stat);
1205     if (!float64_is_any_nan(result)) {
1206         result = float64_set_sign(result, op2 & 1);
1207     }
1208     return result;
1209 }
1210 
1211 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1212 {
1213     return float16_abs(float16_sub(op1, op2, stat));
1214 }
1215 
1216 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1217 {
1218     return float32_abs(float32_sub(op1, op2, stat));
1219 }
1220 
1221 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1222 {
1223     return float64_abs(float64_sub(op1, op2, stat));
1224 }
1225 
1226 /*
1227  * Reciprocal step. These are the AArch32 versions, which use a
1228  * non-fused multiply-and-subtract.
1229  */
1230 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1231 {
1232     op1 = float16_squash_input_denormal(op1, stat);
1233     op2 = float16_squash_input_denormal(op2, stat);
1234 
1235     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1236         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1237         return float16_two;
1238     }
1239     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1240 }
1241 
1242 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1243 {
1244     op1 = float32_squash_input_denormal(op1, stat);
1245     op2 = float32_squash_input_denormal(op2, stat);
1246 
1247     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1248         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1249         return float32_two;
1250     }
1251     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1252 }
1253 
1254 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1255 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1256 {
1257     op1 = float16_squash_input_denormal(op1, stat);
1258     op2 = float16_squash_input_denormal(op2, stat);
1259 
1260     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1261         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1262         return float16_one_point_five;
1263     }
1264     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1265     return float16_div(op1, float16_two, stat);
1266 }
1267 
1268 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1269 {
1270     op1 = float32_squash_input_denormal(op1, stat);
1271     op2 = float32_squash_input_denormal(op2, stat);
1272 
1273     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1274         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1275         return float32_one_point_five;
1276     }
1277     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1278     return float32_div(op1, float32_two, stat);
1279 }
1280 
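/* Expand a binary element-wise operation over a whole vector. */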
1281 #define DO_3OP(NAME, FUNC, TYPE) \
1282 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1283 {                                                                          \
1284     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1285     TYPE *d = vd, *n = vn, *m = vm;                                        \
1286     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1287         d[i] = FUNC(n[i], m[i], stat);                                     \
1288     }                                                                      \
1289     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1290 }
1291 
1292 DO_3OP(gvec_fadd_h, float16_add, float16)
1293 DO_3OP(gvec_fadd_s, float32_add, float32)
1294 DO_3OP(gvec_fadd_d, float64_add, float64)
1295 
1296 DO_3OP(gvec_fsub_h, float16_sub, float16)
1297 DO_3OP(gvec_fsub_s, float32_sub, float32)
1298 DO_3OP(gvec_fsub_d, float64_sub, float64)
1299 
1300 DO_3OP(gvec_fmul_h, float16_mul, float16)
1301 DO_3OP(gvec_fmul_s, float32_mul, float32)
1302 DO_3OP(gvec_fmul_d, float64_mul, float64)
1303 
1304 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1305 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1306 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1307 
1308 DO_3OP(gvec_fabd_h, float16_abd, float16)
1309 DO_3OP(gvec_fabd_s, float32_abd, float32)
1310 DO_3OP(gvec_fabd_d, float64_abd, float64)
1311 
1312 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1313 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1314 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1315 
1316 DO_3OP(gvec_fcge_h, float16_cge, float16)
1317 DO_3OP(gvec_fcge_s, float32_cge, float32)
1318 DO_3OP(gvec_fcge_d, float64_cge, float64)
1319 
1320 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1321 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1322 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1323 
1324 DO_3OP(gvec_facge_h, float16_acge, float16)
1325 DO_3OP(gvec_facge_s, float32_acge, float32)
1326 DO_3OP(gvec_facge_d, float64_acge, float64)
1327 
1328 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1329 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1330 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1331 
1332 DO_3OP(gvec_fmax_h, float16_max, float16)
1333 DO_3OP(gvec_fmax_s, float32_max, float32)
1334 DO_3OP(gvec_fmax_d, float64_max, float64)
1335 
1336 DO_3OP(gvec_fmin_h, float16_min, float16)
1337 DO_3OP(gvec_fmin_s, float32_min, float32)
1338 DO_3OP(gvec_fmin_d, float64_min, float64)
1339 
1340 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1341 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1342 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1343 
1344 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1345 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1346 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1347 
1348 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1349 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1350 
1351 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1352 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1353 
1354 #ifdef TARGET_AARCH64
1355 DO_3OP(gvec_fdiv_h, float16_div, float16)
1356 DO_3OP(gvec_fdiv_s, float32_div, float32)
1357 DO_3OP(gvec_fdiv_d, float64_div, float64)
1358 
1359 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1360 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1361 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1362 
1363 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1364 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1365 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1366 
1367 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1368 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1369 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1370 
1371 #endif
1372 #undef DO_3OP
1373 
1374 /* Non-fused multiply-add (unlike float16_muladd etc., which are fused) */
1375 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1376                                  float_status *stat)
1377 {
1378     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1379 }
1380 
1381 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1382                                  float_status *stat)
1383 {
1384     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1385 }
1386 
1387 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1388                                  float_status *stat)
1389 {
1390     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1391 }
1392 
1393 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1394                                  float_status *stat)
1395 {
1396     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1397 }
1398 
1399 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1400 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1401                                 float_status *stat)
1402 {
1403     return float16_muladd(op1, op2, dest, 0, stat);
1404 }
1405 
1406 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1407                                  float_status *stat)
1408 {
1409     return float32_muladd(op1, op2, dest, 0, stat);
1410 }
1411 
1412 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1413                                  float_status *stat)
1414 {
1415     return float64_muladd(op1, op2, dest, 0, stat);
1416 }
1417 
1418 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1419                                  float_status *stat)
1420 {
1421     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1422 }
1423 
1424 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1425                                  float_status *stat)
1426 {
1427     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1428 }
1429 
1430 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1431                                  float_status *stat)
1432 {
1433     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1434 }
1435 
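/*
 * Expand a multiply-accumulate over a whole vector: each element becomes
 * FUNC(d[i], n[i], m[i]), so the destination is also a source.
 */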
1436 #define DO_MULADD(NAME, FUNC, TYPE)                                     \
1437 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1438 {                                                                          \
1439     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1440     TYPE *d = vd, *n = vn, *m = vm;                                        \
1441     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1442         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1443     }                                                                      \
1444     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1445 }
1446 
1447 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1448 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1449 
1450 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1451 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1452 
1453 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1454 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1455 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1456 
1457 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1458 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1459 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1460 
1461 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1462  * For AdvSIMD, there is of course only one such vector segment.
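 *
 * For example, with 32-bit elements and a 32-byte SVE vector, index 1
 * means lanes 0-3 all multiply by m[1] while lanes 4-7 use m[5].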
1463  */
1464 
1465 #define DO_MUL_IDX(NAME, TYPE, H) \
1466 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1467 {                                                                          \
1468     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1469     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1470     intptr_t idx = simd_data(desc);                                        \
1471     TYPE *d = vd, *n = vn, *m = vm;                                        \
1472     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1473         TYPE mm = m[H(i + idx)];                                           \
1474         for (j = 0; j < segment; j++) {                                    \
1475             d[i + j] = n[i + j] * mm;                                      \
1476         }                                                                  \
1477     }                                                                      \
1478     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1479 }
1480 
1481 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1482 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1483 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1484 
1485 #undef DO_MUL_IDX
1486 
1487 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1488 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1489 {                                                                          \
1490     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1491     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1492     intptr_t idx = simd_data(desc);                                        \
1493     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1494     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1495         TYPE mm = m[H(i + idx)];                                           \
1496         for (j = 0; j < segment; j++) {                                    \
1497             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1498         }                                                                  \
1499     }                                                                      \
1500     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1501 }
1502 
1503 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1504 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1505 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1506 
1507 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1508 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1509 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1510 
1511 #undef DO_MLA_IDX
1512 
1513 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1514 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1515 {                                                                          \
1516     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1517     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1518     intptr_t idx = simd_data(desc);                                        \
1519     TYPE *d = vd, *n = vn, *m = vm;                                        \
1520     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1521         TYPE mm = m[H(i + idx)];                                           \
1522         for (j = 0; j < segment; j++) {                                    \
1523             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1524         }                                                                  \
1525     }                                                                      \
1526     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1527 }
1528 
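     /*
      * For a plain multiply-by-element there is no addend, so ADD is a
      * "nop" that discards the addend and status arguments and returns
      * the product unchanged.
      */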
1529 #define nop(N, M, S) (M)
1530 
1531 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1532 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1533 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1534 
1535 #ifdef TARGET_AARCH64
1536 
1537 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1538 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1539 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1540 
1541 #endif
1542 
1543 #undef nop
1544 
1545 /*
1546  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1547  * the fused ops below they accumulate both from and into Vd.
1548  */
1549 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1550 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1551 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1552 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1553 
1554 #undef DO_FMUL_IDX
1555 
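     /*
      * desc layout for these helpers: bit SIMD_DATA_SHIFT selects negation
      * of op1 (FMLS rather than FMLA); the remaining data bits hold the
      * element index, applied per 128-bit segment as for the other
      * indexed helpers.
      */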
1556 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1557 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1558                   void *stat, uint32_t desc)                               \
1559 {                                                                          \
1560     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1561     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1562     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1563     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1564     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1565     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1566     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1567         TYPE mm = m[H(i + idx)];                                           \
1568         for (j = 0; j < segment; j++) {                                    \
1569             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1570                                      mm, a[i + j], 0, stat);               \
1571         }                                                                  \
1572     }                                                                      \
1573     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1574 }
1575 
1576 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1577 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1578 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1579 
1580 #undef DO_FMLA_IDX
1581 
1582 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1583 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1584 {                                                                          \
1585     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1586     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1587     bool q = false;                                                        \
1588     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1589         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1590         if (dd < MIN) {                                                    \
1591             dd = MIN;                                                      \
1592             q = true;                                                      \
1593         } else if (dd > MAX) {                                             \
1594             dd = MAX;                                                      \
1595             q = true;                                                      \
1596         }                                                                  \
1597         d[i] = dd;                                                         \
1598     }                                                                      \
1599     if (q) {                                                               \
1600         uint32_t *qc = vq;                                                 \
1601         qc[0] = 1;                                                         \
1602     }                                                                      \
1603     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1604 }
1605 
1606 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1607 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1608 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1609 
1610 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1611 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1612 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1613 
1614 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1615 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1616 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1617 
1618 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1619 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1620 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1621 
1622 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1623 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1624 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1625 
1626 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1627 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1628 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1629 
1630 #undef DO_SAT
1631 
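     /*
      * The 64-bit saturating ops are open-coded: rather than widening as
      * DO_SAT does, they detect overflow directly and saturate by hand.
      */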
1632 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1633                           void *vm, uint32_t desc)
1634 {
1635     intptr_t i, oprsz = simd_oprsz(desc);
1636     uint64_t *d = vd, *n = vn, *m = vm;
1637     bool q = false;
1638 
1639     for (i = 0; i < oprsz / 8; i++) {
1640         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1641         if (dd < nn) {
1642             dd = UINT64_MAX;
1643             q = true;
1644         }
1645         d[i] = dd;
1646     }
1647     if (q) {
1648         uint32_t *qc = vq;
1649         qc[0] = 1;
1650     }
1651     clear_tail(d, oprsz, simd_maxsz(desc));
1652 }
1653 
1654 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1655                           void *vm, uint32_t desc)
1656 {
1657     intptr_t i, oprsz = simd_oprsz(desc);
1658     uint64_t *d = vd, *n = vn, *m = vm;
1659     bool q = false;
1660 
1661     for (i = 0; i < oprsz / 8; i++) {
1662         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1663         if (nn < mm) {
1664             dd = 0;
1665             q = true;
1666         }
1667         d[i] = dd;
1668     }
1669     if (q) {
1670         uint32_t *qc = vq;
1671         qc[0] = 1;
1672     }
1673     clear_tail(d, oprsz, simd_maxsz(desc));
1674 }
1675 
1676 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1677                           void *vm, uint32_t desc)
1678 {
1679     intptr_t i, oprsz = simd_oprsz(desc);
1680     int64_t *d = vd, *n = vn, *m = vm;
1681     bool q = false;
1682 
1683     for (i = 0; i < oprsz / 8; i++) {
1684         int64_t nn = n[i], mm = m[i], dd = nn + mm;
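             /*
              * Signed overflow occurred iff both operands have the same
              * sign and the sum's sign differs; saturate towards the sign
              * of nn: INT64_MAX if nn >= 0, INT64_MIN if nn < 0.
              */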
1685         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1686             dd = (nn >> 63) ^ ~INT64_MIN;
1687             q = true;
1688         }
1689         d[i] = dd;
1690     }
1691     if (q) {
1692         uint32_t *qc = vq;
1693         qc[0] = 1;
1694     }
1695     clear_tail(d, oprsz, simd_maxsz(desc));
1696 }
1697 
1698 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1699                           void *vm, uint32_t desc)
1700 {
1701     intptr_t i, oprsz = simd_oprsz(desc);
1702     int64_t *d = vd, *n = vn, *m = vm;
1703     bool q = false;
1704 
1705     for (i = 0; i < oprsz / 8; i++) {
1706         int64_t nn = n[i], mm = m[i], dd = nn - mm;
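             /*
              * Overflow occurred iff the operands have differing signs and
              * the difference's sign differs from nn; saturate as for
              * gvec_sqadd_d above.
              */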
1707         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1708             dd = (nn >> 63) ^ ~INT64_MIN;
1709             q = true;
1710         }
1711         d[i] = dd;
1712     }
1713     if (q) {
1714         uint32_t *qc = vq;
1715         qc[0] = 1;
1716     }
1717     clear_tail(d, oprsz, simd_maxsz(desc));
1718 }
1719 
1720 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1721                            void *vm, uint32_t desc)
1722 {
1723     intptr_t i, oprsz = simd_oprsz(desc);
1724     uint64_t *d = vd, *n = vn, *m = vm;
1725     bool q = false;
1726 
1727     for (i = 0; i < oprsz / 8; i++) {
1728         uint64_t nn = n[i];
1729         int64_t mm = m[i];
1730         uint64_t dd = nn + mm;
1731 
1732         if (mm < 0) {
1733             if (nn < (uint64_t)-mm) {
1734                 dd = 0;
1735                 q = true;
1736             }
1737         } else {
1738             if (dd < nn) {
1739                 dd = UINT64_MAX;
1740                 q = true;
1741             }
1742         }
1743         d[i] = dd;
1744     }
1745     if (q) {
1746         uint32_t *qc = vq;
1747         qc[0] = 1;
1748     }
1749     clear_tail(d, oprsz, simd_maxsz(desc));
1750 }
1751 
1752 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1753                            void *vm, uint32_t desc)
1754 {
1755     intptr_t i, oprsz = simd_oprsz(desc);
1756     uint64_t *d = vd, *n = vn, *m = vm;
1757     bool q = false;
1758 
1759     for (i = 0; i < oprsz / 8; i++) {
1760         int64_t nn = n[i];
1761         uint64_t mm = m[i];
1762         int64_t dd = nn + mm;
1763 
1764         if (mm > (uint64_t)(INT64_MAX - nn)) {
1765             dd = INT64_MAX;
1766             q = true;
1767         }
1768         d[i] = dd;
1769     }
1770     if (q) {
1771         uint32_t *qc = vq;
1772         qc[0] = 1;
1773     }
1774     clear_tail(d, oprsz, simd_maxsz(desc));
1775 }
1776 
1777 #define DO_SRA(NAME, TYPE)                              \
1778 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1779 {                                                       \
1780     intptr_t i, oprsz = simd_oprsz(desc);               \
1781     int shift = simd_data(desc);                        \
1782     TYPE *d = vd, *n = vn;                              \
1783     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1784         d[i] += n[i] >> shift;                          \
1785     }                                                   \
1786     clear_tail(d, oprsz, simd_maxsz(desc));             \
1787 }
1788 
1789 DO_SRA(gvec_ssra_b, int8_t)
1790 DO_SRA(gvec_ssra_h, int16_t)
1791 DO_SRA(gvec_ssra_s, int32_t)
1792 DO_SRA(gvec_ssra_d, int64_t)
1793 
1794 DO_SRA(gvec_usra_b, uint8_t)
1795 DO_SRA(gvec_usra_h, uint16_t)
1796 DO_SRA(gvec_usra_s, uint32_t)
1797 DO_SRA(gvec_usra_d, uint64_t)
1798 
1799 #undef DO_SRA
1800 
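     /*
      * Rounding shift right: computes (n + (1 << (shift - 1))) >> shift
      * without needing a wider type, by shifting by one less and folding
      * the low (rounding) bit back in after the final shift.  E.g. with
      * shift == 2 and n == 6, tmp == 3 and the result is (3 >> 1) + 1 == 2.
      */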
1801 #define DO_RSHR(NAME, TYPE)                             \
1802 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1803 {                                                       \
1804     intptr_t i, oprsz = simd_oprsz(desc);               \
1805     int shift = simd_data(desc);                        \
1806     TYPE *d = vd, *n = vn;                              \
1807     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1808         TYPE tmp = n[i] >> (shift - 1);                 \
1809         d[i] = (tmp >> 1) + (tmp & 1);                  \
1810     }                                                   \
1811     clear_tail(d, oprsz, simd_maxsz(desc));             \
1812 }
1813 
1814 DO_RSHR(gvec_srshr_b, int8_t)
1815 DO_RSHR(gvec_srshr_h, int16_t)
1816 DO_RSHR(gvec_srshr_s, int32_t)
1817 DO_RSHR(gvec_srshr_d, int64_t)
1818 
1819 DO_RSHR(gvec_urshr_b, uint8_t)
1820 DO_RSHR(gvec_urshr_h, uint16_t)
1821 DO_RSHR(gvec_urshr_s, uint32_t)
1822 DO_RSHR(gvec_urshr_d, uint64_t)
1823 
1824 #undef DO_RSHR
1825 
1826 #define DO_RSRA(NAME, TYPE)                             \
1827 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1828 {                                                       \
1829     intptr_t i, oprsz = simd_oprsz(desc);               \
1830     int shift = simd_data(desc);                        \
1831     TYPE *d = vd, *n = vn;                              \
1832     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1833         TYPE tmp = n[i] >> (shift - 1);                 \
1834         d[i] += (tmp >> 1) + (tmp & 1);                 \
1835     }                                                   \
1836     clear_tail(d, oprsz, simd_maxsz(desc));             \
1837 }
1838 
1839 DO_RSRA(gvec_srsra_b, int8_t)
1840 DO_RSRA(gvec_srsra_h, int16_t)
1841 DO_RSRA(gvec_srsra_s, int32_t)
1842 DO_RSRA(gvec_srsra_d, int64_t)
1843 
1844 DO_RSRA(gvec_ursra_b, uint8_t)
1845 DO_RSRA(gvec_ursra_h, uint16_t)
1846 DO_RSRA(gvec_ursra_s, uint32_t)
1847 DO_RSRA(gvec_ursra_d, uint64_t)
1848 
1849 #undef DO_RSRA
1850 
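     /*
      * Shift right and insert: the top 'shift' bits of each destination
      * element are preserved, and n >> shift is deposited into the low bits.
      */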
1851 #define DO_SRI(NAME, TYPE)                              \
1852 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1853 {                                                       \
1854     intptr_t i, oprsz = simd_oprsz(desc);               \
1855     int shift = simd_data(desc);                        \
1856     TYPE *d = vd, *n = vn;                              \
1857     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1858         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1859     }                                                   \
1860     clear_tail(d, oprsz, simd_maxsz(desc));             \
1861 }
1862 
1863 DO_SRI(gvec_sri_b, uint8_t)
1864 DO_SRI(gvec_sri_h, uint16_t)
1865 DO_SRI(gvec_sri_s, uint32_t)
1866 DO_SRI(gvec_sri_d, uint64_t)
1867 
1868 #undef DO_SRI
1869 
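     /*
      * Shift left and insert: the low 'shift' bits of each destination
      * element are preserved, and n is deposited into the bits above them.
      */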
1870 #define DO_SLI(NAME, TYPE)                              \
1871 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1872 {                                                       \
1873     intptr_t i, oprsz = simd_oprsz(desc);               \
1874     int shift = simd_data(desc);                        \
1875     TYPE *d = vd, *n = vn;                              \
1876     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1877         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1878     }                                                   \
1879     clear_tail(d, oprsz, simd_maxsz(desc));             \
1880 }
1881 
1882 DO_SLI(gvec_sli_b, uint8_t)
1883 DO_SLI(gvec_sli_h, uint16_t)
1884 DO_SLI(gvec_sli_s, uint32_t)
1885 DO_SLI(gvec_sli_d, uint64_t)
1886 
1887 #undef DO_SLI
1888 
1889 /*
1890  * Convert float16 to float32, raising no exceptions and
1891  * preserving exceptional values, including SNaN.
1892  * This is effectively an unpack+repack operation.
1893  */
1894 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1895 {
1896     const int f16_bias = 15;
1897     const int f32_bias = 127;
1898     uint32_t sign = extract32(f16, 15, 1);
1899     uint32_t exp = extract32(f16, 10, 5);
1900     uint32_t frac = extract32(f16, 0, 10);
1901 
1902     if (exp == 0x1f) {
1903         /* Inf or NaN */
1904         exp = 0xff;
1905     } else if (exp == 0) {
1906         /* Zero or denormal.  */
1907         if (frac != 0) {
1908             if (fz16) {
1909                 frac = 0;
1910             } else {
1911                 /*
1912                  * Denormal; these are all normal float32.
1913                  * Shift the fraction so that the msb is at bit 11,
1914                  * then remove bit 11 as the implicit bit of the
1915                  * normalized float32.  Note that we still go through
1916                  * the shift for normal numbers below, to put the
1917                  * float32 fraction at the right place.
1918                  */
1919                 int shift = clz32(frac) - 21;
1920                 frac = (frac << shift) & 0x3ff;
1921                 exp = f32_bias - f16_bias - shift + 1;
1922             }
1923         }
1924     } else {
1925         /* Normal number; adjust the bias.  */
1926         exp += f32_bias - f16_bias;
1927     }
1928     sign <<= 31;
1929     exp <<= 23;
1930     frac <<= 23 - 10;
1931 
1932     return sign | exp | frac;
1933 }
1934 
1935 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
1936 {
1937     /*
1938      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
1939      * Load the 2nd qword iff is_q & is_2.
1940      * Shift to the 2nd dword iff !is_q & is_2.
1941      * For !is_q & !is_2, the upper bits of the result are garbage.
1942      */
1943     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1944 }
1945 
1946 /*
1947  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1948  * as there are not yet SVE versions that might use blocking.
1949  */
1950 
1951 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1952                      uint32_t desc, bool fz16)
1953 {
1954     intptr_t i, oprsz = simd_oprsz(desc);
1955     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1956     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1957     int is_q = oprsz == 16;
1958     uint64_t n_4, m_4;
1959 
1960     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1961     n_4 = load4_f16(vn, is_q, is_2);
1962     m_4 = load4_f16(vm, is_q, is_2);
1963 
1964     /* Negate all inputs for FMLSL at once.  */
1965     if (is_s) {
1966         n_4 ^= 0x8000800080008000ull;
1967     }
1968 
1969     for (i = 0; i < oprsz / 4; i++) {
1970         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1971         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1972         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1973     }
1974     clear_tail(d, oprsz, simd_maxsz(desc));
1975 }
1976 
1977 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1978                             void *venv, uint32_t desc)
1979 {
1980     CPUARMState *env = venv;
1981     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1982              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1983 }
1984 
1985 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1986                             void *venv, uint32_t desc)
1987 {
1988     CPUARMState *env = venv;
1989     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1990              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1991 }
1992 
1993 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1994                                void *venv, uint32_t desc)
1995 {
1996     intptr_t i, oprsz = simd_oprsz(desc);
1997     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1998     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1999     CPUARMState *env = venv;
2000     float_status *status = &env->vfp.fp_status;
2001     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2002 
2003     for (i = 0; i < oprsz; i += sizeof(float32)) {
2004         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2005         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2006         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2007         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2008         float32 aa = *(float32 *)(va + H1_4(i));
2009 
2010         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2011     }
2012 }
2013 
2014 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2015                          uint32_t desc, bool fz16)
2016 {
2017     intptr_t i, oprsz = simd_oprsz(desc);
2018     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2019     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2020     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2021     int is_q = oprsz == 16;
2022     uint64_t n_4;
2023     float32 m_1;
2024 
2025     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2026     n_4 = load4_f16(vn, is_q, is_2);
2027 
2028     /* Negate all inputs for FMLSL at once.  */
2029     if (is_s) {
2030         n_4 ^= 0x8000800080008000ull;
2031     }
2032 
2033     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2034 
2035     for (i = 0; i < oprsz / 4; i++) {
2036         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2037         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2038     }
2039     clear_tail(d, oprsz, simd_maxsz(desc));
2040 }
2041 
2042 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2043                                 void *venv, uint32_t desc)
2044 {
2045     CPUARMState *env = venv;
2046     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2047                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2048 }
2049 
2050 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2051                                 void *venv, uint32_t desc)
2052 {
2053     CPUARMState *env = venv;
2054     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
2055                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2056 }
2057 
2058 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2059                                void *venv, uint32_t desc)
2060 {
2061     intptr_t i, j, oprsz = simd_oprsz(desc);
2062     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2063     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2064     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2065     CPUARMState *env = venv;
2066     float_status *status = &env->vfp.fp_status;
2067     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2068 
2069     for (i = 0; i < oprsz; i += 16) {
2070         float16 mm_16 = *(float16 *)(vm + i + idx);
2071         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2072 
2073         for (j = 0; j < 16; j += sizeof(float32)) {
2074             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2075             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2076             float32 aa = *(float32 *)(va + H1_4(i + j));
2077 
2078             *(float32 *)(vd + H1_4(i + j)) =
2079                 float32_muladd(nn, mm, aa, 0, status);
2080         }
2081     }
2082 }
2083 
2084 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2085 {
2086     intptr_t i, opr_sz = simd_oprsz(desc);
2087     int8_t *d = vd, *n = vn, *m = vm;
2088 
2089     for (i = 0; i < opr_sz; ++i) {
2090         int8_t mm = m[i];
2091         int8_t nn = n[i];
2092         int8_t res = 0;
2093         if (mm >= 0) {
2094             if (mm < 8) {
2095                 res = nn << mm;
2096             }
2097         } else {
2098             res = nn >> (mm > -8 ? -mm : 7);
2099         }
2100         d[i] = res;
2101     }
2102     clear_tail(d, opr_sz, simd_maxsz(desc));
2103 }
2104 
2105 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2106 {
2107     intptr_t i, opr_sz = simd_oprsz(desc);
2108     int16_t *d = vd, *n = vn, *m = vm;
2109 
2110     for (i = 0; i < opr_sz / 2; ++i) {
2111         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2112         int16_t nn = n[i];
2113         int16_t res = 0;
2114         if (mm >= 0) {
2115             if (mm < 16) {
2116                 res = nn << mm;
2117             }
2118         } else {
2119             res = nn >> (mm > -16 ? -mm : 15);
2120         }
2121         d[i] = res;
2122     }
2123     clear_tail(d, opr_sz, simd_maxsz(desc));
2124 }
2125 
2126 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2127 {
2128     intptr_t i, opr_sz = simd_oprsz(desc);
2129     uint8_t *d = vd, *n = vn, *m = vm;
2130 
2131     for (i = 0; i < opr_sz; ++i) {
2132         int8_t mm = m[i];
2133         uint8_t nn = n[i];
2134         uint8_t res = 0;
2135         if (mm >= 0) {
2136             if (mm < 8) {
2137                 res = nn << mm;
2138             }
2139         } else {
2140             if (mm > -8) {
2141                 res = nn >> -mm;
2142             }
2143         }
2144         d[i] = res;
2145     }
2146     clear_tail(d, opr_sz, simd_maxsz(desc));
2147 }
2148 
2149 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2150 {
2151     intptr_t i, opr_sz = simd_oprsz(desc);
2152     uint16_t *d = vd, *n = vn, *m = vm;
2153 
2154     for (i = 0; i < opr_sz / 2; ++i) {
2155         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2156         uint16_t nn = n[i];
2157         uint16_t res = 0;
2158         if (mm >= 0) {
2159             if (mm < 16) {
2160                 res = nn << mm;
2161             }
2162         } else {
2163             if (mm > -16) {
2164                 res = nn >> -mm;
2165             }
2166         }
2167         d[i] = res;
2168     }
2169     clear_tail(d, opr_sz, simd_maxsz(desc));
2170 }
2171 
2172 /*
2173  * 8x8->8 polynomial multiply.
2174  *
2175  * Polynomial multiplication is like integer multiplication except the
2176  * partial products are XORed, not added.
2177  *
2178  * TODO: expose this as a generic vector operation, as it is a common
2179  * crypto building block.
2180  */
2181 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2182 {
2183     intptr_t i, opr_sz = simd_oprsz(desc);
2184     uint64_t *d = vd, *n = vn, *m = vm;
2185 
2186     for (i = 0; i < opr_sz / 8; ++i) {
2187         d[i] = clmul_8x8_low(n[i], m[i]);
2188     }
2189     clear_tail(d, opr_sz, simd_maxsz(desc));
2190 }
2191 
2192 /*
2193  * 64x64->128 polynomial multiply.
2194  * Because the lanes are not accessed in strict columns,
2195  * this probably cannot be turned into a generic helper.
2196  */
2197 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2198 {
2199     intptr_t i, opr_sz = simd_oprsz(desc);
2200     intptr_t hi = simd_data(desc);
2201     uint64_t *d = vd, *n = vn, *m = vm;
2202 
2203     for (i = 0; i < opr_sz / 8; i += 2) {
2204         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2205         d[i] = int128_getlo(r);
2206         d[i + 1] = int128_gethi(r);
2207     }
2208     clear_tail(d, opr_sz, simd_maxsz(desc));
2209 }
2210 
2211 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2212 {
2213     int hi = simd_data(desc);
2214     uint64_t *d = vd, *n = vn, *m = vm;
2215     uint64_t nn = n[hi], mm = m[hi];
2216 
2217     d[0] = clmul_8x4_packed(nn, mm);
2218     nn >>= 32;
2219     mm >>= 32;
2220     d[1] = clmul_8x4_packed(nn, mm);
2221 
2222     clear_tail(d, 16, simd_maxsz(desc));
2223 }
2224 
2225 #ifdef TARGET_AARCH64
2226 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2227 {
2228     int shift = simd_data(desc) * 8;
2229     intptr_t i, opr_sz = simd_oprsz(desc);
2230     uint64_t *d = vd, *n = vn, *m = vm;
2231 
2232     for (i = 0; i < opr_sz / 8; ++i) {
2233         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2234     }
2235 }
2236 
2237 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2238 {
2239     intptr_t sel = H4(simd_data(desc));
2240     intptr_t i, opr_sz = simd_oprsz(desc);
2241     uint32_t *n = vn, *m = vm;
2242     uint64_t *d = vd;
2243 
2244     for (i = 0; i < opr_sz / 8; ++i) {
2245         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2246     }
2247 }
2248 #endif
2249 
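     /*
      * Compare each element against zero; -(nn OP 0) yields an all-ones
      * mask when the comparison is true and zero when it is false.
      */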
2250 #define DO_CMP0(NAME, TYPE, OP)                         \
2251 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2252 {                                                       \
2253     intptr_t i, opr_sz = simd_oprsz(desc);              \
2254     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2255         TYPE nn = *(TYPE *)(vn + i);                    \
2256         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2257     }                                                   \
2258     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2259 }
2260 
2261 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2262 DO_CMP0(gvec_clt0_b, int8_t, <)
2263 DO_CMP0(gvec_cle0_b, int8_t, <=)
2264 DO_CMP0(gvec_cgt0_b, int8_t, >)
2265 DO_CMP0(gvec_cge0_b, int8_t, >=)
2266 
2267 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2268 DO_CMP0(gvec_clt0_h, int16_t, <)
2269 DO_CMP0(gvec_cle0_h, int16_t, <=)
2270 DO_CMP0(gvec_cgt0_h, int16_t, >)
2271 DO_CMP0(gvec_cge0_h, int16_t, >=)
2272 
2273 #undef DO_CMP0
2274 
2275 #define DO_ABD(NAME, TYPE)                                      \
2276 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2277 {                                                               \
2278     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2279     TYPE *d = vd, *n = vn, *m = vm;                             \
2280                                                                 \
2281     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2282         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2283     }                                                           \
2284     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2285 }
2286 
2287 DO_ABD(gvec_sabd_b, int8_t)
2288 DO_ABD(gvec_sabd_h, int16_t)
2289 DO_ABD(gvec_sabd_s, int32_t)
2290 DO_ABD(gvec_sabd_d, int64_t)
2291 
2292 DO_ABD(gvec_uabd_b, uint8_t)
2293 DO_ABD(gvec_uabd_h, uint16_t)
2294 DO_ABD(gvec_uabd_s, uint32_t)
2295 DO_ABD(gvec_uabd_d, uint64_t)
2296 
2297 #undef DO_ABD
2298 
2299 #define DO_ABA(NAME, TYPE)                                      \
2300 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2301 {                                                               \
2302     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2303     TYPE *d = vd, *n = vn, *m = vm;                             \
2304                                                                 \
2305     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2306         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2307     }                                                           \
2308     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2309 }
2310 
2311 DO_ABA(gvec_saba_b, int8_t)
2312 DO_ABA(gvec_saba_h, int16_t)
2313 DO_ABA(gvec_saba_s, int32_t)
2314 DO_ABA(gvec_saba_d, int64_t)
2315 
2316 DO_ABA(gvec_uaba_b, uint8_t)
2317 DO_ABA(gvec_uaba_h, uint16_t)
2318 DO_ABA(gvec_uaba_s, uint32_t)
2319 DO_ABA(gvec_uaba_d, uint64_t)
2320 
2321 #undef DO_ABA
2322 
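     /*
      * Pairwise operations: the low half of the result is formed from
      * adjacent pairs of n, the high half from adjacent pairs of m.  A
      * scratch copy is needed when d == m, since m is still read after
      * the low half of d has been written; d may safely alias n because
      * each n element is read before the store that could overwrite it.
      */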
2323 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2324 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
2325 {                                                                          \
2326     ARMVectorReg scratch;                                                  \
2327     intptr_t oprsz = simd_oprsz(desc);                                     \
2328     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2329     TYPE *d = vd, *n = vn, *m = vm;                                        \
2330     if (unlikely(d == m)) {                                                \
2331         m = memcpy(&scratch, m, oprsz);                                    \
2332     }                                                                      \
2333     for (intptr_t i = 0; i < half; ++i) {                                  \
2334         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2335     }                                                                      \
2336     for (intptr_t i = 0; i < half; ++i) {                                  \
2337         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2338     }                                                                      \
2339     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2340 }
2341 
2342 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2343 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2344 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2345 
2346 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2347 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2348 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2349 
2350 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2351 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2352 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2353 
2354 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2355 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2356 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2357 
2358 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2359 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2360 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2361 
2362 #undef DO_3OP_PAIR
2363 
2364 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2365 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2366 {                                                               \
2367     ARMVectorReg scratch;                                       \
2368     intptr_t oprsz = simd_oprsz(desc);                          \
2369     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2370     TYPE *d = vd, *n = vn, *m = vm;                             \
2371     if (unlikely(d == m)) {                                     \
2372         m = memcpy(&scratch, m, oprsz);                         \
2373     }                                                           \
2374     for (intptr_t i = 0; i < half; ++i) {                       \
2375         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2376     }                                                           \
2377     for (intptr_t i = 0; i < half; ++i) {                       \
2378         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2379     }                                                           \
2380     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2381 }
2382 
2383 #define ADD(A, B) (A + B)
2384 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2385 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2386 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2387 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2388 #undef  ADD
2389 
2390 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2391 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2392 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2393 
2394 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2395 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2396 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2397 
2398 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2399 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2400 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2401 
2402 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2403 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2404 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2405 
2406 #undef DO_3OP_PAIR
2407 
2408 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2409     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2410     {                                                                   \
2411         intptr_t i, oprsz = simd_oprsz(desc);                           \
2412         int shift = simd_data(desc);                                    \
2413         TYPE *d = vd, *n = vn;                                          \
2414         float_status *fpst = stat;                                      \
2415         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2416             d[i] = FUNC(n[i], shift, fpst);                             \
2417         }                                                               \
2418         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2419     }
2420 
2421 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2422 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2423 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2424 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2425 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2426 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2427 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2428 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2429 
2430 #undef DO_VCVT_FIXED
2431 
2432 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2433     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2434     {                                                                   \
2435         float_status *fpst = stat;                                      \
2436         intptr_t i, oprsz = simd_oprsz(desc);                           \
2437         uint32_t rmode = simd_data(desc);                               \
2438         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2439         TYPE *d = vd, *n = vn;                                          \
2440         set_float_rounding_mode(rmode, fpst);                           \
2441         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2442             d[i] = FUNC(n[i], 0, fpst);                                 \
2443         }                                                               \
2444         set_float_rounding_mode(prev_rmode, fpst);                      \
2445         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2446     }
2447 
2448 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2449 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2450 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2451 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2452 
2453 #undef DO_VCVT_RMODE
2454 
2455 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2456     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2457     {                                                                   \
2458         float_status *fpst = stat;                                      \
2459         intptr_t i, oprsz = simd_oprsz(desc);                           \
2460         uint32_t rmode = simd_data(desc);                               \
2461         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2462         TYPE *d = vd, *n = vn;                                          \
2463         set_float_rounding_mode(rmode, fpst);                           \
2464         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2465             d[i] = FUNC(n[i], fpst);                                    \
2466         }                                                               \
2467         set_float_rounding_mode(prev_rmode, fpst);                      \
2468         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2469     }
2470 
2471 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2472 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2473 
2474 #undef DO_VRINT_RMODE
2475 
2476 #ifdef TARGET_AARCH64
2477 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2478 {
2479     const uint8_t *indices = vm;
2480     CPUARMState *env = venv;
2481     size_t oprsz = simd_oprsz(desc);
2482     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2483     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2484     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2485     union {
2486         uint8_t b[16];
2487         uint64_t d[2];
2488     } result;
2489 
2490     /*
2491      * We must construct the final result in a temp, lest the output
2492      * overlap the input table.  For TBL, begin with zero; for TBX,
2493      * begin with the original register contents.  Note that we always
2494      * copy 16 bytes here to avoid an extra branch; clearing the high
2495      * bits of the register for oprsz == 8 is handled below.
2496      */
2497     if (is_tbx) {
2498         memcpy(&result, vd, 16);
2499     } else {
2500         memset(&result, 0, 16);
2501     }
2502 
2503     for (size_t i = 0; i < oprsz; ++i) {
2504         uint32_t index = indices[H1(i)];
2505 
2506         if (index < table_len) {
2507             /*
2508              * Convert index (a byte offset into the virtual table
2509              * which is a series of 128-bit vectors concatenated)
2510              * into the correct register element, bearing in mind
2511              * that the table can wrap around from V31 to V0.
2512              */
2513             const uint8_t *table = (const uint8_t *)
2514                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2515             result.b[H1(i)] = table[H1(index % 16)];
2516         }
2517     }
2518 
2519     memcpy(vd, &result, 16);
2520     clear_tail(vd, oprsz, simd_maxsz(desc));
2521 }
2522 #endif
2523 
2524 /*
2525  * NxN -> N highpart multiply
2526  *
2527  * TODO: expose this as a generic vector operation.
2528  */
2529 
2530 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2531 {
2532     intptr_t i, opr_sz = simd_oprsz(desc);
2533     int8_t *d = vd, *n = vn, *m = vm;
2534 
2535     for (i = 0; i < opr_sz; ++i) {
2536         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2537     }
2538     clear_tail(d, opr_sz, simd_maxsz(desc));
2539 }
2540 
2541 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2542 {
2543     intptr_t i, opr_sz = simd_oprsz(desc);
2544     int16_t *d = vd, *n = vn, *m = vm;
2545 
2546     for (i = 0; i < opr_sz / 2; ++i) {
2547         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2548     }
2549     clear_tail(d, opr_sz, simd_maxsz(desc));
2550 }
2551 
2552 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2553 {
2554     intptr_t i, opr_sz = simd_oprsz(desc);
2555     int32_t *d = vd, *n = vn, *m = vm;
2556 
2557     for (i = 0; i < opr_sz / 4; ++i) {
2558         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2559     }
2560     clear_tail(d, opr_sz, simd_maxsz(desc));
2561 }
2562 
2563 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2564 {
2565     intptr_t i, opr_sz = simd_oprsz(desc);
2566     uint64_t *d = vd, *n = vn, *m = vm;
2567     uint64_t discard;
2568 
2569     for (i = 0; i < opr_sz / 8; ++i) {
2570         muls64(&discard, &d[i], n[i], m[i]);
2571     }
2572     clear_tail(d, opr_sz, simd_maxsz(desc));
2573 }
2574 
2575 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2576 {
2577     intptr_t i, opr_sz = simd_oprsz(desc);
2578     uint8_t *d = vd, *n = vn, *m = vm;
2579 
2580     for (i = 0; i < opr_sz; ++i) {
2581         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2582     }
2583     clear_tail(d, opr_sz, simd_maxsz(desc));
2584 }
2585 
2586 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2587 {
2588     intptr_t i, opr_sz = simd_oprsz(desc);
2589     uint16_t *d = vd, *n = vn, *m = vm;
2590 
2591     for (i = 0; i < opr_sz / 2; ++i) {
2592         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2593     }
2594     clear_tail(d, opr_sz, simd_maxsz(desc));
2595 }
2596 
2597 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2598 {
2599     intptr_t i, opr_sz = simd_oprsz(desc);
2600     uint32_t *d = vd, *n = vn, *m = vm;
2601 
2602     for (i = 0; i < opr_sz / 4; ++i) {
2603         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2604     }
2605     clear_tail(d, opr_sz, simd_maxsz(desc));
2606 }
2607 
2608 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2609 {
2610     intptr_t i, opr_sz = simd_oprsz(desc);
2611     uint64_t *d = vd, *n = vn, *m = vm;
2612     uint64_t discard;
2613 
2614     for (i = 0; i < opr_sz / 8; ++i) {
2615         mulu64(&discard, &d[i], n[i], m[i]);
2616     }
2617     clear_tail(d, opr_sz, simd_maxsz(desc));
2618 }
2619 
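     /*
      * XAR: exclusive-or the two inputs, then rotate each 64-bit element
      * right by the immediate rotate amount held in simd_data(desc).
      */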
2620 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2621 {
2622     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2623     int shr = simd_data(desc);
2624     uint64_t *d = vd, *n = vn, *m = vm;
2625 
2626     for (i = 0; i < opr_sz; ++i) {
2627         d[i] = ror64(n[i] ^ m[i], shr);
2628     }
2629     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2630 }
2631 
2632 /*
2633  * Integer matrix-multiply accumulate
2634  */
2635 
2636 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2637 {
2638     int8_t *n = vn, *m = vm;
2639 
2640     for (intptr_t k = 0; k < 8; ++k) {
2641         sum += n[H1(k)] * m[H1(k)];
2642     }
2643     return sum;
2644 }
2645 
2646 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2647 {
2648     uint8_t *n = vn, *m = vm;
2649 
2650     for (intptr_t k = 0; k < 8; ++k) {
2651         sum += n[H1(k)] * m[H1(k)];
2652     }
2653     return sum;
2654 }
2655 
2656 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2657 {
2658     uint8_t *n = vn;
2659     int8_t *m = vm;
2660 
2661     for (intptr_t k = 0; k < 8; ++k) {
2662         sum += n[H1(k)] * m[H1(k)];
2663     }
2664     return sum;
2665 }
2666 
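     /*
      * Each 16-byte segment of n and m holds a 2x8 matrix of bytes; the
      * matching segment of d and a holds a 2x2 matrix of 32-bit
      * accumulators.  Element (i, j) of the result is a[i][j] plus the
      * dot product of row i of n with row j of m, as computed by the
      * inner_loop callback.
      */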
2667 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2668                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2669 {
2670     intptr_t seg, opr_sz = simd_oprsz(desc);
2671 
2672     for (seg = 0; seg < opr_sz; seg += 16) {
2673         uint32_t *d = vd + seg;
2674         uint32_t *a = va + seg;
2675         uint32_t sum0, sum1, sum2, sum3;
2676 
2677         /*
2678          * Process the entire segment at once, writing back the
2679          * results only after we've consumed all of the inputs.
2680          *
2681          * Key to indices by column:
2682          *          i   j                  i             j
2683          */
2684         sum0 = a[H4(0 + 0)];
2685         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2686         sum1 = a[H4(0 + 1)];
2687         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2688         sum2 = a[H4(2 + 0)];
2689         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2690         sum3 = a[H4(2 + 1)];
2691         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2692 
2693         d[H4(0)] = sum0;
2694         d[H4(1)] = sum1;
2695         d[H4(2)] = sum2;
2696         d[H4(3)] = sum3;
2697     }
2698     clear_tail(vd, opr_sz, simd_maxsz(desc));
2699 }
2700 
2701 #define DO_MMLA_B(NAME, INNER) \
2702     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2703     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2704 
2705 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2706 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2707 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2708 
2709 /*
2710  * BFloat16 Dot Product
2711  */
2712 
2713 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2714 {
2715     /* FPCR is ignored for BFDOT and BFMMLA. */
2716     float_status bf_status = {
2717         .tininess_before_rounding = float_tininess_before_rounding,
2718         .float_rounding_mode = float_round_to_odd_inf,
2719         .flush_to_zero = true,
2720         .flush_inputs_to_zero = true,
2721         .default_nan_mode = true,
2722     };
2723     float32 t1, t2;
2724 
2725     /*
2726      * Extract each BFloat16 from the element pair, and shift
2727      * them such that they become float32.
2728      */
2729     t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2730     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2731     t1 = float32_add(t1, t2, &bf_status);
2732     t1 = float32_add(sum, t1, &bf_status);
2733 
2734     return t1;
2735 }
2736 
2737 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2738 {
2739     intptr_t i, opr_sz = simd_oprsz(desc);
2740     float32 *d = vd, *a = va;
2741     uint32_t *n = vn, *m = vm;
2742 
2743     for (i = 0; i < opr_sz / 4; ++i) {
2744         d[i] = bfdotadd(a[i], n[i], m[i]);
2745     }
2746     clear_tail(d, opr_sz, simd_maxsz(desc));
2747 }
2748 
2749 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2750                             void *va, uint32_t desc)
2751 {
2752     intptr_t i, j, opr_sz = simd_oprsz(desc);
2753     intptr_t index = simd_data(desc);
2754     intptr_t elements = opr_sz / 4;
2755     intptr_t eltspersegment = MIN(16 / 4, elements);
2756     float32 *d = vd, *a = va;
2757     uint32_t *n = vn, *m = vm;
2758 
2759     for (i = 0; i < elements; i += eltspersegment) {
2760         uint32_t m_idx = m[i + H4(index)];
2761 
2762         for (j = i; j < i + eltspersegment; j++) {
2763             d[j] = bfdotadd(a[j], n[j], m_idx);
2764         }
2765     }
2766     clear_tail(d, opr_sz, simd_maxsz(desc));
2767 }
2768 
2769 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2770 {
2771     intptr_t s, opr_sz = simd_oprsz(desc);
2772     float32 *d = vd, *a = va;
2773     uint32_t *n = vn, *m = vm;
2774 
2775     for (s = 0; s < opr_sz / 4; s += 4) {
2776         float32 sum00, sum01, sum10, sum11;
2777 
2778         /*
2779          * Process the entire segment at once, writing back the
2780          * results only after we've consumed all of the inputs.
2781          *
2782          * Key to indices by column:
2783          *               i   j           i   k             j   k
2784          */
2785         sum00 = a[s + H4(0 + 0)];
2786         sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2787         sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2788 
2789         sum01 = a[s + H4(0 + 1)];
2790         sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2791         sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2792 
2793         sum10 = a[s + H4(2 + 0)];
2794         sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2795         sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2796 
2797         sum11 = a[s + H4(2 + 1)];
2798         sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2799         sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2800 
2801         d[s + H4(0 + 0)] = sum00;
2802         d[s + H4(0 + 1)] = sum01;
2803         d[s + H4(2 + 0)] = sum10;
2804         d[s + H4(2 + 1)] = sum11;
2805     }
2806     clear_tail(d, opr_sz, simd_maxsz(desc));
2807 }
2808 
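     /*
      * BFMLAL: widen the even (sel == 0) or odd (sel == 1) numbered
      * bfloat16 elements of n and m to float32 by placing their bits in
      * the high half of the word (exact for bfloat16), then fused
      * multiply-add into the float32 accumulator.
      */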
2809 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2810                          void *stat, uint32_t desc)
2811 {
2812     intptr_t i, opr_sz = simd_oprsz(desc);
2813     intptr_t sel = simd_data(desc);
2814     float32 *d = vd, *a = va;
2815     bfloat16 *n = vn, *m = vm;
2816 
2817     for (i = 0; i < opr_sz / 4; ++i) {
2818         float32 nn = n[H2(i * 2 + sel)] << 16;
2819         float32 mm = m[H2(i * 2 + sel)] << 16;
2820         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2821     }
2822     clear_tail(d, opr_sz, simd_maxsz(desc));
2823 }
2824 
2825 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2826                              void *va, void *stat, uint32_t desc)
2827 {
2828     intptr_t i, j, opr_sz = simd_oprsz(desc);
2829     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2830     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2831     intptr_t elements = opr_sz / 4;
2832     intptr_t eltspersegment = MIN(16 / 4, elements);
2833     float32 *d = vd, *a = va;
2834     bfloat16 *n = vn, *m = vm;
2835 
2836     for (i = 0; i < elements; i += eltspersegment) {
2837         float32 m_idx = m[H2(2 * i + index)] << 16;
2838 
2839         for (j = i; j < i + eltspersegment; j++) {
2840             float32 n_j = n[H2(2 * j + sel)] << 16;
2841             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2842         }
2843     }
2844     clear_tail(d, opr_sz, simd_maxsz(desc));
2845 }
2846 
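     /*
      * Element-wise clamp: raise each element of a to at least n[i], then
      * cap it at m[i], i.e. MIN(MAX(a, n), m).
      */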
2847 #define DO_CLAMP(NAME, TYPE) \
2848 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
2849 {                                                                       \
2850     intptr_t i, opr_sz = simd_oprsz(desc);                              \
2851     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
2852         TYPE aa = *(TYPE *)(a + i);                                     \
2853         TYPE nn = *(TYPE *)(n + i);                                     \
2854         TYPE mm = *(TYPE *)(m + i);                                     \
2855         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
2856         *(TYPE *)(d + i) = dd;                                          \
2857     }                                                                   \
2858     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
2859 }
2860 
2861 DO_CLAMP(gvec_sclamp_b, int8_t)
2862 DO_CLAMP(gvec_sclamp_h, int16_t)
2863 DO_CLAMP(gvec_sclamp_s, int32_t)
2864 DO_CLAMP(gvec_sclamp_d, int64_t)
2865 
2866 DO_CLAMP(gvec_uclamp_b, uint8_t)
2867 DO_CLAMP(gvec_uclamp_h, uint16_t)
2868 DO_CLAMP(gvec_uclamp_s, uint32_t)
2869 DO_CLAMP(gvec_uclamp_d, uint64_t)
2870