xref: /openbmc/qemu/target/arm/tcg/vec_helper.c (revision a5b72ccc)
/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
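
/*
 * Both tables are indexed by a predicate byte; for example,
 * expand_pred_b_data[0x05] is 0x0000000000ff00ff, marking byte
 * elements 0 and 2 as active.
 */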

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
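
/*
 * Worked example: do_sqrdmlah_b(-128, -128, 0, false, true) computes
 * ((-128 * -128) << 1) + (1 << 7) = 0x8080; the high byte is +128,
 * which does not fit in int8_t, so the result saturates to INT8_MAX.
 */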

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
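
/*
 * Note that *sat is sticky: it is only ever set, never cleared, so the
 * Neon helpers below can point it directly at env->vfp.qc[0], which
 * backs the cumulative saturation (QC) flag.
 */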

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

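    /*
     * The 128-bit value fits in int64_t iff the high half is the
     * sign extension of the low half.
     */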
    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8- and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
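
/*
 * In the mixed-signedness case (gvec_usdot_b), the TYPED cast widens
 * the unsigned n element while the int8_t m element is sign-extended
 * by the usual arithmetic conversions, giving the unsigned-by-signed
 * products that USDOT requires.
 */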

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + 4;                                                   \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}
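
/*
 * Illustration: with a 32-byte operand the outer loop reloads m0..m3
 * at each 16-byte segment boundary, so the first four TYPED lanes take
 * their m values from the group selected by 'index' within the first
 * segment, and the next four from the same group of the second.
 */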

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

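/*
 * FCADD: for each complex pair, d[2k] = n[2k] +/- m[2k + 1] and
 * d[2k + 1] = n[2k + 1] -/+ m[2k]; the single desc bit selects which
 * of the two rotations (90 or 270 degrees) is applied.
 */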
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

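/*
 * FCMLA: d = a + n * m, one step of a complex multiply per pair.
 * 'flip' selects whether the real or imaginary element of n feeds
 * both products, and neg_real/neg_imag supply the signs; the two
 * desc bits together cover the four rotations (0/90/180/270).
 */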
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}
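
/*
 * These are the Newton-Raphson refinement steps: recps returns
 * 2 - op1 * op2, for iterating toward 1/x, and rsqrts returns
 * (3 - op1 * op2) / 2, for iterating toward 1/sqrt(x).
 */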

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/*
 * Non-fused multiply-add (unlike float16_muladd etc, which are fused):
 * the intermediate product is rounded before the addition.
 */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE)                                     \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
1472 
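/*
 * Worked example (illustrative): for 32-bit elements and oprsz == 32,
 * i.e. an SVE vector of two 128-bit segments, segment == 4.  With
 * idx == 1 (and the H4 byte-swap adjustment aside), the multiplier is
 * m[1] for elements 0..3 and m[5] for elements 4..7.
 */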
1473 #define DO_MUL_IDX(NAME, TYPE, H) \
1474 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1475 {                                                                          \
1476     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1477     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1478     intptr_t idx = simd_data(desc);                                        \
1479     TYPE *d = vd, *n = vn, *m = vm;                                        \
1480     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1481         TYPE mm = m[H(i + idx)];                                           \
1482         for (j = 0; j < segment; j++) {                                    \
1483             d[i + j] = n[i + j] * mm;                                      \
1484         }                                                                  \
1485     }                                                                      \
1486     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1487 }
1488 
1489 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1490 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1491 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1492 
1493 #undef DO_MUL_IDX
1494 
1495 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1496 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1497 {                                                                          \
1498     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1499     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1500     intptr_t idx = simd_data(desc);                                        \
1501     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1502     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1503         TYPE mm = m[H(i + idx)];                                           \
1504         for (j = 0; j < segment; j++) {                                    \
1505             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1506         }                                                                  \
1507     }                                                                      \
1508     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1509 }
1510 
1511 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1512 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1513 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1514 
1515 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1516 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1517 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1518 
1519 #undef DO_MLA_IDX
1520 
1521 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1522 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1523 {                                                                          \
1524     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1525     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1526     intptr_t idx = simd_data(desc);                                        \
1527     TYPE *d = vd, *n = vn, *m = vm;                                        \
1528     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1529         TYPE mm = m[H(i + idx)];                                           \
1530         for (j = 0; j < segment; j++) {                                    \
1531             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1532         }                                                                  \
1533     }                                                                      \
1534     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1535 }
1536 
1537 #define nop(N, M, S) (M)
1538 
1539 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1540 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1541 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1542 
1543 #ifdef TARGET_AARCH64
1544 
1545 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1546 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1547 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1548 
1549 #endif
1550 
1551 #undef nop
1552 
1553 /*
1554  * Non-fused multiply-accumulate operations, for Neon. NB that, unlike
1555  * the fused ops below, these accumulate both from and into Vd.
1556  */
1557 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1558 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1559 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1560 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1561 
1562 #undef DO_FMUL_IDX
1563 
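/*
 * Here simd_data(desc) packs a negate-op1 flag in bit 0 with the index
 * above it; shifting the flag up to the sign bit lets the XOR below
 * negate op1 for the FMLS forms without a branch.
 */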
1564 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1565 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1566                   void *stat, uint32_t desc)                               \
1567 {                                                                          \
1568     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1569     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1570     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1571     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1572     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1573     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1574     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1575         TYPE mm = m[H(i + idx)];                                           \
1576         for (j = 0; j < segment; j++) {                                    \
1577             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1578                                      mm, a[i + j], 0, stat);               \
1579         }                                                                  \
1580     }                                                                      \
1581     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1582 }
1583 
1584 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1585 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1586 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1587 
1588 #undef DO_FMLA_IDX
1589 
1590 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1591 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1592 {                                                                          \
1593     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1594     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1595     bool q = false;                                                        \
1596     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1597         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1598         if (dd < MIN) {                                                    \
1599             dd = MIN;                                                      \
1600             q = true;                                                      \
1601         } else if (dd > MAX) {                                             \
1602             dd = MAX;                                                      \
1603             q = true;                                                      \
1604         }                                                                  \
1605         d[i] = dd;                                                         \
1606     }                                                                      \
1607     if (q) {                                                               \
1608         uint32_t *qc = vq;                                                 \
1609         qc[0] = 1;                                                         \
1610     }                                                                      \
1611     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1612 }
1613 
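/*
 * Worked example (illustrative): gvec_uqadd_b evaluates each sum in a
 * 32-bit int, so 250 + 10 = 260 exceeds UINT8_MAX and is clamped to
 * 255 with QC set, while 200 + 50 = 250 fits and leaves QC untouched.
 */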
1614 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1615 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1616 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1617 
1618 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1619 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1620 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1621 
1622 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1623 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1624 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1625 
1626 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1627 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1628 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1629 
1630 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1631 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1632 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1633 
1634 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1635 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1636 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1637 
1638 #undef DO_SAT
1639 
1640 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1641                           void *vm, uint32_t desc)
1642 {
1643     intptr_t i, oprsz = simd_oprsz(desc);
1644     uint64_t *d = vd, *n = vn, *m = vm;
1645     bool q = false;
1646 
1647     for (i = 0; i < oprsz / 8; i++) {
1648         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1649         if (dd < nn) {
1650             dd = UINT64_MAX;
1651             q = true;
1652         }
1653         d[i] = dd;
1654     }
1655     if (q) {
1656         uint32_t *qc = vq;
1657         qc[0] = 1;
1658     }
1659     clear_tail(d, oprsz, simd_maxsz(desc));
1660 }
1661 
1662 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1663                           void *vm, uint32_t desc)
1664 {
1665     intptr_t i, oprsz = simd_oprsz(desc);
1666     uint64_t *d = vd, *n = vn, *m = vm;
1667     bool q = false;
1668 
1669     for (i = 0; i < oprsz / 8; i++) {
1670         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1671         if (nn < mm) {
1672             dd = 0;
1673             q = true;
1674         }
1675         d[i] = dd;
1676     }
1677     if (q) {
1678         uint32_t *qc = vq;
1679         qc[0] = 1;
1680     }
1681     clear_tail(d, oprsz, simd_maxsz(desc));
1682 }
1683 
1684 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1685                           void *vm, uint32_t desc)
1686 {
1687     intptr_t i, oprsz = simd_oprsz(desc);
1688     int64_t *d = vd, *n = vn, *m = vm;
1689     bool q = false;
1690 
1691     for (i = 0; i < oprsz / 8; i++) {
1692         int64_t nn = n[i], mm = m[i], dd = nn + mm;
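        /*
         * Signed overflow occurred iff the operands have the same
         * sign and the sum's sign differs: (dd ^ nn) has the sign
         * bit set on a sign change, ~(nn ^ mm) when the inputs
         * agree.  (nn >> 63) ^ ~INT64_MIN then saturates toward
         * nn's sign: INT64_MIN for nn < 0, INT64_MAX otherwise.
         */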
1693         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1694             dd = (nn >> 63) ^ ~INT64_MIN;
1695             q = true;
1696         }
1697         d[i] = dd;
1698     }
1699     if (q) {
1700         uint32_t *qc = vq;
1701         qc[0] = 1;
1702     }
1703     clear_tail(d, oprsz, simd_maxsz(desc));
1704 }
1705 
1706 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1707                           void *vm, uint32_t desc)
1708 {
1709     intptr_t i, oprsz = simd_oprsz(desc);
1710     int64_t *d = vd, *n = vn, *m = vm;
1711     bool q = false;
1712 
1713     for (i = 0; i < oprsz / 8; i++) {
1714         int64_t nn = n[i], mm = m[i], dd = nn - mm;
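        /*
         * As for addition above, but subtraction can only overflow
         * when the operand signs differ, hence (nn ^ mm) in place
         * of ~(nn ^ mm).
         */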
1715         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1716             dd = (nn >> 63) ^ ~INT64_MIN;
1717             q = true;
1718         }
1719         d[i] = dd;
1720     }
1721     if (q) {
1722         uint32_t *qc = vq;
1723         qc[0] = 1;
1724     }
1725     clear_tail(d, oprsz, simd_maxsz(desc));
1726 }
1727 
1728 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1729                            void *vm, uint32_t desc)
1730 {
1731     intptr_t i, oprsz = simd_oprsz(desc);
1732     uint64_t *d = vd, *n = vn, *m = vm;
1733     bool q = false;
1734 
1735     for (i = 0; i < oprsz / 8; i++) {
1736         uint64_t nn = n[i];
1737         int64_t mm = m[i];
1738         uint64_t dd = nn + mm;
1739 
1740         if (mm < 0) {
1741             if (nn < (uint64_t)-mm) {
1742                 dd = 0;
1743                 q = true;
1744             }
1745         } else {
1746             if (dd < nn) {
1747                 dd = UINT64_MAX;
1748                 q = true;
1749             }
1750         }
1751         d[i] = dd;
1752     }
1753     if (q) {
1754         uint32_t *qc = vq;
1755         qc[0] = 1;
1756     }
1757     clear_tail(d, oprsz, simd_maxsz(desc));
1758 }
1759 
1760 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1761                            void *vm, uint32_t desc)
1762 {
1763     intptr_t i, oprsz = simd_oprsz(desc);
1764     uint64_t *d = vd, *n = vn, *m = vm;
1765     bool q = false;
1766 
1767     for (i = 0; i < oprsz / 8; i++) {
1768         int64_t nn = n[i];
1769         uint64_t mm = m[i];
1770         int64_t dd = nn + mm;
1771 
1772         if (mm > (uint64_t)(INT64_MAX - nn)) {
1773             dd = INT64_MAX;
1774             q = true;
1775         }
1776         d[i] = dd;
1777     }
1778     if (q) {
1779         uint32_t *qc = vq;
1780         qc[0] = 1;
1781     }
1782     clear_tail(d, oprsz, simd_maxsz(desc));
1783 }
1784 
1785 #define DO_SRA(NAME, TYPE)                              \
1786 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1787 {                                                       \
1788     intptr_t i, oprsz = simd_oprsz(desc);               \
1789     int shift = simd_data(desc);                        \
1790     TYPE *d = vd, *n = vn;                              \
1791     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1792         d[i] += n[i] >> shift;                          \
1793     }                                                   \
1794     clear_tail(d, oprsz, simd_maxsz(desc));             \
1795 }
1796 
1797 DO_SRA(gvec_ssra_b, int8_t)
1798 DO_SRA(gvec_ssra_h, int16_t)
1799 DO_SRA(gvec_ssra_s, int32_t)
1800 DO_SRA(gvec_ssra_d, int64_t)
1801 
1802 DO_SRA(gvec_usra_b, uint8_t)
1803 DO_SRA(gvec_usra_h, uint16_t)
1804 DO_SRA(gvec_usra_s, uint32_t)
1805 DO_SRA(gvec_usra_d, uint64_t)
1806 
1807 #undef DO_SRA
1808 
1809 #define DO_RSHR(NAME, TYPE)                             \
1810 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1811 {                                                       \
1812     intptr_t i, oprsz = simd_oprsz(desc);               \
1813     int shift = simd_data(desc);                        \
1814     TYPE *d = vd, *n = vn;                              \
1815     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1816         TYPE tmp = n[i] >> (shift - 1);                 \
1817         d[i] = (tmp >> 1) + (tmp & 1);                  \
1818     }                                                   \
1819     clear_tail(d, oprsz, simd_maxsz(desc));             \
1820 }
1821 
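/*
 * Worked example (illustrative): the rounding is done without
 * widening by shifting one bit less and adding that bit back in.
 * For n == 7, shift == 2: tmp = 7 >> 1 = 3, and
 * d = (3 >> 1) + (3 & 1) = 2, matching round(7 / 4).
 */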
1822 DO_RSHR(gvec_srshr_b, int8_t)
1823 DO_RSHR(gvec_srshr_h, int16_t)
1824 DO_RSHR(gvec_srshr_s, int32_t)
1825 DO_RSHR(gvec_srshr_d, int64_t)
1826 
1827 DO_RSHR(gvec_urshr_b, uint8_t)
1828 DO_RSHR(gvec_urshr_h, uint16_t)
1829 DO_RSHR(gvec_urshr_s, uint32_t)
1830 DO_RSHR(gvec_urshr_d, uint64_t)
1831 
1832 #undef DO_RSHR
1833 
1834 #define DO_RSRA(NAME, TYPE)                             \
1835 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1836 {                                                       \
1837     intptr_t i, oprsz = simd_oprsz(desc);               \
1838     int shift = simd_data(desc);                        \
1839     TYPE *d = vd, *n = vn;                              \
1840     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1841         TYPE tmp = n[i] >> (shift - 1);                 \
1842         d[i] += (tmp >> 1) + (tmp & 1);                 \
1843     }                                                   \
1844     clear_tail(d, oprsz, simd_maxsz(desc));             \
1845 }
1846 
1847 DO_RSRA(gvec_srsra_b, int8_t)
1848 DO_RSRA(gvec_srsra_h, int16_t)
1849 DO_RSRA(gvec_srsra_s, int32_t)
1850 DO_RSRA(gvec_srsra_d, int64_t)
1851 
1852 DO_RSRA(gvec_ursra_b, uint8_t)
1853 DO_RSRA(gvec_ursra_h, uint16_t)
1854 DO_RSRA(gvec_ursra_s, uint32_t)
1855 DO_RSRA(gvec_ursra_d, uint64_t)
1856 
1857 #undef DO_RSRA
1858 
1859 #define DO_SRI(NAME, TYPE)                              \
1860 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1861 {                                                       \
1862     intptr_t i, oprsz = simd_oprsz(desc);               \
1863     int shift = simd_data(desc);                        \
1864     TYPE *d = vd, *n = vn;                              \
1865     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1866         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1867     }                                                   \
1868     clear_tail(d, oprsz, simd_maxsz(desc));             \
1869 }
1870 
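/*
 * Illustrative expansion: for 8-bit elements and shift == 3 this is
 * d[i] = (d[i] & 0xe0) | (n[i] >> 3), i.e. the top 3 bits of the
 * destination survive and the shifted source is inserted below them.
 */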
1871 DO_SRI(gvec_sri_b, uint8_t)
1872 DO_SRI(gvec_sri_h, uint16_t)
1873 DO_SRI(gvec_sri_s, uint32_t)
1874 DO_SRI(gvec_sri_d, uint64_t)
1875 
1876 #undef DO_SRI
1877 
1878 #define DO_SLI(NAME, TYPE)                              \
1879 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1880 {                                                       \
1881     intptr_t i, oprsz = simd_oprsz(desc);               \
1882     int shift = simd_data(desc);                        \
1883     TYPE *d = vd, *n = vn;                              \
1884     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1885         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1886     }                                                   \
1887     clear_tail(d, oprsz, simd_maxsz(desc));             \
1888 }
1889 
1890 DO_SLI(gvec_sli_b, uint8_t)
1891 DO_SLI(gvec_sli_h, uint16_t)
1892 DO_SLI(gvec_sli_s, uint32_t)
1893 DO_SLI(gvec_sli_d, uint64_t)
1894 
1895 #undef DO_SLI
1896 
1897 /*
1898  * Convert float16 to float32, raising no exceptions and
1899  * preserving exceptional values, including SNaN.
1900  * This is effectively an unpack+repack operation.
1901  */
1902 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1903 {
1904     const int f16_bias = 15;
1905     const int f32_bias = 127;
1906     uint32_t sign = extract32(f16, 15, 1);
1907     uint32_t exp = extract32(f16, 10, 5);
1908     uint32_t frac = extract32(f16, 0, 10);
1909 
1910     if (exp == 0x1f) {
1911         /* Inf or NaN */
1912         exp = 0xff;
1913     } else if (exp == 0) {
1914         /* Zero or denormal.  */
1915         if (frac != 0) {
1916             if (fz16) {
1917                 frac = 0;
1918             } else {
1919                 /*
1920                  * Denormal; these are all normal float32.
1921                  * Shift the fraction so that the msb is at bit 11,
1922                  * then remove bit 11 as the implicit bit of the
1923                  * normalized float32.  Note that we still go through
1924                  * the shift for normal numbers below, to put the
1925                  * float32 fraction at the right place.
1926                  */
1927                 int shift = clz32(frac) - 21;
1928                 frac = (frac << shift) & 0x3ff;
1929                 exp = f32_bias - f16_bias - shift + 1;
1930             }
1931         }
1932     } else {
1933         /* Normal number; adjust the bias.  */
1934         exp += f32_bias - f16_bias;
1935     }
1936     sign <<= 31;
1937     exp <<= 23;
1938     frac <<= 23 - 10;
1939 
1940     return sign | exp | frac;
1941 }
1942 
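/*
 * Worked example (illustrative): the smallest f16 denormal, 0x0001,
 * has frac == 1, so shift = clz32(1) - 21 = 10; frac becomes 0 and
 * exp = 127 - 15 - 10 + 1 = 103, encoding 2^(103 - 127) = 2^-24 as
 * expected.
 */
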
1943 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
1944 {
1945     /*
1946      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
1947      * Load the 2nd qword iff is_q & is_2.
1948      * Shift to the 2nd dword iff !is_q & is_2.
1949      * For !is_q & !is_2, the upper bits of the result are garbage.
1950      */
1951     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1952 }
1953 
1954 /*
1955  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1956  * as there are not yet SVE versions that might use blocking.
1957  */
1958 
1959 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1960                      uint32_t desc, bool fz16)
1961 {
1962     intptr_t i, oprsz = simd_oprsz(desc);
1963     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1964     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1965     int is_q = oprsz == 16;
1966     uint64_t n_4, m_4;
1967 
1968     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1969     n_4 = load4_f16(vn, is_q, is_2);
1970     m_4 = load4_f16(vm, is_q, is_2);
1971 
1972     /* Negate all inputs for FMLSL at once.  */
1973     if (is_s) {
1974         n_4 ^= 0x8000800080008000ull;
1975     }
1976 
1977     for (i = 0; i < oprsz / 4; i++) {
1978         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1979         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1980         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1981     }
1982     clear_tail(d, oprsz, simd_maxsz(desc));
1983 }
1984 
1985 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1986                             void *venv, uint32_t desc)
1987 {
1988     CPUARMState *env = venv;
1989     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1990              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1991 }
1992 
1993 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1994                             void *venv, uint32_t desc)
1995 {
1996     CPUARMState *env = venv;
1997     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1998              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1999 }
2000 
2001 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2002                                void *venv, uint32_t desc)
2003 {
2004     intptr_t i, oprsz = simd_oprsz(desc);
2005     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2006     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2007     CPUARMState *env = venv;
2008     float_status *status = &env->vfp.fp_status;
2009     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2010 
2011     for (i = 0; i < oprsz; i += sizeof(float32)) {
2012         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2013         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2014         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2015         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2016         float32 aa = *(float32 *)(va + H1_4(i));
2017 
2018         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2019     }
2020 }
2021 
2022 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2023                          uint32_t desc, bool fz16)
2024 {
2025     intptr_t i, oprsz = simd_oprsz(desc);
2026     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2027     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2028     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2029     int is_q = oprsz == 16;
2030     uint64_t n_4;
2031     float32 m_1;
2032 
2033     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2034     n_4 = load4_f16(vn, is_q, is_2);
2035 
2036     /* Negate all inputs for FMLSL at once.  */
2037     if (is_s) {
2038         n_4 ^= 0x8000800080008000ull;
2039     }
2040 
2041     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2042 
2043     for (i = 0; i < oprsz / 4; i++) {
2044         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2045         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2046     }
2047     clear_tail(d, oprsz, simd_maxsz(desc));
2048 }
2049 
2050 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2051                                 void *venv, uint32_t desc)
2052 {
2053     CPUARMState *env = venv;
2054     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2055                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2056 }
2057 
2058 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2059                                 void *venv, uint32_t desc)
2060 {
2061     CPUARMState *env = venv;
2062     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
2063                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2064 }
2065 
2066 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2067                                void *venv, uint32_t desc)
2068 {
2069     intptr_t i, j, oprsz = simd_oprsz(desc);
2070     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2071     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2072     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2073     CPUARMState *env = venv;
2074     float_status *status = &env->vfp.fp_status;
2075     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2076 
2077     for (i = 0; i < oprsz; i += 16) {
2078         float16 mm_16 = *(float16 *)(vm + i + idx);
2079         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2080 
2081         for (j = 0; j < 16; j += sizeof(float32)) {
2082             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2083             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2084             float32 aa = *(float32 *)(va + H1_4(i + j));
2085 
2086             *(float32 *)(vd + H1_4(i + j)) =
2087                 float32_muladd(nn, mm, aa, 0, status);
2088         }
2089     }
2090 }
2091 
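/*
 * Vector shifts by a signed, per-element count (as for Neon VSHL):
 * positive counts shift left, negative counts shift right, and
 * out-of-range counts yield 0, except that arithmetic right shifts
 * saturate the count and so replicate the sign bit.
 */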
2092 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2093 {
2094     intptr_t i, opr_sz = simd_oprsz(desc);
2095     int8_t *d = vd, *n = vn, *m = vm;
2096 
2097     for (i = 0; i < opr_sz; ++i) {
2098         int8_t mm = m[i];
2099         int8_t nn = n[i];
2100         int8_t res = 0;
2101         if (mm >= 0) {
2102             if (mm < 8) {
2103                 res = nn << mm;
2104             }
2105         } else {
2106             res = nn >> (mm > -8 ? -mm : 7);
2107         }
2108         d[i] = res;
2109     }
2110     clear_tail(d, opr_sz, simd_maxsz(desc));
2111 }
2112 
2113 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2114 {
2115     intptr_t i, opr_sz = simd_oprsz(desc);
2116     int16_t *d = vd, *n = vn, *m = vm;
2117 
2118     for (i = 0; i < opr_sz / 2; ++i) {
2119         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2120         int16_t nn = n[i];
2121         int16_t res = 0;
2122         if (mm >= 0) {
2123             if (mm < 16) {
2124                 res = nn << mm;
2125             }
2126         } else {
2127             res = nn >> (mm > -16 ? -mm : 15);
2128         }
2129         d[i] = res;
2130     }
2131     clear_tail(d, opr_sz, simd_maxsz(desc));
2132 }
2133 
2134 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2135 {
2136     intptr_t i, opr_sz = simd_oprsz(desc);
2137     uint8_t *d = vd, *n = vn, *m = vm;
2138 
2139     for (i = 0; i < opr_sz; ++i) {
2140         int8_t mm = m[i];
2141         uint8_t nn = n[i];
2142         uint8_t res = 0;
2143         if (mm >= 0) {
2144             if (mm < 8) {
2145                 res = nn << mm;
2146             }
2147         } else {
2148             if (mm > -8) {
2149                 res = nn >> -mm;
2150             }
2151         }
2152         d[i] = res;
2153     }
2154     clear_tail(d, opr_sz, simd_maxsz(desc));
2155 }
2156 
2157 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2158 {
2159     intptr_t i, opr_sz = simd_oprsz(desc);
2160     uint16_t *d = vd, *n = vn, *m = vm;
2161 
2162     for (i = 0; i < opr_sz / 2; ++i) {
2163         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2164         uint16_t nn = n[i];
2165         uint16_t res = 0;
2166         if (mm >= 0) {
2167             if (mm < 16) {
2168                 res = nn << mm;
2169             }
2170         } else {
2171             if (mm > -16) {
2172                 res = nn >> -mm;
2173             }
2174         }
2175         d[i] = res;
2176     }
2177     clear_tail(d, opr_sz, simd_maxsz(desc));
2178 }
2179 
2180 /*
2181  * 8x8->8 polynomial multiply.
2182  *
2183  * Polynomial multiplication is like integer multiplication except the
2184  * partial products are XORed, not added.
2185  *
2186  * TODO: expose this as a generic vector operation, as it is a common
2187  * crypto building block.
2188  */
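/*
 * For example, carry-less 0b11 * 0b11 = 0b101 rather than 9: the two
 * middle partial products cancel under XOR.
 */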
2189 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2190 {
2191     intptr_t i, opr_sz = simd_oprsz(desc);
2192     uint64_t *d = vd, *n = vn, *m = vm;
2193 
2194     for (i = 0; i < opr_sz / 8; ++i) {
2195         d[i] = clmul_8x8_low(n[i], m[i]);
2196     }
2197     clear_tail(d, opr_sz, simd_maxsz(desc));
2198 }
2199 
2200 /*
2201  * 64x64->128 polynomial multiply.
2202  * Because the lanes are not accessed in strict columns,
2203  * this probably cannot be turned into a generic helper.
2204  */
2205 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2206 {
2207     intptr_t i, opr_sz = simd_oprsz(desc);
2208     intptr_t hi = simd_data(desc);
2209     uint64_t *d = vd, *n = vn, *m = vm;
2210 
2211     for (i = 0; i < opr_sz / 8; i += 2) {
2212         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2213         d[i] = int128_getlo(r);
2214         d[i + 1] = int128_gethi(r);
2215     }
2216     clear_tail(d, opr_sz, simd_maxsz(desc));
2217 }
2218 
2219 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2220 {
2221     int hi = simd_data(desc);
2222     uint64_t *d = vd, *n = vn, *m = vm;
2223     uint64_t nn = n[hi], mm = m[hi];
2224 
2225     d[0] = clmul_8x4_packed(nn, mm);
2226     nn >>= 32;
2227     mm >>= 32;
2228     d[1] = clmul_8x4_packed(nn, mm);
2229 
2230     clear_tail(d, 16, simd_maxsz(desc));
2231 }
2232 
2233 #ifdef TARGET_AARCH64
2234 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2235 {
2236     int shift = simd_data(desc) * 8;
2237     intptr_t i, opr_sz = simd_oprsz(desc);
2238     uint64_t *d = vd, *n = vn, *m = vm;
2239 
2240     for (i = 0; i < opr_sz / 8; ++i) {
2241         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2242     }
2243 }
2244 
2245 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2246 {
2247     intptr_t sel = H4(simd_data(desc));
2248     intptr_t i, opr_sz = simd_oprsz(desc);
2249     uint32_t *n = vn, *m = vm;
2250     uint64_t *d = vd;
2251 
2252     for (i = 0; i < opr_sz / 8; ++i) {
2253         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2254     }
2255 }
2256 #endif
2257 
2258 #define DO_CMP0(NAME, TYPE, OP)                         \
2259 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2260 {                                                       \
2261     intptr_t i, opr_sz = simd_oprsz(desc);              \
2262     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2263         TYPE nn = *(TYPE *)(vn + i);                    \
2264         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2265     }                                                   \
2266     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2267 }
2268 
2269 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2270 DO_CMP0(gvec_clt0_b, int8_t, <)
2271 DO_CMP0(gvec_cle0_b, int8_t, <=)
2272 DO_CMP0(gvec_cgt0_b, int8_t, >)
2273 DO_CMP0(gvec_cge0_b, int8_t, >=)
2274 
2275 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2276 DO_CMP0(gvec_clt0_h, int16_t, <)
2277 DO_CMP0(gvec_cle0_h, int16_t, <=)
2278 DO_CMP0(gvec_cgt0_h, int16_t, >)
2279 DO_CMP0(gvec_cge0_h, int16_t, >=)
2280 
2281 #undef DO_CMP0
2282 
2283 #define DO_ABD(NAME, TYPE)                                      \
2284 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2285 {                                                               \
2286     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2287     TYPE *d = vd, *n = vn, *m = vm;                             \
2288                                                                 \
2289     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2290         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2291     }                                                           \
2292     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2293 }
2294 
2295 DO_ABD(gvec_sabd_b, int8_t)
2296 DO_ABD(gvec_sabd_h, int16_t)
2297 DO_ABD(gvec_sabd_s, int32_t)
2298 DO_ABD(gvec_sabd_d, int64_t)
2299 
2300 DO_ABD(gvec_uabd_b, uint8_t)
2301 DO_ABD(gvec_uabd_h, uint16_t)
2302 DO_ABD(gvec_uabd_s, uint32_t)
2303 DO_ABD(gvec_uabd_d, uint64_t)
2304 
2305 #undef DO_ABD
2306 
2307 #define DO_ABA(NAME, TYPE)                                      \
2308 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2309 {                                                               \
2310     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2311     TYPE *d = vd, *n = vn, *m = vm;                             \
2312                                                                 \
2313     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2314         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2315     }                                                           \
2316     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2317 }
2318 
2319 DO_ABA(gvec_saba_b, int8_t)
2320 DO_ABA(gvec_saba_h, int16_t)
2321 DO_ABA(gvec_saba_s, int32_t)
2322 DO_ABA(gvec_saba_d, int64_t)
2323 
2324 DO_ABA(gvec_uaba_b, uint8_t)
2325 DO_ABA(gvec_uaba_h, uint16_t)
2326 DO_ABA(gvec_uaba_s, uint32_t)
2327 DO_ABA(gvec_uaba_d, uint64_t)
2328 
2329 #undef DO_ABA
2330 
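/*
 * Pairwise operations: the low half of the result is pairs drawn from
 * Vn and the high half pairs from Vm; e.g. with four float32 lanes,
 * FADDP gives d = { n0+n1, n2+n3, m0+m1, m2+m3 }.  Only Vm needs the
 * scratch copy below: overlap with Vn is safe because iteration i
 * reads n[2i] and n[2i+1], indices never written by an earlier
 * iteration.
 */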
2331 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2332 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
2333 {                                                                          \
2334     ARMVectorReg scratch;                                                  \
2335     intptr_t oprsz = simd_oprsz(desc);                                     \
2336     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2337     TYPE *d = vd, *n = vn, *m = vm;                                        \
2338     if (unlikely(d == m)) {                                                \
2339         m = memcpy(&scratch, m, oprsz);                                    \
2340     }                                                                      \
2341     for (intptr_t i = 0; i < half; ++i) {                                  \
2342         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2343     }                                                                      \
2344     for (intptr_t i = 0; i < half; ++i) {                                  \
2345         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2346     }                                                                      \
2347     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2348 }
2349 
2350 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2351 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2352 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2353 
2354 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2355 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2356 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2357 
2358 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2359 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2360 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2361 
2362 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2363 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2364 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2365 
2366 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2367 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2368 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2369 
2370 #undef DO_3OP_PAIR
2371 
2372 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2373 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2374 {                                                               \
2375     ARMVectorReg scratch;                                       \
2376     intptr_t oprsz = simd_oprsz(desc);                          \
2377     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2378     TYPE *d = vd, *n = vn, *m = vm;                             \
2379     if (unlikely(d == m)) {                                     \
2380         m = memcpy(&scratch, m, oprsz);                         \
2381     }                                                           \
2382     for (intptr_t i = 0; i < half; ++i) {                       \
2383         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2384     }                                                           \
2385     for (intptr_t i = 0; i < half; ++i) {                       \
2386         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2387     }                                                           \
2388     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2389 }
2390 
2391 #define ADD(A, B) (A + B)
2392 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2393 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2394 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2395 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2396 #undef  ADD
2397 
2398 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2399 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2400 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2401 
2402 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2403 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2404 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2405 
2406 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2407 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2408 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2409 
2410 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2411 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2412 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2413 
2414 #undef DO_3OP_PAIR
2415 
2416 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2417     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2418     {                                                                   \
2419         intptr_t i, oprsz = simd_oprsz(desc);                           \
2420         int shift = simd_data(desc);                                    \
2421         TYPE *d = vd, *n = vn;                                          \
2422         float_status *fpst = stat;                                      \
2423         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2424             d[i] = FUNC(n[i], shift, fpst);                             \
2425         }                                                               \
2426         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2427     }
2428 
2429 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2430 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2431 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2432 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2433 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2434 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2435 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2436 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2437 
2438 #undef DO_VCVT_FIXED
2439 
2440 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2441     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2442     {                                                                   \
2443         float_status *fpst = stat;                                      \
2444         intptr_t i, oprsz = simd_oprsz(desc);                           \
2445         uint32_t rmode = simd_data(desc);                               \
2446         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2447         TYPE *d = vd, *n = vn;                                          \
2448         set_float_rounding_mode(rmode, fpst);                           \
2449         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2450             d[i] = FUNC(n[i], 0, fpst);                                 \
2451         }                                                               \
2452         set_float_rounding_mode(prev_rmode, fpst);                      \
2453         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2454     }
2455 
2456 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2457 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2458 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2459 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2460 
2461 #undef DO_VCVT_RMODE
2462 
2463 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2464     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2465     {                                                                   \
2466         float_status *fpst = stat;                                      \
2467         intptr_t i, oprsz = simd_oprsz(desc);                           \
2468         uint32_t rmode = simd_data(desc);                               \
2469         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2470         TYPE *d = vd, *n = vn;                                          \
2471         set_float_rounding_mode(rmode, fpst);                           \
2472         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2473             d[i] = FUNC(n[i], fpst);                                    \
2474         }                                                               \
2475         set_float_rounding_mode(prev_rmode, fpst);                      \
2476         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2477     }
2478 
2479 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2480 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2481 
2482 #undef DO_VRINT_RMODE
2483 
2484 #ifdef TARGET_AARCH64
2485 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2486 {
2487     const uint8_t *indices = vm;
2488     CPUARMState *env = venv;
2489     size_t oprsz = simd_oprsz(desc);
2490     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2491     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2492     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2493     union {
2494         uint8_t b[16];
2495         uint64_t d[2];
2496     } result;
2497 
2498     /*
2499      * We must construct the final result in a temp, lest the output
2500      * overlap the input table.  For TBL, begin with zero; for TBX,
2501      * begin with the original register contents.  Note that we always
2502      * copy 16 bytes here to avoid an extra branch; clearing the high
2503      * bits of the register for oprsz == 8 is handled below.
2504      */
2505     if (is_tbx) {
2506         memcpy(&result, vd, 16);
2507     } else {
2508         memset(&result, 0, 16);
2509     }
2510 
2511     for (size_t i = 0; i < oprsz; ++i) {
2512         uint32_t index = indices[H1(i)];
2513 
2514         if (index < table_len) {
2515             /*
2516              * Convert index (a byte offset into the virtual table
2517              * which is a series of 128-bit vectors concatenated)
2518              * into the correct register element, bearing in mind
2519              * that the table can wrap around from V31 to V0.
2520              */
2521             const uint8_t *table = (const uint8_t *)
2522                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2523             result.b[H1(i)] = table[H1(index % 16)];
2524         }
2525     }
2526 
2527     memcpy(vd, &result, 16);
2528     clear_tail(vd, oprsz, simd_maxsz(desc));
2529 }
2530 #endif
2531 
2532 /*
2533  * NxN -> N highpart multiply
2534  *
2535  * TODO: expose this as a generic vector operation.
2536  */
2537 
2538 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2539 {
2540     intptr_t i, opr_sz = simd_oprsz(desc);
2541     int8_t *d = vd, *n = vn, *m = vm;
2542 
2543     for (i = 0; i < opr_sz; ++i) {
2544         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2545     }
2546     clear_tail(d, opr_sz, simd_maxsz(desc));
2547 }
2548 
2549 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2550 {
2551     intptr_t i, opr_sz = simd_oprsz(desc);
2552     int16_t *d = vd, *n = vn, *m = vm;
2553 
2554     for (i = 0; i < opr_sz / 2; ++i) {
2555         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2556     }
2557     clear_tail(d, opr_sz, simd_maxsz(desc));
2558 }
2559 
2560 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2561 {
2562     intptr_t i, opr_sz = simd_oprsz(desc);
2563     int32_t *d = vd, *n = vn, *m = vm;
2564 
2565     for (i = 0; i < opr_sz / 4; ++i) {
2566         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2567     }
2568     clear_tail(d, opr_sz, simd_maxsz(desc));
2569 }
2570 
2571 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2572 {
2573     intptr_t i, opr_sz = simd_oprsz(desc);
2574     uint64_t *d = vd, *n = vn, *m = vm;
2575     uint64_t discard;
2576 
2577     for (i = 0; i < opr_sz / 8; ++i) {
2578         muls64(&discard, &d[i], n[i], m[i]);
2579     }
2580     clear_tail(d, opr_sz, simd_maxsz(desc));
2581 }
2582 
2583 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2584 {
2585     intptr_t i, opr_sz = simd_oprsz(desc);
2586     uint8_t *d = vd, *n = vn, *m = vm;
2587 
2588     for (i = 0; i < opr_sz; ++i) {
2589         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2590     }
2591     clear_tail(d, opr_sz, simd_maxsz(desc));
2592 }
2593 
2594 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2595 {
2596     intptr_t i, opr_sz = simd_oprsz(desc);
2597     uint16_t *d = vd, *n = vn, *m = vm;
2598 
2599     for (i = 0; i < opr_sz / 2; ++i) {
2600         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2601     }
2602     clear_tail(d, opr_sz, simd_maxsz(desc));
2603 }
2604 
2605 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2606 {
2607     intptr_t i, opr_sz = simd_oprsz(desc);
2608     uint32_t *d = vd, *n = vn, *m = vm;
2609 
2610     for (i = 0; i < opr_sz / 4; ++i) {
2611         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2612     }
2613     clear_tail(d, opr_sz, simd_maxsz(desc));
2614 }
2615 
2616 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2617 {
2618     intptr_t i, opr_sz = simd_oprsz(desc);
2619     uint64_t *d = vd, *n = vn, *m = vm;
2620     uint64_t discard;
2621 
2622     for (i = 0; i < opr_sz / 8; ++i) {
2623         mulu64(&discard, &d[i], n[i], m[i]);
2624     }
2625     clear_tail(d, opr_sz, simd_maxsz(desc));
2626 }
2627 
2628 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2629 {
2630     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2631     int shr = simd_data(desc);
2632     uint64_t *d = vd, *n = vn, *m = vm;
2633 
2634     for (i = 0; i < opr_sz; ++i) {
2635         d[i] = ror64(n[i] ^ m[i], shr);
2636     }
2637     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2638 }
2639 
2640 /*
2641  * Integer matrix-multiply accumulate
2642  */
2643 
2644 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2645 {
2646     int8_t *n = vn, *m = vm;
2647 
2648     for (intptr_t k = 0; k < 8; ++k) {
2649         sum += n[H1(k)] * m[H1(k)];
2650     }
2651     return sum;
2652 }
2653 
2654 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2655 {
2656     uint8_t *n = vn, *m = vm;
2657 
2658     for (intptr_t k = 0; k < 8; ++k) {
2659         sum += n[H1(k)] * m[H1(k)];
2660     }
2661     return sum;
2662 }
2663 
2664 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2665 {
2666     uint8_t *n = vn;
2667     int8_t *m = vm;
2668 
2669     for (intptr_t k = 0; k < 8; ++k) {
2670         sum += n[H1(k)] * m[H1(k)];
2671     }
2672     return sum;
2673 }
2674 
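/*
 * Each 16-byte segment of Vn and Vm holds a 2x8 matrix of bytes, two
 * rows of eight.  The corresponding result segment is the 2x2 int32
 * tile a + n * m^T, computed as four eight-element dot products.
 */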
2675 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2676                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2677 {
2678     intptr_t seg, opr_sz = simd_oprsz(desc);
2679 
2680     for (seg = 0; seg < opr_sz; seg += 16) {
2681         uint32_t *d = vd + seg;
2682         uint32_t *a = va + seg;
2683         uint32_t sum0, sum1, sum2, sum3;
2684 
2685         /*
2686          * Process the entire segment at once, writing back the
2687          * results only after we've consumed all of the inputs.
2688          *
2689          * Key to indices by column:
2690          *          i   j                  i             j
2691          */
2692         sum0 = a[H4(0 + 0)];
2693         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2694         sum1 = a[H4(0 + 1)];
2695         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2696         sum2 = a[H4(2 + 0)];
2697         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2698         sum3 = a[H4(2 + 1)];
2699         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2700 
2701         d[H4(0)] = sum0;
2702         d[H4(1)] = sum1;
2703         d[H4(2)] = sum2;
2704         d[H4(3)] = sum3;
2705     }
2706     clear_tail(vd, opr_sz, simd_maxsz(desc));
2707 }
2708 
2709 #define DO_MMLA_B(NAME, INNER) \
2710     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2711     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2712 
2713 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2714 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2715 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2716 
2717 /*
2718  * BFloat16 Dot Product
2719  */
2720 
2721 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2722 {
2723     /* FPCR is ignored for BFDOT and BFMMLA. */
2724     float_status bf_status = {
2725         .tininess_before_rounding = float_tininess_before_rounding,
2726         .float_rounding_mode = float_round_to_odd_inf,
2727         .flush_to_zero = true,
2728         .flush_inputs_to_zero = true,
2729         .default_nan_mode = true,
2730     };
2731     float32 t1, t2;
2732 
2733     /*
2734      * Extract each BFloat16 from the element pair, and shift
2735      * them such that they become float32.
2736      */
2737     t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2738     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2739     t1 = float32_add(t1, t2, &bf_status);
2740     t1 = float32_add(sum, t1, &bf_status);
2741 
2742     return t1;
2743 }
2744 
2745 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2746 {
2747     intptr_t i, opr_sz = simd_oprsz(desc);
2748     float32 *d = vd, *a = va;
2749     uint32_t *n = vn, *m = vm;
2750 
2751     for (i = 0; i < opr_sz / 4; ++i) {
2752         d[i] = bfdotadd(a[i], n[i], m[i]);
2753     }
2754     clear_tail(d, opr_sz, simd_maxsz(desc));
2755 }
2756 
2757 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2758                             void *va, uint32_t desc)
2759 {
2760     intptr_t i, j, opr_sz = simd_oprsz(desc);
2761     intptr_t index = simd_data(desc);
2762     intptr_t elements = opr_sz / 4;
2763     intptr_t eltspersegment = MIN(16 / 4, elements);
2764     float32 *d = vd, *a = va;
2765     uint32_t *n = vn, *m = vm;
2766 
2767     for (i = 0; i < elements; i += eltspersegment) {
2768         uint32_t m_idx = m[i + H4(index)];
2769 
2770         for (j = i; j < i + eltspersegment; j++) {
2771             d[j] = bfdotadd(a[j], n[j], m_idx);
2772         }
2773     }
2774     clear_tail(d, opr_sz, simd_maxsz(desc));
2775 }
2776 
2777 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2778 {
2779     intptr_t s, opr_sz = simd_oprsz(desc);
2780     float32 *d = vd, *a = va;
2781     uint32_t *n = vn, *m = vm;
2782 
2783     for (s = 0; s < opr_sz / 4; s += 4) {
2784         float32 sum00, sum01, sum10, sum11;
2785 
2786         /*
2787          * Process the entire segment at once, writing back the
2788          * results only after we've consumed all of the inputs.
2789          *
2790          * Key to indices by column:
2791          *               i   j           i   k             j   k
2792          */
2793         sum00 = a[s + H4(0 + 0)];
2794         sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2795         sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2796 
2797         sum01 = a[s + H4(0 + 1)];
2798         sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2799         sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2800 
2801         sum10 = a[s + H4(2 + 0)];
2802         sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2803         sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2804 
2805         sum11 = a[s + H4(2 + 1)];
2806         sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2807         sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2808 
2809         d[s + H4(0 + 0)] = sum00;
2810         d[s + H4(0 + 1)] = sum01;
2811         d[s + H4(2 + 0)] = sum10;
2812         d[s + H4(2 + 1)] = sum11;
2813     }
2814     clear_tail(d, opr_sz, simd_maxsz(desc));
2815 }
2816 
2817 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2818                          void *stat, uint32_t desc)
2819 {
2820     intptr_t i, opr_sz = simd_oprsz(desc);
2821     intptr_t sel = simd_data(desc);
2822     float32 *d = vd, *a = va;
2823     bfloat16 *n = vn, *m = vm;
2824 
2825     for (i = 0; i < opr_sz / 4; ++i) {
2826         float32 nn = n[H2(i * 2 + sel)] << 16;
2827         float32 mm = m[H2(i * 2 + sel)] << 16;
2828         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2829     }
2830     clear_tail(d, opr_sz, simd_maxsz(desc));
2831 }
2832 
2833 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2834                              void *va, void *stat, uint32_t desc)
2835 {
2836     intptr_t i, j, opr_sz = simd_oprsz(desc);
2837     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2838     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2839     intptr_t elements = opr_sz / 4;
2840     intptr_t eltspersegment = MIN(16 / 4, elements);
2841     float32 *d = vd, *a = va;
2842     bfloat16 *n = vn, *m = vm;
2843 
2844     for (i = 0; i < elements; i += eltspersegment) {
2845         float32 m_idx = m[H2(2 * i + index)] << 16;
2846 
2847         for (j = i; j < i + eltspersegment; j++) {
2848             float32 n_j = n[H2(2 * j + sel)] << 16;
2849             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2850         }
2851     }
2852     clear_tail(d, opr_sz, simd_maxsz(desc));
2853 }
2854 
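/*
 * Element-wise clamp of Va to the range [Vn, Vm], e.g. for signed
 * elements clamp(aa = 7, nn = -3, mm = 5) = 5 and
 * clamp(-9, -3, 5) = -3.  As MIN is applied last, mm wins if nn > mm.
 */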
2855 #define DO_CLAMP(NAME, TYPE) \
2856 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
2857 {                                                                       \
2858     intptr_t i, opr_sz = simd_oprsz(desc);                              \
2859     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
2860         TYPE aa = *(TYPE *)(a + i);                                     \
2861         TYPE nn = *(TYPE *)(n + i);                                     \
2862         TYPE mm = *(TYPE *)(m + i);                                     \
2863         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
2864         *(TYPE *)(d + i) = dd;                                          \
2865     }                                                                   \
2866     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
2867 }
2868 
2869 DO_CLAMP(gvec_sclamp_b, int8_t)
2870 DO_CLAMP(gvec_sclamp_h, int16_t)
2871 DO_CLAMP(gvec_sclamp_s, int32_t)
2872 DO_CLAMP(gvec_sclamp_d, int64_t)
2873 
2874 DO_CLAMP(gvec_uclamp_b, uint8_t)
2875 DO_CLAMP(gvec_uclamp_h, uint16_t)
2876 DO_CLAMP(gvec_uclamp_s, uint32_t)
2877 DO_CLAMP(gvec_uclamp_d, uint64_t)
2878